# Libraries import

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

NLTK data that must be downloaded only once

In [9]:
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')

# Data and model loading

In [2]:
# Forward slashes keep this relative path valid on every OS and avoid the
# invalid '\p' escape sequence that the backslash spelling contained.
df = pd.read_csv('./preprocessed_df.csv')

In [3]:
# Forward slashes keep this relative path valid on every OS and avoid the
# invalid '\M' escape sequence that the backslash spelling contained.
sentiment_model = keras.models.load_model('./Model_full_extended/Model_full')

# Lemmatization and tokenization of comments using NLTK library

In [4]:
def preprocessing_for_sent(df):
    '''
    Prepare the comments of a cleaned dataframe for the sentiment model.

    Every value of df['comment'] is split on whitespace, each word is passed
    through NLTK's WordNetLemmatizer, the lemmatized text is tokenized with
    word_tokenize and the tokens are joined back with single spaces.

    Parameters:
    df - dataframe with a 'comment' column holding strings

    Returns:
    pd.Series with one prepared comment per row of df. The Series carries
    df's own index, so assigning it back to df as a column keeps every
    comment on its row even when df's index is not the default 0..n-1 range.

    Raises:
    AttributeError - if a comment is not a string (e.g. a missing value),
    because it has no split() method.
    '''
    wn_lemmatizer = WordNetLemmatizer()

    # NOTE(review): WordNet is an English lexical resource, and word_tokenize is
    # called with its default language; confirm this matches the preprocessing
    # the model was trained with if the comments are not in English.
    prepared_comments = []
    for comment in df['comment']:
        lemmatized = ' '.join(wn_lemmatizer.lemmatize(word) for word in comment.split())
        prepared_comments.append(' '.join(word_tokenize(lemmatized)))

    # Reuse df's index: column assignment in pandas aligns on index labels,
    # so a fresh 0..n-1 index would misplace rows of a filtered/re-indexed df.
    return pd.Series(prepared_comments, index=df.index)

# Updating dataframe with predicted sentiment

In [5]:
def sentiment_calculation(df, series, model, *, positive_rating=4,
                          pos_threshold=0.6, neg_threshold=0.4):
    '''
    The function returns original df with following columns added:
    Comment_for_sen - preprocessed comment (taken from series)
    Original_sent - original sentiment calculated as Positive in case of prod_eval >= positive_rating and Negative in other cases
    Pos_prob - probability of positive sentiment according to model
    Predicted_sent - predicted sentiment as Positive in case of Pos_prob >= pos_threshold, Negative in case of Pos_prob <= neg_threshold and Neutral in other cases

    The dataframe is modified in place; the returned object is df itself.

    Parameters:
    df - dataframe with a 'prod_eval' column
    series - preprocessed comments, assigned to df as the Comment_for_sen column
    model - object whose predict method receives the Comment_for_sen column and returns one value per row
    positive_rating - keyword-only, lowest prod_eval counted as Positive (default 4)
    pos_threshold - keyword-only, lowest Pos_prob labelled Positive (default 0.6)
    neg_threshold - keyword-only, highest Pos_prob labelled Negative (default 0.4)
    '''
    df['Comment_for_sen'] = series
    df['Original_sent'] = np.where(df['prod_eval'] >= positive_rating, 'Positive', 'Negative')
    df['Pos_prob'] = model.predict(df['Comment_for_sen'])

    # The Positive check is evaluated first, so it wins if the thresholds overlap.
    is_positive = df['Pos_prob'] >= pos_threshold
    is_negative = df['Pos_prob'] <= neg_threshold
    df['Predicted_sent'] = np.where(
        is_positive, 'Positive', np.where(is_negative, 'Negative', 'Neutral')
    )
    return df
    

# Functions application

In [6]:
# Preprocess the comments and attach original/predicted sentiment columns to df.
final_df = sentiment_calculation(
    df=df,
    series=preprocessing_for_sent(df),
    model=sentiment_model,
)

In [7]:
# Show each comment next to its original and predicted sentiment.
final_df.loc[:, ['comment', 'Original_sent', 'Predicted_sent', 'Pos_prob']]

Unnamed: 0,comment,Original_sent,Predicted_sent,Pos_prob
0,"Замечательный планитарный миксер, не шумный,мо...",Positive,Positive,0.999524
1,"Спасибо огромное, все пришло в отличном состоя...",Positive,Positive,0.909823
2,Товар получила 11.05. Попробовала вчера сделат...,Positive,Positive,0.986011
3,"Мощность чувствуется, в работе пока не пробова...",Positive,Positive,0.997701
4,Просто бомба! 💣Я мечтала о нем! Купила за 6029...,Positive,Positive,0.999577
...,...,...,...,...
698,Это просто супер. Немогу нарадоваться ему. Раб...,Positive,Positive,0.988223
699,"18.07.2019 прибыл данный агрегат в наш дом, же...",Positive,Positive,0.999081
700,Миксер просто бомба! Пришёл в двойной упаковке...,Positive,Positive,0.999532
701,"Вчера получила миксер,муж подарил на 30 летие ...",Positive,Positive,0.992505


# Saving the dataframe with predicted sentiments

In [8]:
# Write the enriched dataframe to disk; index=True (the pandas default) also
# stores the row index as the first column of the file.
final_df.to_csv(path_or_buf='./df_with_sentiments.csv', index=True)