# Library imports

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

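NLTK data (`wordnet`, `punkt`) has to be downloaded only once per environment. To do so, uncomment and run the cell below; it also needs `import nltk`, which is not among the imports above.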

In [9]:
# nltk.download('wordnet')
# nltk.download('punkt')

# Data and model loading

In [2]:
# Load the preprocessed comments. Forward slashes keep the relative path
# portable across operating systems and avoid the invalid "\p" escape sequence.
df = pd.read_csv('./preprocessed_df.csv')

In [3]:
# Load the saved Keras sentiment model. Forward slashes keep the relative path
# portable across operating systems and avoid the invalid "\M" escape sequence.
sentiment_model = keras.models.load_model('./Model_full_extended/Model_full')

# Lemmatization and tokenization of comments using the NLTK library

In [4]:
def preprocessing_for_sent(df):
    '''
    Prepare the cleaned comments of a dataframe for the sentiment model.

    Each value of the 'comment' column is split on whitespace, every piece is
    lemmatized with NLTK's WordNetLemmatizer, the lemmatized text is tokenized
    with NLTK's word_tokenize, and the tokens are joined back with single spaces.

    Parameters:
        df: dataframe with a 'comment' column holding strings.

    Returns:
        pd.Series of prepared comments carrying the same index as df, so that
        assigning it back to df as a column lines up row by row even when df
        does not have a default 0..n-1 index.
    '''
    wn_lemmatizer = WordNetLemmatizer()

    def _prepare(comment):
        # Lemmatize first, then let word_tokenize split the lemmatized text
        # into tokens (it separates punctuation from words).
        lemmatized = ' '.join(wn_lemmatizer.lemmatize(word) for word in comment.split())
        return ' '.join(word_tokenize(lemmatized))

    prepared_comments = [_prepare(comment) for comment in df['comment']]

    # Reuse df's index: pandas aligns on index labels when a Series is assigned
    # as a column, so a fresh 0..n-1 index could attach comments to wrong rows.
    return pd.Series(prepared_comments, index=df.index)

# Updating dataframe with predicted sentiment

In [5]:
def sentiment_calculation(df, series, model):
    '''
    Add sentiment columns to df in place and return the same df.

    Columns added:
    Comment_for_sen - preprocessed comment (taken from series)
    Original_sent - 'Positive' when prod_eval >= 4, otherwise 'Negative'
    Pos_prob - output of model.predict for the preprocessed comments
    Predicted_sent - 'Positive' when Pos_prob >= 0.6, 'Negative' when
                     Pos_prob <= 0.4 and 'Neutral' in all other cases
    '''
    df['Comment_for_sen'] = series

    # Label derived from the rating stored in the data.
    rated_four_or_more = df['prod_eval'] >= 4
    df['Original_sent'] = np.where(rated_four_or_more, 'Positive', 'Negative')

    # Score the preprocessed comments with the model.
    df['Pos_prob'] = model.predict(df['Comment_for_sen'])

    # Scores strictly between 0.4 and 0.6 are reported as 'Neutral'.
    negative_or_neutral = np.where(df['Pos_prob'] <= 0.4, 'Negative', 'Neutral')
    df['Predicted_sent'] = np.where(df['Pos_prob'] >= 0.6, 'Positive', negative_or_neutral)

    return df
    

# Applying the functions

In [6]:
# sentiment_calculation adds its columns to df itself and returns it,
# so final_df refers to the same dataframe object as df.
final_df = sentiment_calculation(
    df=df,
    series=preprocessing_for_sent(df),
    model=sentiment_model,
)

In [7]:
# Preview each comment next to its rating-based label, the predicted label
# and the model score.
final_df[[
    'comment',
    'Original_sent',
    'Predicted_sent',
    'Pos_prob',
]]

Unnamed: 0,comment,Original_sent,Predicted_sent,Pos_prob
0,"Замечательный планитарный миксер, не шумный,мо...",Positive,Positive,0.999524
1,"Спасибо огромное, все пришло в отличном состоя...",Positive,Positive,0.909823
2,Товар получила 11.05. Попробовала вчера сделат...,Positive,Positive,0.986011
3,"Мощность чувствуется, в работе пока не пробова...",Positive,Positive,0.997701
4,Просто бомба! 💣Я мечтала о нем! Купила за 6029...,Positive,Positive,0.999577
...,...,...,...,...
698,Это просто супер. Немогу нарадоваться ему. Раб...,Positive,Positive,0.988223
699,"18.07.2019 прибыл данный агрегат в наш дом, же...",Positive,Positive,0.999081
700,Миксер просто бомба! Пришёл в двойной упаковке...,Positive,Positive,0.999532
701,"Вчера получила миксер,муж подарил на 30 летие ...",Positive,Positive,0.992505


# Saving the dataframe with predicted sentiments

In [8]:
# Write the dataframe, including the added sentiment columns, to a CSV file
# in the working directory.
final_df.to_csv(path_or_buf='./df_with_sentiments.csv')