In [35]:
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [36]:
# Load the dataset; later cells read its `text`, `business_id` and `review_id` columns.
final_file = pd.read_csv('preprocessing_and_embeddings/cleaned_dataset.csv')

In [37]:
# Word -> sentiment coefficient lookup table, built from the `words` and
# `sentiment_coeff` columns of the sentiment dictionary file.
sentiment_map = pd.read_csv('KMeans_clustering//sentiment_dictionary.csv')
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment_map['words'].values, sentiment_map['sentiment_coeff'].values)
}

Computing the tf-idf score of every word in every sentence, so that each word can later be replaced with its tf-idf weight:

In [38]:
# Work on a copy so that `final_file` itself is left untouched by the following cells.
file_weighting = final_file.copy()

In [39]:
# Whitespace tokenizer, and norm=None so the raw (un-normalised) tf-idf weights are kept.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass, instead of tokenising the whole corpus twice (fit, then transform).
transformed = tfidf.fit_transform(file_weighting.text)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only fall back to the old name on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# Positional lookup table: column j of `transformed` corresponds to features.iloc[j].
features = pd.Series(feature_names)



Replacing words in sentences with their tfidf scores

In [40]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tfidf score} dictionary for the single sentence held in row x.

    inspired by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes; only x.name
        is used here, as the row number of the sentence inside transformed_file
        (NOTE: assumes the dataframe index equals the row position, e.g. a
        default RangeIndex - confirm against the caller),
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix),
    features - names of all words in corpus used in TfidfVectorizer, ordered by
        matrix column (pandas Series, numpy array or list)

    returns - dict mapping every word stored in that matrix row to its tfidf score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Resolve column numbers to words in a separate array. The coordinate
    # attribute `col` of the sparse matrix is meant to hold integer indices, so
    # it must not be overwritten with strings.
    words = np.asarray(features)[vector_coo.col]
    return dict(zip(words, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Turn the sentence in row x into the list of tfidf scores of its words,
    in the order the words appear (whitespace-separated).

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    Raises KeyError if a word of the sentence has no entry in the row's
    tfidf dictionary.
    '''
    word_scores = create_tfidf_dictionary(x, transformed_file, features)
    return [word_scores[token] for token in x.text.split()]

In [41]:
%%time
# Row-wise apply over the whole corpus: each sentence becomes a list of tfidf scores.
# This is a slow step; the run recorded below took about 33 s of wall time.
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

CPU times: user 25.9 s, sys: 1.85 s, total: 27.7 s
Wall time: 33.2 s


Replacing words in sentences with their sentiment score

In [42]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score of a single word.

    word - the word to score
    sentiment_dict - mapping of word -> sentiment score

    returns the score stored for the word, or 0 when the word is not in
    sentiment_dict
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        # Unknown words are treated as neutral.
        return 0

In [43]:
# For every sentence, the list of sentiment scores of its whitespace-separated
# words (0 for words missing from sentiment_dict).
replaced_closeness_scores = file_weighting.text.apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

Merging both previous steps and getting the predictions:

In [44]:
# Stack the two per-sentence score lists and the identifying columns as rows,
# then transpose so that each of them becomes a column.
replacement_df = pd.DataFrame(
    data=[
        replaced_closeness_scores,
        replaced_tfidf_scores,
        file_weighting.text,
        file_weighting.business_id,
        file_weighting.review_id,
    ]
).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'business_id', 'review_id']

# Sentence score = dot product of the per-word sentiment coefficients and tfidf weights.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
# A strictly positive score is predicted as class 1, anything else as class 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')
# replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [45]:
# Preview the first rows: score lists, sentence, ids, sentiment_rate and prediction.
replacement_df.head()

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,business_id,review_id,sentiment_rate,prediction
0,"[0.8838982981511367, 0.9008086114248798, 0.906...","[2.393876517559999, 2.009711825974233, 6.14987...",if you decide to eat here just be aware it is ...,XQfwVwDr-v0ZS3_CbbE5Xw,KU_O5udG6zpxOg-VcAEodg,81.66376,1
1,"[-0.9134369709351798, 0.947611305133934, 0.956...","[5.186615426306712, 1.5492577432740098, 7.6979...",this is the second time we tried turning_point...,XQfwVwDr-v0ZS3_CbbE5Xw,VJxlBnJmCDIy8DFG0kjSow,265.887909,1
2,"[0.956277369160358, 0.8909423442102302, 0.9476...","[3.299124774870296, 4.089742885936934, 1.54925...",the place is cute and the staff was very frien...,XQfwVwDr-v0ZS3_CbbE5Xw,S6pQZQocMB1WHMjTRbt77A,100.572222,1
3,"[0.8860928278216307, 0.931627456293174, 0.9290...","[10.414923613302891, 6.425692546252347, 1.8909...",we came on a saturday_morning after waiting a ...,XQfwVwDr-v0ZS3_CbbE5Xw,WqgTKVqWVHDHjnjEsBvUgg,358.032481,1
4,"[0.9557299394709, -0.9019079713949786, 1.01417...","[5.504098941718361, 2.2049644267547137, 2.8955...",mediocre at best the decor is very nice and i ...,XQfwVwDr-v0ZS3_CbbE5Xw,M0wzFFb7pefOPcxeRVbLag,548.10158,1


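Reporting model's metrics (currently disabled: the commented-out cell below needs a ground-truth `sentiment` column, which `replacement_df` does not contain)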

In [46]:
# predicted_classes = replacement_df.prediction
# y_test = replacement_df.sentiment

# conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
# print('Confusion Matrix')
# display(conf_matrix)

# test_scores = accuracy_score(y_test,predicted_classes), precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

# print('\n \n Scores')
# scores = pd.DataFrame(data=[test_scores])
# scores.columns = ['accuracy', 'precision', 'recall', 'f1']
# scores = scores.T
# scores.columns = ['scores']
# display(scores)

In [47]:
# Persist the scored sentences to the working directory. With to_csv's default
# index=True the row index is written as well, as the first (unnamed) column.
replacement_df.to_csv('nlp_output.csv')