In [None]:
# Import the pickle with the cleaned text - the only preprocessing done so far is removing non-alphanumeric characters and lowercasing the text
# Next, remove stopwords and lemmatize or stem the text
# Then use LDA to parse topics


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')


In [None]:
import pandas as pd
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

import matplotlib.pyplot as plt
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import numpy as np
#from kneed import KneeLocator # ! pip install kneed

In [None]:
# Read in the data
# The context manager closes the file handle once the DataFrame is loaded.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles produced by this project.
with open('/Users/Melissa/Desktop/NLP_Fall2021/final_project/reviews_with_topics.pkl', 'rb') as reviews_file:
    reviews = pickle.load(reviews_file)


In [None]:
# Stopword setup: NLTK's English stopword list plus a few extra words.
# This is the same processing as for the topic modeling, just without the str_split.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # makes sure the stopword corpus is available locally (see the [nltk_data] log output)
sw = stopwords.words('english')
# `sw` is a plain list extended in place; rem_sw tests each token for membership in it.
sw.extend(['many', 'good', 'like', 'liked', 'well', 'great', 'get', 'also', 'really', 'very', 'put'])
 
# Remove custom stopwords
def rem_sw(var):
    """Return `var` with every token that appears in the stopword list `sw` removed.

    `var` is split on whitespace, each token is compared exactly
    (case-sensitively) against the module-level list `sw`, and the surviving
    tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in var.split():
        if token in sw:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Remove words less than 3 characters
def length_fun(var):
    """Return `var` keeping only whitespace-separated tokens of 3+ characters.

    The kept tokens are re-joined with single spaces, so runs of whitespace in
    the input collapse.
    """
    return ' '.join(filter(lambda token: len(token) >= 3, var.split()))

# Stem text
def stem_fun(var):
    """Return `var` with each whitespace-separated token replaced by its Porter stem.

    A new PorterStemmer is created on every call; the stemmed tokens are
    re-joined with single spaces.
    """
    from nltk.stem.porter import PorterStemmer

    porter = PorterStemmer()
    return ' '.join(map(porter.stem, var.split()))

# Lemmatize the text
# Download the NLTK resources and build one shared WordNetLemmatizer that
# lemmatize_fun reuses on every call.
# NOTE(review): the visible cells tokenize with str.split(), not punkt - confirm the punkt download is still needed.
nltk.download('punkt')
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize_fun(var):
    """Return `var` with each whitespace-separated token lemmatized.

    Every token is passed through the module-level `wordnet_lemmatizer`
    (called with its default arguments) and the results are re-joined with
    single spaces.
    """
    lemmas = []
    for token in var.split():
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


# Build review_body_clean_3 from review_body_clean (text already reduced to lowercase letters and numbers):
# stopwords removed, words shorter than 3 characters removed, lemmatized (not stemmed here),
# then stopwords removed a second time.
# NOTE(review): the second rem_sw pass presumably drops lemmas that are themselves stopwords - confirm.
reviews['review_body_clean_3'] = reviews['review_body_clean'].apply(rem_sw).apply(length_fun).apply(lemmatize_fun).apply(rem_sw) 


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Melissa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Melissa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Melissa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Embeddings for Review Classification 


In [None]:
#! pip install --upgrade gensim
#from gensim.models import Word2Vec

In [None]:
reviews['review_category'].value_counts()

# Flag reviews by sentiment
def review_sentiment(df):
    """Map one review's star rating to a sentiment label.

    Parameters
    ----------
    df : mapping-like
        Anything supporting ``df['star_rating']``, e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    str or None
        'positive' for ratings >= 4, 'negative' for ratings <= 2, 'neutral'
        for anything strictly between 2 and 4, and None when the rating is
        missing (None / NaN / pd.NA).
    """
    rating = df['star_rating']
    # A missing rating has no sentiment; return None explicitly instead of
    # falling off the end of the function (or raising on a None comparison).
    if pd.isna(rating):
        return None
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    # The bands are contiguous, so a fractional rating such as 2.5 or 3.5
    # is labelled rather than silently becoming None.
    return 'neutral'

reviews['review_category'] = reviews.apply(review_sentiment, axis = 1)

In [None]:
reviews['review_category'].value_counts()

positive    42252
neutral      4830
negative     3935
Name: review_category, dtype: int64

In [None]:
# review_body_clean_3 is the input that the embeddings are created from
# (the only difference from the topic-modeling text is that it is not split into tokens at the end)
# review_category is the dependent variable in the predictive model

the_path = '/Users/Melissa/Desktop/NLP_Fall2021/final_project/'

def extract_embeddings_domain(df_in, num_vec_in, path_in):
    """Train a Word2Vec model on the documents and embed each document as the
    mean of its word vectors.

    Parameters
    ----------
    df_in : pandas.Series of str
        One whitespace-delimited document per row. Missing values are treated
        as empty documents.
    num_vec_in : int
        Word-vector dimensionality (passed to Word2Vec as ``vector_size``).
    path_in : str
        Prefix the two pickle file names are appended to by plain string
        concatenation, so it should end with a path separator.

    Returns
    -------
    tuple
        ``(tmp_data, wrd_dict, model)``: a DataFrame of document vectors (one
        row per document, same index as `df_in`, columns ``0..num_vec_in-1``),
        the model's ``key_to_index`` vocabulary mapping, and the trained model.

    Side effects
    ------------
    Pickles the model to ``path_in + 'melissa_embeddings_domain_model.pkl'``
    and the document vectors to ``path_in + 'melissa_embeddings_df_domain.pkl'``.
    """
    # Local import: the notebook's import cell does not import Word2Vec by name.
    from gensim.models import Word2Vec

    tokenised = df_in.fillna('').str.split()

    model = Word2Vec(tokenised, min_count=1, vector_size=num_vec_in, workers=3, window=5, sg=0)
    wrd_dict = model.wv.key_to_index

    def get_score(tokens):
        # Skip out-of-vocabulary words individually instead of abandoning the
        # rest of the document at the first lookup failure.
        vectors = [model.wv[word] for word in tokens if word in wrd_dict]
        if not vectors:
            # Empty document: all-zero vector (the same values the previous
            # NaN-mean + fillna(0) path produced, without the warning).
            return np.zeros(num_vec_in, dtype=np.float32)
        return np.mean(vectors, axis=0)

    if len(tokenised):
        matrix = np.vstack([get_score(tokens) for tokens in tokenised])
    else:
        matrix = np.empty((0, num_vec_in), dtype=np.float32)
    # Keep the caller's index so the vectors can be concatenated column-wise
    # with other frames built from the same reviews.
    tmp_data = pd.DataFrame(matrix, index=df_in.index)

    with open(path_in + 'melissa_embeddings_domain_model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(path_in + 'melissa_embeddings_df_domain.pkl', 'wb') as data_file:
        pickle.dump(tmp_data, data_file)
    return tmp_data, wrd_dict, model


In [None]:
emb_data, word_dict, emb_domain_model =  extract_embeddings_domain(reviews.review_body_clean_3, 300, the_path)


In [None]:
reviews.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'review_category',
       'review_body_clean', 'review_body_clean_2', 'topic_1', 'topic_2',
       'topic_3', 'topic_4', 'topic_5', 'topic_6', 'topic_7',
       'review_body_clean_3'],
      dtype='object')

### Append the embeddings to the reviews dataframe as additional columns

In [None]:
# Predictors: the seven topic columns placed side by side with the document
# embeddings (rows are matched on the DataFrame index). Target: review_category.
categories = reviews['review_category']
topics = reviews[[f'topic_{topic_no}' for topic_no in range(1, 8)]]
topics_embeddings = pd.concat([topics, emb_data], axis='columns')


In [None]:
# Must encode the target variable
# LabelEncoder turns the sentiment strings into integer codes. Per the
# value_counts outputs shown for `categories` and `categories_encoded`, the
# resulting mapping is negative -> 0, neutral -> 1, positive -> 2.

from sklearn import preprocessing
labeler = preprocessing.LabelEncoder()
categories_encoded = labeler.fit_transform(categories)


In [None]:
categories.value_counts()

positive    42252
neutral      4830
negative     3935
Name: review_category, dtype: int64

In [None]:
pd.DataFrame(categories_encoded).value_counts()

2    42252
1     4830
0     3935
dtype: int64

In [None]:
# Dump the dataframes
#pickle.dump(topics_embeddings, open('/Users/Melissa/Desktop/NLP_Fall2021/final_project/topics_embeddings_df.pkl', 'wb'))

#pickle.dump(categories, open('/Users/Melissa/Desktop/NLP_Fall2021/final_project/categories_df_CORRECTED.pkl', 'wb'))


In [None]:
# Random Forest
# Model with the topics and the embeddings as predictors of review category
def my_model_fun_grid(df_in, label_in, scoring=None):
    """Grid-search a random forest on an 80/20 split and evaluate it on the held-out 20%.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Predictor matrix. Column labels are converted to strings on a copy,
        because recent scikit-learn releases reject frames whose column names
        mix strings and integers.
    label_in : array-like
        Target labels, one per row of `df_in`.
    scoring : str or callable, optional
        Forwarded to ``GridSearchCV``. The default (None) keeps the
        estimator's own score, i.e. accuracy, as before.

    Returns
    -------
    tuple
        ``(my_model_opt, model_metrics, the_preds)``: the best estimator
        refit on the training split, the ``classification_report`` text for
        the test split, and a DataFrame of test-set class probabilities whose
        columns are the model's ``classes_``.
    """
    # Local imports: the notebook's import cell does not import scikit-learn.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report
    from sklearn.model_selection import GridSearchCV, train_test_split

    if hasattr(df_in, 'columns'):
        # rename returns a new frame, so the caller's DataFrame is untouched.
        df_in = df_in.rename(columns=str)

    my_model = RandomForestClassifier(random_state=123)
    parameters = {'n_estimators': [10, 100], 'max_depth': [None, 1, 10],
                  'random_state': [123], 'class_weight': ['balanced']}
    my_grid = GridSearchCV(my_model, parameters, scoring=scoring)

    # 80/20 train/test split.
    # NOTE(review): the split is not stratified; with imbalanced classes consider stratify=label_in.
    X_train, X_test, y_train, y_test = train_test_split(df_in, label_in, test_size=0.20, random_state=123)
    my_grid.fit(X_train, y_train)
    print("Best Score:", my_grid.best_score_)

    # GridSearchCV refits the winning configuration on all of X_train by
    # default (refit=True), so reuse that estimator instead of training an
    # identical forest a second time.
    my_model_opt = my_grid.best_estimator_
    y_pred = my_model_opt.predict(X_test)
    model_metrics = classification_report(y_test, y_pred)

    # Class-membership probabilities for the held-out rows.
    the_preds = pd.DataFrame(my_model_opt.predict_proba(X_test))
    the_preds.columns = my_model_opt.classes_
    return my_model_opt, model_metrics, the_preds


In [None]:
model1, metrics1, preds1 = my_model_fun_grid(topics_embeddings, categories_encoded)

Best Score: 0.8351505347594465


In [None]:
print('Random Forest')
print(metrics1)

Random Forest
              precision    recall  f1-score   support

           0       0.60      0.12      0.20       794
           1       0.41      0.02      0.03       976
           2       0.84      1.00      0.91      8434

    accuracy                           0.83     10204
   macro avg       0.61      0.38      0.38     10204
weighted avg       0.78      0.83      0.77     10204



In [None]:
# KNN Classifier

def my_model_fun_grid_2(df_in, label_in, scoring=None):
    """Grid-search a k-nearest-neighbours classifier on an 80/20 split and
    evaluate it on the held-out 20%.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Predictor matrix. Column labels are converted to strings on a copy,
        because recent scikit-learn releases reject frames whose column names
        mix strings and integers.
    label_in : array-like
        Target labels, one per row of `df_in`.
    scoring : str or callable, optional
        Forwarded to ``GridSearchCV``. The default (None) keeps the
        estimator's own score, i.e. accuracy, as before.

    Returns
    -------
    tuple
        ``(my_model_opt, model_metrics, the_preds)``: the best estimator
        refit on the training split, the ``classification_report`` text for
        the test split, and a DataFrame of test-set class probabilities whose
        columns are the model's ``classes_``.
    """
    # Local imports: the notebook's import cell does not import scikit-learn.
    from sklearn.metrics import classification_report
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    if hasattr(df_in, 'columns'):
        # rename returns a new frame, so the caller's DataFrame is untouched.
        df_in = df_in.rename(columns=str)

    my_model = KNeighborsClassifier()
    parameters = {'n_neighbors': [5, 50, 100]}
    my_grid = GridSearchCV(my_model, parameters, scoring=scoring)

    # 80/20 train/test split.
    # NOTE(review): the split is not stratified; with imbalanced classes consider stratify=label_in.
    X_train, X_test, y_train, y_test = train_test_split(df_in, label_in, test_size=0.20, random_state=123)
    my_grid.fit(X_train, y_train)
    print("Best Score:", my_grid.best_score_)

    # GridSearchCV refits the winning configuration on all of X_train by
    # default (refit=True), so reuse that estimator instead of fitting an
    # identical classifier a second time.
    my_model_opt = my_grid.best_estimator_
    y_pred = my_model_opt.predict(X_test)
    model_metrics = classification_report(y_test, y_pred)

    # Class-membership probabilities for the held-out rows.
    the_preds = pd.DataFrame(my_model_opt.predict_proba(X_test))
    the_preds.columns = my_model_opt.classes_
    return my_model_opt, model_metrics, the_preds


In [None]:
model2, metrics2, preds2 = my_model_fun_grid_2(topics_embeddings, categories_encoded)

Best Score: 0.8357631297116642


In [None]:
print('K-Nearest Neighbors Classifier')
print(metrics2)

K-Nearest Neighbors Classifier
              precision    recall  f1-score   support

           0       0.56      0.13      0.21       794
           1       0.39      0.03      0.05       976
           2       0.84      0.99      0.91      8434

    accuracy                           0.83     10204
   macro avg       0.60      0.38      0.39     10204
weighted avg       0.78      0.83      0.78     10204



In [None]:
model2

KNeighborsClassifier(n_neighbors=50)