In [1]:
import pandas as pd
import numpy as np


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.stem.snowball import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score


In [23]:
df = pd.read_csv(r"data/data_trustpilot.csv")
df.head()

Unnamed: 0,rating,location,username,number_reviews,verification,repeat_reviewer,repeat_reviewer_encoded,company,text,text_processed,...,date_posted,local_date_posted,month_local,local_hour,time_of_day,day_of_week_posted,day_type,days_between_experience_and_post,review_time,review_time_encoded
0,5,CA,Rob Crane,2,Redirected,repeat,1,Flashbay,The company rep I worked with made my transact...,company rep worked made transaction smooth qui...,...,2024-10-23 04:17:44,2024-10-22,10,21,Evening,1,Business Day,129,late_review,0
1,5,US,Pat Anderson,1,Verified,one-time,0,Flashbay,I highly recommend using Flashbay. Immediately...,highly recommend using flashbay immediately or...,...,2024-10-16 19:34:05,2024-10-16,10,12,Business Hours,2,Business Day,0,quick_review,1
2,5,CZ,Margarita Orlova,1,Verified,one-time,0,Flashbay,I had the pleasure of working with Shelby Gibs...,pleasure working shelby gibson large order nee...,...,2024-10-17 10:27:44,2024-10-17,10,10,Business Hours,3,Business Day,7,late_review,0
3,5,US,Paola Rivas,1,Verified,one-time,0,Flashbay,I had a fantastic experience with Brian Truong...,fantastic experience brian truong attentive tr...,...,2024-10-21 22:38:50,2024-10-21,10,15,Business Hours,0,Business Day,0,quick_review,1
4,5,CA,Fiona Mckelvey Keenan,3,Not Verified,repeat,1,Flashbay,My number-one go-to for computer accessories. ...,numberone goto computer accessories rachel sup...,...,2024-10-23 04:09:05,2024-10-22,10,21,Evening,1,Business Day,103,late_review,0


In [24]:
X= df["text_processed"]
y = df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 30)


In [25]:
# BoW
vectorizer = CountVectorizer()
X_train_BoW = vectorizer.fit_transform(X_train)
X_test_BoW = vectorizer.transform(X_test)

In [27]:
#classifier on basis of Bow
clf_BoW = neighbors.KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski')
clf_BoW.fit(X_train_BoW, y_train)
y_pred_BoW = clf_BoW.predict(X_test_BoW)

In [28]:
# results classification with BoW
print(classification_report(y_test, y_pred_BoW))

cm_K_BoW = pd.crosstab(y_test, y_pred_BoW, rownames=['real'], colnames=['Pred'])
print(cm_K_BoW) 

              precision    recall  f1-score   support

           1       0.64      0.48      0.54      3279
           2       0.17      0.06      0.09       854
           3       0.29      0.09      0.14      1271
           4       0.17      0.06      0.09      1248
           5       0.63      0.94      0.76      6223

    accuracy                           0.59     12875
   macro avg       0.38      0.33      0.33     12875
weighted avg       0.52      0.59      0.53     12875

Pred     1    2    3    4     5
real                           
1     1560  106  115   96  1402
2      303   55   68   52   376
3      301   89  117   98   666
4      141   38   47   78   944
5      147   31   51  145  5849


In [54]:
# grid search
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski']}
grid_search_BoW = GridSearchCV(estimator=clf_BoW, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_BoW.fit(X_train_BoW, y_train)

print("Best Parameters:", grid_search_BoW.best_params_)
print("Best Cross-Validation Score:", grid_search_BoW.best_score_)

 nan nan nan nan nan nan nan nan nan nan nan nan]


Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best Cross-Validation Score: nan


In [56]:
# Tfidf
vec_tfidf = TfidfVectorizer()
X_train_tfidf = vec_tfidf.fit_transform(X_train)
X_test_tfidf = vec_tfidf.transform(X_test)

In [57]:
#classifier on basis of Tfidf
clf_tfidf = neighbors.KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski')
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [58]:
# results classification with Tfidf
print(classification_report(y_test, y_pred_tfidf))

cm_K_tfidf = pd.crosstab(y_test, y_pred_tfidf, rownames=['real'], colnames=['Pred'])
print(cm_K_tfidf) 

              precision    recall  f1-score   support

           1       0.66      0.71      0.69      3279
           2       0.20      0.13      0.16       854
           3       0.33      0.18      0.23      1271
           4       0.25      0.09      0.14      1248
           5       0.75      0.92      0.83      6223

    accuracy                           0.66     12875
   macro avg       0.44      0.41      0.41     12875
weighted avg       0.60      0.66      0.62     12875

Pred     1    2    3    4     5
real                           
1     2343  139  149   60   588
2      366  113  114   49   212
3      403  159  224  102   383
4      198   76  131  118   725
5      232   66   69  144  5712


In [59]:
grid_search_tfidf = GridSearchCV(estimator=clf_tfidf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_tfidf.fit(X_train_tfidf, y_train)

print("Best Parameters:", grid_search_tfidf.best_params_)
print("Best Cross-Validation Score:", grid_search_tfidf.best_score_)

        nan        nan        nan        nan 0.47764034 0.4784559
 0.47789292 0.48092207 0.48022302 0.48253373 0.48738812 0.4880095
 0.48830076 0.48824252        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan]


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'uniform'}
Best Cross-Validation Score: 0.4883007625398639


In [34]:
# w2v vector creation
def get_review_vector(review, model):
    vectors = [model.wv[word] for word in review if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [None]:
# creating list of vectors
word2vec_model = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4)
X_train_vectors = [get_review_vector(review, word2vec_model) for review in X_train]
X_test_vectors = [get_review_vector(review, word2vec_model) for review in X_test]

In [36]:
# converting list of vectors into arrays
X_train_w2v = np.array(X_train_vectors)
X_test_w2v = np.array(X_test_vectors)

In [37]:
#classifier on basis of Word2Vec
clf_w2v = neighbors.KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski')
clf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_test_w2v)

In [38]:
# results classification with Word2Vec
print(classification_report(y_test, y_pred_w2v))

cm_K_w2v = pd.crosstab(y_test, y_pred_w2v, rownames=['real'], colnames=['Pred'])
print(cm_K_w2v) 

              precision    recall  f1-score   support

           1       0.44      0.71      0.55      3279
           2       0.11      0.11      0.11       854
           3       0.20      0.14      0.17      1271
           4       0.15      0.04      0.07      1248
           5       0.71      0.63      0.66      6223

    accuracy                           0.51     12875
   macro avg       0.32      0.33      0.31     12875
weighted avg       0.50      0.51      0.49     12875

Pred     1    2    3    4     5
real                           
1     2321  176  146   62   574
2      443   97  106   22   186
3      505  150  178   53   385
4      430  127  148   53   490
5     1522  313  300  174  3914


In [60]:
grid_search_w2v = GridSearchCV(estimator=clf_w2v, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_w2v.fit(X_train_w2v, y_train)

print("Best Parameters:", grid_search_w2v.best_params_)
print("Best Cross-Validation Score:", grid_search_w2v.best_score_)

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
Best Cross-Validation Score: 0.5678556500442593


In [39]:
# loading GloVe embedding
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Path to GloVe file
glove_file = "data/glove.6B.300d.txt"  
glove_embeddings = load_glove_embeddings(glove_file)

In [44]:
# GloVe vector creation
def get_review_vector_glove(review, embeddings, dim = 100):
    vectors_glove = [embeddings[word] for word in review if word in embeddings]
    if not vectors_glove:
        return np.zeros(dim)
    return np.mean(vectors_glove, axis=0)

In [45]:
# creating list of glove vectors

X_train_glove = np.array([get_review_vector_glove(review, glove_embeddings) for review in X_train])
X_test_glove = np.array([get_review_vector_glove(review, glove_embeddings) for review in X_test])


In [46]:
# converting list of vectors into arrays
X_train_GloVe = np.array(X_train_glove)
X_test_GloVe = np.array(X_test_glove)

In [47]:
#classifier on basis of GloVe
clf_GloVe = neighbors.KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski')
clf_GloVe.fit(X_train_GloVe, y_train)
y_pred_GloVe = clf_GloVe.predict(X_test_GloVe)

In [48]:
# results classification with GloVe
print(classification_report(y_test, y_pred_GloVe))

cm_K_GloVe = pd.crosstab(y_test, y_pred_GloVe, rownames=['real'], colnames=['Pred'])
print(cm_K_GloVe) 

              precision    recall  f1-score   support

           1       0.41      0.69      0.51      3279
           2       0.10      0.09      0.10       854
           3       0.17      0.10      0.13      1271
           4       0.17      0.05      0.07      1248
           5       0.67      0.59      0.63      6223

    accuracy                           0.48     12875
   macro avg       0.31      0.30      0.29     12875
weighted avg       0.47      0.48      0.46     12875

Pred     1    2    3    4     5
real                           
1     2261  158  141   47   672
2      453   81   85   29   206
3      538  153  131   48   401
4      483   99  124   59   483
5     1823  298  288  160  3654


In [61]:
grid_search_GloVe = GridSearchCV(estimator=clf_GloVe, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_GloVe.fit(X_train_GloVe, y_train)

print("Best Parameters:", grid_search_GloVe.best_params_)
print("Best Cross-Validation Score:", grid_search_GloVe.best_score_)

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
Best Cross-Validation Score: 0.5532728259978111
