In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from nltk.corpus import wordnet
import re
from preprocessing import *
from numpy import array 
import numpy as np

In [3]:
w2v_pretrained = Word2Vec.load("w2vPreTrained")

In [18]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(subset = "verified_reviews", inplace = True)
print(dataset.shape)

(3150, 5)
(3150, 5)
(2998, 5)
(2196, 5)


In [19]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [20]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 412, 0: 206})


In [21]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [22]:
tokenized_reviews, sentences = tokenize_list_of_text(X_temp, custom_stopwords, False,2)

total number of types extracted is: 1798


In [23]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(tokenized_reviews) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, y, test_size = 0.20, random_state=10) #divisione in train e test

In [24]:
text_counts

<618x1222 sparse matrix of type '<class 'numpy.int64'>'
	with 6973 stored elements in Compressed Sparse Row format>

In [25]:
len(cv.vocabulary_)

1222

In [26]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'like': 520,
 'fact': 311,
 'answer': 44,
 'not_see': 679,
 'real': 822,
 'need': 638,
 'household': 452,
 'good': 377,
 'day': 208,
 'deal': 213,
 'sound': 967,
 'terrible': 1031,
 'want': 1151,
 'music': 624,
 'bose': 104,
 'sound terrible': 975,
 'good music': 383,
 'little': 538,
 'feature': 330,
 'stop': 998,
 'work': 1188,
 'week': 1171,
 'command': 160,
 'really': 825,
 'fun': 362,
 'stop work': 1000,
 'work week': 1206,
 'joke': 488,
 'worthless': 1211,
 'disappointed': 248,
 'plug': 755,
 'wall': 1150,
 'time': 1046,
 'fault': 327,
 'disagree': 246,
 'make': 595,
 'company': 163,
 'advance': 28,
 'sell': 909,
 'product': 784,
 'battery': 86,
 'return': 869,
 'apple': 52,
 'speaker': 979,
 'really disappointed': 826,
 'apple music': 53,
 'great': 393,
 'bass': 84,
 'idle': 459,
 'hot': 449,
 'miss': 614,
 'recognize': 836,
 'thing': 1036,
 'great sound': 411,
 'idle time': 461,
 'impressed': 466,
 'tightness': 1045,
 'stupid': 1007,
 'spotify': 988,
 'account': 14,
 'not_use':

In [27]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

1222

In [28]:
X_train[0,:] # prima recensione

<1x1222 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [29]:
print(X_train[0,:])

  (0, 520)	1
  (0, 1188)	1
  (0, 1171)	1
  (0, 106)	1
  (0, 550)	1
  (0, 556)	1
  (0, 844)	1
  (0, 643)	1
  (0, 734)	1
  (0, 1202)	1
  (0, 797)	1
  (0, 320)	1
  (0, 422)	2
  (0, 715)	1
  (0, 423)	1
  (0, 524)	1
  (0, 324)	1
  (0, 1172)	1


In [30]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['like', 'work', 'week', 'box', 'look', 'look like', 'refurbish',
        'new', 'perfectly', 'work perfectly', 'purchase', 'far', 'happy',
        'original', 'happy purchase', 'like new', 'far work', 'week far'],
       dtype='<U25')]

In [31]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

like 1
work 1
week 1
box 1
look 1
look like 1
refurbish 1
new 1
perfectly 1
work perfectly 1
purchase 1
far 1
happy 2
original 1
happy purchase 1
like new 1
far work 1
week far 1


In [32]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=800)  # feature selection
# select = SelectKBest(chi2, k="all")  # feature selection for balanced with pos filter
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [33]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

800

In [34]:
X_train_sel

<494x800 sparse matrix of type '<class 'numpy.int64'>'
	with 4262 stored elements in Compressed Sparse Row format>

In [35]:
print(X_test_sel[0,:])

  (0, 773)	1
  (0, 357)	1
  (0, 377)	1


In [36]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['box', 'far', 'far work', 'happy', 'happy purchase', 'like new',
       'look like', 'new', 'original', 'refurbish', 'week', 'week far',
       'work'], dtype='<U25')]


In [37]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [38]:
print(X_train_vec[0,:])

  (0, 773)	0.13215721023034152
  (0, 762)	0.3104364762375252
  (0, 761)	0.2545811907451864
  (0, 540)	0.22860996199910866
  (0, 457)	0.24922448598010277
  (0, 407)	0.1987259052528476
  (0, 352)	0.28446524749144747
  (0, 329)	0.27519571472618043
  (0, 265)	0.28446524749144747
  (0, 264)	0.4717708962714922
  (0, 201)	0.3104364762375252
  (0, 198)	0.1987259052528476
  (0, 72)	0.2673584364439322


In [39]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['box', 'far', 'far work', 'happy', 'happy purchase', 'like new',
        'look like', 'new', 'original', 'refurbish', 'week', 'week far',
        'work'], dtype='<U25')]

In [40]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

box 0.1322 1
far 0.3104 1
far work 0.2546 1
happy 0.2286 1
happy purchase 0.2492 1
like new 0.1987 1
look like 0.2845 1
new 0.2752 1


In [41]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [42]:
scores_df.sort_values(by = "tf_idf_score", ascending = False).head(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
1536,switch,1.0,1
1532,love,1.0,1
3924,need,1.0,1
385,perfect,1.0,1
1044,tell,1.0,1
2330,dont,1.0,1
3841,love,1.0,1
2327,good,1.0,1
1636,good,1.0,1
2253,nice,1.0,1


In [43]:
X_train_vec

<494x800 sparse matrix of type '<class 'numpy.float64'>'
	with 4262 stored elements in Compressed Sparse Row format>

In [47]:
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [48]:
len(predictions), sum(predictions)

(124, 85)

In [49]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.72      0.68      0.70        41
           1       0.85      0.87      0.86        83

    accuracy                           0.81       124
   macro avg       0.78      0.78      0.78       124
weighted avg       0.80      0.81      0.81       124



In [50]:
feature_names = cv.get_feature_names_out()
feature_names

array(["'", "' ll", "' ve", ..., 'yes', 'zero', 'zigbee'], dtype=object)

In [51]:
print(select.scores_)

[6.22348715 1.99393939 0.24772958 ... 1.99393939 3.98787879 7.97575758]


In [52]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

1222

In [53]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(26.1792791786454, True, 'great'),
 (22.24837695232832, True, 'send'),
 (22.24837695232832, True, 'month'),
 (18.399484203739526, True, 'bulb'),
 (13.95757575757576, True, 'actually'),
 (13.450001535107923, True, 'money'),
 (13.44703782400041, True, 'ask'),
 (13.265352056067726, True, 'buy'),
 (11.963636363636367, True, 'bridge'),
 (11.963636363636367, True, 'act')]

In [54]:
type(svm)

sklearn.svm._classes.LinearSVC

In [55]:
len(svm.coef_)

1

In [56]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

783

In [57]:
feats_w_classifier_weight[-100:] #features positive

[(0.3646362318165575, 'ring'),
 (0.37080613643352844, 'love want'),
 (0.3709112025589579, 'speaker good'),
 (0.3736134701528984, 'love thing'),
 (0.3829117561534839, 'clarity'),
 (0.3836083694066893, 'added'),
 (0.38571651966297316, 'sound quality'),
 (0.3879282601167612, 'tell'),
 (0.3894584632683591, 'light'),
 (0.38978820317079704, 'amazing'),
 (0.3901526395093828, 'way use'),
 (0.3906462084786386, 'microphone'),
 (0.3906462084786386, 'okay'),
 (0.39277292133734815, 'love buy'),
 (0.39314889938769404, 'variety'),
 (0.3941926166623807, 'new like'),
 (0.3944593518777128, 'primarily'),
 (0.39747616098284627, 'set'),
 (0.39861001110508215, 'job'),
 (0.3995695826752927, 'good buy'),
 (0.4005164606278419, 'bedside'),
 (0.4022928713606754, 'like new'),
 (0.40429630179373394, 'music'),
 (0.4049381579411509, 'ease'),
 (0.4049381579411509, 'ease use'),
 (0.4083090535755037, 'listen music'),
 (0.41220312989611607, 'additional'),
 (0.41413936616512004, 'product'),
 (0.4167321318970658, 'small')

In [58]:
feats_w_classifier_weight[:100] #features negative

[(-1.6295474858174477, 'idle'),
 (-1.2485570366394725, 'terrible'),
 (-1.1570638870444832, 'not_see'),
 (-1.1341104282546033, 'try'),
 (-1.0891477497330833, 'pay'),
 (-1.0531071779436063, 'disconnect'),
 (-0.9755813141274605, 'difficult'),
 (-0.968897081173351, 'useless'),
 (-0.9449259998818695, 'return'),
 (-0.9371587578881112, 'disappointed'),
 (-0.8691432679476301, 'speak'),
 (-0.8412122244767526, 'dont'),
 (-0.8326291806375559, 'poor'),
 (-0.8282920354166196, 'functionality'),
 (-0.8109489504879781, 'switch'),
 (-0.8059936051440455, 'make'),
 (-0.8026094895179263, 'cheap'),
 (-0.7835791242554105, 'meh'),
 (-0.7772260440939901, 'turn'),
 (-0.7742612097918373, 'poor quality'),
 (-0.7726445069929958, 'hue'),
 (-0.7721625846778426, 'obviate'),
 (-0.7676506474195964, 'slow'),
 (-0.76049212149354, 'maybe'),
 (-0.7545963372256022, 'not_youtube'),
 (-0.7545956662165216, 'worthless'),
 (-0.7537428686824256, 'party'),
 (-0.7461655937305928, 'sound terrible'),
 (-0.7376669251977346, 'month'),

In [59]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores", ascending = False).head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
782,1.993119,love
781,1.401219,great
780,1.23769,easy
779,0.912732,search
778,0.909007,good
777,0.843577,far
776,0.798825,best
775,0.774797,lol
774,0.763845,figure
773,0.763056,perfect


In [60]:
stringa = "the product is disappointing, not good at all"
clean = tokenize_list_of_text([stringa])[0]
clean

total number of types extracted is: 5


['the product disappointing evil all']

In [61]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[1]


In [62]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.46      0.90      0.61        21
           1       0.98      0.79      0.87       103

    accuracy                           0.81       124
   macro avg       0.72      0.85      0.74       124
weighted avg       0.89      0.81      0.83       124



In [63]:
len(Y_test)

124

In [64]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear', 'poly'],
              'class_weight':['balanced', None]
}

grid = GridSearchCV(SVC(), param_grid, refit ='accuracy', scoring="accuracy", cv=kfold, verbose=3)
grid.fit(X_train_vec,Y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.808 total time=   0.0s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.919 total time=   0.0s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.818 total time=   0.0s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.717 total time=   0.0s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.408 total time=   0.0s
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.677 total time=   0.0s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.677 total time=   0.0s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.747 total time=   0.0s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.646 total time=   0.0s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=lin

In [65]:
print(grid.best_params_)

{'C': 1, 'class_weight': None, 'gamma': 1, 'kernel': 'linear'}


In [66]:
grid_predictions = grid.predict(X_test_vec)

print(metrics.classification_report(Y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.76      0.68      0.72        41
           1       0.85      0.89      0.87        83

    accuracy                           0.82       124
   macro avg       0.80      0.79      0.79       124
weighted avg       0.82      0.82      0.82       124

