In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from nltk.corpus import wordnet
import re
from preprocessing import *
from numpy import array 
import numpy as np

In [12]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(inplace = True)
print(dataset.shape)

(3150, 5)
(3150, 5)
(2998, 5)
(2322, 5)


In [13]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [14]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 442, 0: 221})


In [15]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [16]:
tokenized_reviews, sentences = tokenize_list_of_text(X_temp, custom_stopwords, pos_filter=False, pos_list = pos_list)

total number of types extracted is: 1810


In [17]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(tokenized_reviews) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, y, test_size = 0.20, random_state=10) #divisione in train e test

In [18]:
text_counts

<663x1177 sparse matrix of type '<class 'numpy.int64'>'
	with 6818 stored elements in Compressed Sparse Row format>

In [19]:
len(cv.vocabulary_)

1177

In [20]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'like': 524,
 'fact': 321,
 'answer': 42,
 "n't_see": 639,
 'real': 801,
 'need': 647,
 'household': 447,
 'good': 386,
 'day': 222,
 'deal': 229,
 'like fact': 529,
 'sound': 932,
 'terrible': 1003,
 'want': 1115,
 'music': 629,
 'bose': 104,
 'sound terrible': 938,
 'little': 542,
 'feature': 340,
 'stop': 967,
 'work': 1149,
 'week': 1137,
 'command': 167,
 'really': 804,
 'fun': 370,
 'stop work': 968,
 'work week': 1165,
 'command really': 168,
 'joke': 493,
 'worthless': 1168,
 'disappointed': 261,
 'plug': 743,
 'wall': 1114,
 'time': 1017,
 'fault': 337,
 'make': 595,
 'company': 171,
 'advance': 25,
 'sell': 887,
 'product': 768,
 'battery': 83,
 'return': 849,
 'apple': 51,
 'speaker': 941,
 'really disappointed': 805,
 'apple music': 52,
 'great': 397,
 'bass': 81,
 'idle': 455,
 'miss': 618,
 'recognize': 820,
 'thing': 1007,
 'great sound': 411,
 'idle time': 457,
 'impress': 463,
 'tightness': 1016,
 'stupid': 975,
 'spotify': 958,
 'account': 13,
 "n't_use": 641,
 'rand

In [21]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

1177

In [22]:
X_train[0,:] # prima recensione

<1x1177 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [23]:
print(X_train[0,:])

  (0, 768)	1
  (0, 815)	1
  (0, 1170)	1
  (0, 309)	1
  (0, 1005)	1
  (0, 475)	1
  (0, 308)	1


In [24]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['product', 'receive', 'wrong', 'excitement', 'thank', 'install',
        'excited'], dtype='<U26')]

In [25]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

product 1
receive 1
wrong 1
excitement 1
thank 1
install 1
excited 1


In [26]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=1000)  # feature selection
# select = SelectKBest(chi2, k="all")  # feature selection for balanced with pos filter
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [27]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

1000

In [28]:
X_train_sel

<530x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 4498 stored elements in Compressed Sparse Row format>

In [29]:
print(X_test_sel[0,:])

  (0, 304)	1
  (0, 974)	1
  (0, 980)	1


In [30]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['excited', 'excitement', 'install', 'product', 'receive', 'wrong'],
      dtype='<U26')]


In [31]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [32]:
print(X_train_vec[0,:])

  (0, 994)	0.4146904866706988
  (0, 680)	0.40303176139531016
  (0, 637)	0.25585867163761705
  (0, 383)	0.42847983190630784
  (0, 240)	0.44535662651183466
  (0, 239)	0.46711459899128316


In [33]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['excited', 'excitement', 'install', 'product', 'receive', 'wrong'],
       dtype='<U26')]

In [34]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

excited 0.4147 1
excitement 0.403 1
install 0.2559 1
product 0.4285 1
receive 0.4454 1
wrong 0.4671 1


In [35]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [36]:
scores_df.sort_values(by = "tf_idf_score", ascending = False).head(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
1715,work,1.0,1
2307,great,1.0,1
2287,love,1.0,1
1599,good,1.0,1
3893,fun,1.0,1
639,apps,1.0,1
1598,love,1.0,1
2760,love,1.0,1
3923,worthless,1.0,1
1562,useless,1.0,1


In [37]:
X_train_vec

<530x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4498 stored elements in Compressed Sparse Row format>

In [38]:
svm = LinearSVC(max_iter=3000, class_weight = "balanced")  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [39]:
len(predictions), sum(predictions)

(133, 79)

In [40]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.70      0.90      0.79        42
           1       0.95      0.82      0.88        91

    accuracy                           0.85       133
   macro avg       0.83      0.86      0.84       133
weighted avg       0.87      0.85      0.85       133



In [41]:
feature_names = cv.get_feature_names_out()
feature_names

array(["'", "' ll", "' ve", ..., 'yes', 'zero', 'zigbee'], dtype=object)

In [42]:
print(select.scores_)

[6.58871965 1.96089385 0.50997151 ... 3.92178771 0.23543268 7.84357542]


In [43]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

1177

In [44]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(56.08629265401997, True, 'love'),
 (22.277984851009588, True, 'great'),
 (20.648442598163268, True, 'money'),
 (19.509007732233695, True, 'device'),
 (18.762063166160004, True, 'month'),
 (14.134237374460838, True, 'plug'),
 (12.22301980154246, True, 'fix'),
 (11.765363128491622, True, 'customer'),
 (11.765363128491622, True, 'bridge'),
 (11.454593685506167, True, 'product')]

In [45]:
type(svm)

sklearn.svm._classes.LinearSVC

In [46]:
len(svm.coef_)

1

In [47]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

985

In [48]:
feats_w_classifier_weight[-100:] #features positive

[(0.3745648651510574, 'likely'),
 (0.3753863439651026, 'small speaker'),
 (0.3770420234323098, 'not_disappoint'),
 (0.3784534932164922, 'brand'),
 (0.38646233845648614, 'walk'),
 (0.38654203207370863, 'flash briefing'),
 (0.3883648398457712, 'ship'),
 (0.39164269223350584, 'best thing'),
 (0.39333496669658335, 'command'),
 (0.396113218983499, 'life'),
 (0.3962907140037131, 'phone app'),
 (0.39702924982330245, "n't_tell"),
 (0.3978723261452342, 'control'),
 (0.401164930341507, 'excited'),
 (0.40484102808211736, 'need buy'),
 (0.40745315639801566, 'ask play music'),
 (0.41590261683407026, 'favorite'),
 (0.4167765904876177, 'speaker bedroom'),
 (0.4177398114202369, 'far like'),
 (0.4203681167002176, 'problem'),
 (0.42160601147903887, 'sale'),
 (0.4223072557252193, 'spell'),
 (0.4223072557252193, 'trial'),
 (0.42858019690008764, 'device house'),
 (0.4301846755979269, 'loud'),
 (0.4383191010575247, 'amazing device'),
 (0.4394728632014534, 'alarm clock'),
 (0.44002534131992305, 'plus build')

In [49]:
feats_w_classifier_weight[:100] #features negative

[(-1.2343758596700953, 'idle'),
 (-1.1567222519074816, 'terrible'),
 (-1.1478718021885217, 'try'),
 (-1.1191801263990153, 'difficult'),
 (-1.1063142287217262, 'stop'),
 (-1.0279613153323213, 'disappointed'),
 (-1.0278790488695455, 'money'),
 (-0.98102463084639, 'figure use'),
 (-0.9549700172559542, 'send'),
 (-0.9071518000347977, 'sound terrible'),
 (-0.8997330639456496, 'cheap'),
 (-0.8966194252749441, 'mode'),
 (-0.8940142756581723, "n't_see"),
 (-0.8428564900569561, 'return'),
 (-0.8310712366015081, 'party'),
 (-0.8010744183303478, 'evil'),
 (-0.7971552633021901, 'start'),
 (-0.7950168351374739, 'sell'),
 (-0.7938271551430874, 'useless'),
 (-0.7924478655005118, 'leave'),
 (-0.7917805641457109, 'adapter'),
 (-0.7802554009912361, 'stupid'),
 (-0.7782510034812938, 'turn'),
 (-0.7777828301699016, 'disappointing'),
 (-0.7777642614528028, 'dont'),
 (-0.7777642614528028, 'dont know'),
 (-0.7657564809174603, 'new refurbish'),
 (-0.7569176603286438, 'pay'),
 (-0.7530213016484771, 'worthlessn

In [50]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores", ascending = False).head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
984,2.590864,love
983,1.811143,great
982,1.474898,easy
981,1.05231,best
980,1.028758,good
979,0.990851,enjoy
978,0.976349,learn
977,0.892057,room
976,0.879941,work expect
975,0.875217,fun


In [51]:
stringa = "the product is disappointing, not good at all"
clean = tokenize_list_of_text([stringa])[0]
clean

total number of types extracted is: 5


['the product disappointing evil all']

In [52]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [53]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.45      0.86      0.59        22
           1       0.97      0.79      0.87       111

    accuracy                           0.80       133
   macro avg       0.71      0.83      0.73       133
weighted avg       0.88      0.80      0.83       133



In [54]:
len(Y_test)

133

In [55]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear', 'poly'],
              'class_weight':['balanced', None]
}

grid = GridSearchCV(SVC(), param_grid, refit ='f1_macro', scoring="f1_macro", cv=kfold, verbose=3)
grid.fit(X_train_vec,Y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.329 total time=   0.0s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.285 total time=   0.0s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.315 total time=   0.0s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.315 total time=   0.0s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.378 total time=   0.0s
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.495 total time=   0.0s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.460 total time=   0.0s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.460 total time=   0.0s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.460 total time=   0.0s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=lin

In [56]:
print(grid.best_params_)

{'C': 1, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}


In [57]:
grid_predictions = grid.predict(X_test_vec)

print(metrics.classification_report(Y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.77      0.86      0.81        42
           1       0.93      0.88      0.90        91

    accuracy                           0.87       133
   macro avg       0.85      0.87      0.86       133
weighted avg       0.88      0.87      0.87       133

