In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from nltk.corpus import wordnet
import re
from preprocessing import *
from numpy import array 

In [19]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3 stelle
print(dataset.shape)

(2998, 5)


In [20]:
dataset = drop_positive_duplicates(dataset)
dataset.shape

(2453, 5)

In [21]:
tokenized_reviews, sentences = tokenize_list_of_text(dataset["verified_reviews"], custom_stopwords, pos_filter=False, pos_list = pos_list)

dataset["verified_reviews"] = tokenized_reviews

total number of types extracted is: 3174


In [23]:
# se si vuole bilanciare il dataset con undersampling eseguire la cella
# dataset = undersample_positive(dataset)

In [24]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 4) #count == frequenza
text_counts = cv.fit_transform(dataset["verified_reviews"]) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10) #divisione in train e test

In [25]:
text_counts

<2453x1620 sparse matrix of type '<class 'numpy.int64'>'
	with 26210 stored elements in Compressed Sparse Row format>

In [26]:
len(cv.vocabulary_)

1620

In [27]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'like': 718,
 'fact': 449,
 'answer': 59,
 'real': 1097,
 'need': 909,
 'household': 624,
 'good': 526,
 'bargain': 109,
 'day': 312,
 'deal': 316,
 'like fact': 722,
 'fact answer': 450,
 'sound': 1288,
 'terrible': 1385,
 'want': 1537,
 'music': 875,
 'bose': 148,
 'sound terrible': 1300,
 'good music': 531,
 'little': 747,
 'feature': 473,
 'stop': 1337,
 'work': 1579,
 'week': 1565,
 'command': 235,
 'really': 1099,
 'fun': 507,
 'stop work': 1338,
 'work week': 1600,
 'command really': 236,
 'fun work': 510,
 'sad': 1176,
 'joke': 676,
 'worthless': 1608,
 'disappointed': 364,
 'wall': 1536,
 'socket': 1273,
 'time': 1406,
 'fault': 468,
 'disagree': 362,
 'make': 833,
 'company': 240,
 'technologically': 1375,
 'advance': 42,
 'sell': 1207,
 'product': 1047,
 'rechargeable': 1118,
 'battery': 115,
 'return': 1158,
 'apple': 69,
 'boom': 147,
 'speaker': 1308,
 'really disappointed': 1100,
 'sell product': 1208,
 'apple music': 70,
 'great': 546,
 'bass': 113,
 'idle': 632,
 'hot

In [28]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

1620

In [29]:
X_train[0,:] # prima recensione

<1x1620 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [30]:
print(X_train[0,:])

  (0, 1176)	1
  (0, 676)	1
  (0, 1608)	1


In [31]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['sad', 'joke', 'worthless'], dtype='<U25')]

In [32]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

sad 1
joke 1
worthless 1


In [35]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k="all")  # feature selection
# select = SelectKBest(chi2, k="all")  # feature selection for balanced with pos filter
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [36]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

1620

In [37]:
X_train_sel

<1962x1620 sparse matrix of type '<class 'numpy.int64'>'
	with 20731 stored elements in Compressed Sparse Row format>

In [38]:
print(X_test_sel[0,:])

  (0, 718)	1
  (0, 526)	1
  (0, 312)	1
  (0, 316)	1
  (0, 1288)	1
  (0, 473)	1
  (0, 546)	1
  (0, 1391)	1
  (0, 231)	1
  (0, 1126)	1
  (0, 170)	1
  (0, 1063)	1
  (0, 1575)	1
  (0, 1034)	1
  (0, 1193)	1
  (0, 100)	1
  (0, 907)	1
  (0, 707)	1
  (0, 1010)	1
  (0, 434)	1
  (0, 1605)	1
  (0, 551)	1
  (0, 1392)	1
  (0, 1513)	1


In [39]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['joke', 'sad', 'worthless'], dtype='<U25')]


In [40]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [41]:
print(X_train_vec[0,:])

  (0, 1608)	0.6151558380235335
  (0, 1176)	0.5990710069241998
  (0, 676)	0.5125399726932421


In [42]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['joke', 'sad', 'worthless'], dtype='<U25')]

In [43]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

joke 0.6152 1
sad 0.5991 1
worthless 0.5125 1


In [44]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [45]:
scores_df.sort_values(by = "tf_idf_score", ascending = False).head(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
16936,love,1.0,1
18521,nice,1.0,1
10772,expect,1.0,1
10763,expect,1.0,1
1533,cool,1.0,1
10722,love,1.0,1
10625,amazing,1.0,1
360,new,1.0,1
10550,like,1.0,1
5436,turn,1.0,1


In [46]:
X_train_vec

<1962x1620 sparse matrix of type '<class 'numpy.float64'>'
	with 20731 stored elements in Compressed Sparse Row format>

In [47]:
svm = LinearSVC(max_iter=3000, class_weight = "balanced")  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [48]:
len(predictions), sum(predictions)

(491, 381)

In [49]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.75      0.98      0.85        85
           1       0.99      0.93      0.96       406

    accuracy                           0.94       491
   macro avg       0.87      0.95      0.91       491
weighted avg       0.95      0.94      0.94       491



In [50]:
feature_names = cv.get_feature_names_out()
feature_names

array(["'", "' ll", "' t", ..., 'z-wave', 'zero', 'zigbee'], dtype=object)

In [51]:
print(select.scores_)

[22.43272826 12.57142857  4.33351371 ... 20.95238095  8.38095238
 20.95238095]


In [52]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

1620

In [53]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(133.93941455370026, True, 'try'),
 (120.09690177756738, True, 'love'),
 (83.37144935378632, True, 'device'),
 (83.00075757575756, True, 'month'),
 (76.10395160395161, True, 'idle'),
 (69.57646958304854, True, 'half'),
 (58.66666666666666, True, 'card'),
 (50.99570768214835, True, 'bulb'),
 (49.173100325174055, True, 'great'),
 (48.74493661100803, True, 'cycle')]

In [54]:
type(svm)

sklearn.svm._classes.LinearSVC

In [55]:
len(svm.coef_)

1

In [56]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

1583

In [57]:
feats_w_classifier_weight[-100:] #features positive

[(0.640708419859608, 'change'),
 (0.6425632602459987, 'color'),
 (0.6428390849530761, 'certain'),
 (0.6471725524509592, 'instal'),
 (0.6479579654477818, 'convenient'),
 (0.6488602831562793, 'helpful'),
 (0.6539153060584592, 'course'),
 (0.6552636356236344, 'life'),
 (0.6613436809110377, 'awhile'),
 (0.6711271546713687, 'purchase'),
 (0.6738901096836659, 'work good'),
 (0.6739352069735067, 'house'),
 (0.6746500671977227, 'look brand'),
 (0.6752087033420396, 'calendar'),
 (0.6772868972951396, 'soft'),
 (0.6832267620547295, 'really like'),
 (0.6839588099232883, 'love feature'),
 (0.686405922886453, 'process'),
 (0.6872126113180064, 'large'),
 (0.6879408256504717, 'home'),
 (0.6940962122878404, 'new'),
 (0.6953400066701746, 'kitchen'),
 (0.7060032468261738, 'watch'),
 (0.7075612361418846, 'quickly'),
 (0.7119334581516934, 'advertised'),
 (0.7144658135856864, 'internet'),
 (0.7179109076983993, 'sale'),
 (0.7181912197069082, 'daughter'),
 (0.7247942497267248, 'buy day'),
 (0.7256224649666144

In [58]:
feats_w_classifier_weight[:100] #features negative

[(-1.9698854981104332, 'return'),
 (-1.6544503942217057, 'honestly'),
 (-1.6536787869247669, 'terrible'),
 (-1.4654327056528855, 'send'),
 (-1.464799349229321, 'figure use'),
 (-1.3881048160686924, 'realize'),
 (-1.3723645913530405, 'party'),
 (-1.341018355536529, 'poor'),
 (-1.3409960508310612, 'try'),
 (-1.3355799273323608, 'unplug'),
 (-1.2655839057279332, 'device really'),
 (-1.2640751654841724, 'adapter'),
 (-1.2591510855746624, 'useless'),
 (-1.1957120564562735, 'excitement'),
 (-1.1866983659156463, 'pair'),
 (-1.185898684413376, 'money'),
 (-1.1691481899862346, 'stop work'),
 (-1.1628775125924262, 'sound terrible'),
 (-1.157317928386945, 'half'),
 (-1.1543458795445443, 'plug'),
 (-1.13978165767762, 'home good'),
 (-1.1383831228261623, 'hardly'),
 (-1.1164388922355089, 'firmware'),
 (-1.1064688043239008, 'worthless'),
 (-1.0977172263801733, 'dumb'),
 (-1.079952772606637, 'good thing'),
 (-1.0745292423732755, 'sell'),
 (-1.071832679881962, '.'),
 (-1.068686206699244, 'feature'),
 

In [59]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores", ascending = False).head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
1582,3.708933,love
1581,2.5197,great
1580,2.363729,easy
1579,1.729595,good
1578,1.542472,enjoy
1577,1.423122,fun
1576,1.309938,music
1575,1.203875,like
1574,1.187288,smart
1573,1.168767,best


In [60]:
stringa = "the product is disappointing, not good at all"
clean = tokenize_list_of_text([stringa])[0]
clean

total number of types extracted is: 5


['the product disappointing evil all']

In [61]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [62]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.48      0.98      0.65        42
           1       1.00      0.90      0.95       449

    accuracy                           0.91       491
   macro avg       0.74      0.94      0.80       491
weighted avg       0.95      0.91      0.92       491



In [63]:
len(Y_test)

491

In [64]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear', 'poly'],
              'class_weight':['balanced', None]
}

grid = GridSearchCV(SVC(), param_grid, refit ='f1_macro', scoring="f1_macro", cv=kfold, verbose=3)
grid.fit(X_train_vec, Y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.699 total time=   0.6s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.625 total time=   0.5s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.690 total time=   0.5s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.608 total time=   0.5s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.661 total time=   0.5s
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.784 total time=   0.4s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.768 total time=   0.4s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.784 total time=   0.4s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.807 total time=   0.4s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=lin

In [65]:
print(grid.best_params_)

{'C': 10, 'class_weight': None, 'gamma': 1, 'kernel': 'rbf'}


In [66]:
grid_predictions = grid.predict(X_test_vec)

print(metrics.classification_report(Y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.94      0.87      0.90        85
           1       0.97      0.99      0.98       406

    accuracy                           0.97       491
   macro avg       0.96      0.93      0.94       491
weighted avg       0.97      0.97      0.97       491

