### Il notebook implementa un classificatore con la seguente pipeline: countVectorizer, SelectKBest, Tf-Idf, SVM

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from nltk.corpus import wordnet
import re
from preprocessing import *
from numpy import array 
import numpy as np

In [2]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t", encoding = "utf-8")
print(dataset.shape)
dataset.dropna(inplace = True)
print(dataset.shape)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True)
print(dataset.shape)
dataset.drop_duplicates(subset = "verified_reviews", inplace = True)
print(dataset.shape)

(3150, 6)
(3150, 6)
(2998, 6)
(2196, 6)


In [3]:
X = np.array(dataset["verified_reviews"].values).reshape(-1, 1)
y = list(dataset["feedback"].values)

In [4]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state = 0)

X, y = undersampler.fit_resample(X, y)


print('Resampled dataset shape %s' % Counter(y))

Resampled dataset shape Counter({1: 412, 0: 206})


In [5]:
X_temp = []

for rev in X:
  X_temp.append(rev[0])

In [6]:
tokenized_reviews, sentences = tokenize_list_of_text(X_temp, custom_stopwords, False,2)

total number of types extracted is: 1788


In [7]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(tokenized_reviews) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, y, test_size = 0.20, random_state=10) #divisione in train e test

In [8]:
text_counts

<618x1224 sparse matrix of type '<class 'numpy.int64'>'
	with 6987 stored elements in Compressed Sparse Row format>

In [9]:
len(cv.vocabulary_)

1224

In [10]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'like': 520,
 'fact': 311,
 'answer': 44,
 'not_see': 680,
 'real': 823,
 'need': 638,
 'household': 452,
 'good': 377,
 'day': 208,
 'deal': 213,
 'sound': 968,
 'terrible': 1032,
 'want': 1153,
 'music': 624,
 'bose': 104,
 'sound terrible': 976,
 'good music': 383,
 'little': 538,
 'feature': 330,
 'stop': 999,
 'work': 1190,
 'week': 1173,
 'command': 160,
 'really': 826,
 'fun': 362,
 'stop work': 1001,
 'work week': 1208,
 'joke': 488,
 'worthless': 1213,
 'disappointed': 248,
 'plug': 756,
 'wall': 1152,
 'time': 1047,
 'fault': 327,
 'disagree': 246,
 'make': 595,
 'company': 163,
 'advance': 28,
 'sell': 910,
 'product': 785,
 'battery': 86,
 'return': 870,
 'apple': 52,
 'speaker': 980,
 'really disappointed': 827,
 'apple music': 53,
 'great': 393,
 'bass': 84,
 'idle': 459,
 'hot': 449,
 'miss': 614,
 'recognize': 837,
 'thing': 1037,
 'great sound': 411,
 'idle time': 461,
 'impressed': 466,
 'tightness': 1046,
 'stupid': 1008,
 'spotify': 989,
 'account': 14,
 'not_use':

In [11]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

1224

In [12]:
X_train[0,:] # prima recensione

<1x1224 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [13]:
print(X_train[0,:])

  (0, 520)	1
  (0, 1190)	1
  (0, 1173)	1
  (0, 106)	1
  (0, 550)	1
  (0, 556)	1
  (0, 845)	1
  (0, 644)	1
  (0, 735)	1
  (0, 1204)	1
  (0, 798)	1
  (0, 320)	1
  (0, 422)	2
  (0, 716)	1
  (0, 423)	1
  (0, 524)	1
  (0, 324)	1
  (0, 1174)	1


In [14]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['like', 'work', 'week', 'box', 'look', 'look like', 'refurbish',
        'new', 'perfectly', 'work perfectly', 'purchase', 'far', 'happy',
        'original', 'happy purchase', 'like new', 'far work', 'week far'],
       dtype='<U25')]

In [15]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

like 1
work 1
week 1
box 1
look 1
look like 1
refurbish 1
new 1
perfectly 1
work perfectly 1
purchase 1
far 1
happy 2
original 1
happy purchase 1
like new 1
far work 1
week far 1


In [16]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=800)  # feature selection
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [17]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

800

In [18]:
X_train_sel

<494x800 sparse matrix of type '<class 'numpy.int64'>'
	with 4270 stored elements in Compressed Sparse Row format>

In [19]:
print(X_test_sel[0,:])

  (0, 773)	1
  (0, 356)	1
  (0, 376)	1


In [20]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['box', 'far', 'far work', 'happy', 'happy purchase', 'like new',
       'look like', 'new', 'original', 'refurbish', 'week', 'week far',
       'work'], dtype='<U25')]


In [21]:
# guardare tf-idf
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [22]:
print(X_train_vec[0,:])

  (0, 773)	0.13215721023034152
  (0, 762)	0.3104364762375252
  (0, 761)	0.2545811907451864
  (0, 539)	0.22860996199910866
  (0, 456)	0.24922448598010277
  (0, 406)	0.1987259052528476
  (0, 351)	0.28446524749144747
  (0, 328)	0.27519571472618043
  (0, 264)	0.28446524749144747
  (0, 263)	0.4717708962714922
  (0, 201)	0.3104364762375252
  (0, 198)	0.1987259052528476
  (0, 72)	0.2673584364439322


In [23]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['box', 'far', 'far work', 'happy', 'happy purchase', 'like new',
        'look like', 'new', 'original', 'refurbish', 'week', 'week far',
        'work'], dtype='<U25')]

In [24]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

box 0.1322 1
far 0.3104 1
far work 0.2546 1
happy 0.2286 1
happy purchase 0.2492 1
like new 0.1987 1
look like 0.2845 1
new 0.2752 1


In [25]:
# cercati support vector machine
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [26]:
len(predictions), sum(predictions)

(124, 86)

In [27]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71        41
           1       0.85      0.88      0.86        83

    accuracy                           0.81       124
   macro avg       0.79      0.78      0.79       124
weighted avg       0.81      0.81      0.81       124



In [28]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

1224

In [29]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(26.1792791786454, True, 'great'),
 (22.24837695232832, True, 'send'),
 (22.24837695232832, True, 'month'),
 (18.399484203739526, True, 'bulb'),
 (13.95757575757576, True, 'actually'),
 (13.450001535107923, True, 'money'),
 (13.44703782400041, True, 'ask'),
 (13.265352056067726, True, 'buy'),
 (11.963636363636367, True, 'bridge'),
 (11.963636363636367, True, 'act')]

In [30]:
svm.coef_.shape

(1, 800)

In [31]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

786

In [32]:
feats_w_classifier_weight[-10:] #features positive

[(0.738025328365856, 'perfect'),
 (0.7524924027100869, 'figure'),
 (0.7695295916811415, 'lol'),
 (0.7718550212720972, 'best'),
 (0.8067264850001578, 'far'),
 (0.8772116417713494, 'good'),
 (0.9308675254347062, 'search'),
 (1.219153646736551, 'easy'),
 (1.3561212583317543, 'great'),
 (1.947915824613227, 'love')]

In [33]:
feats_w_classifier_weight[:10] #features negative

[(-1.6567074717899566, 'idle'),
 (-1.273770180316041, 'terrible'),
 (-1.1607271748038621, 'return'),
 (-1.1531572176571936, 'try'),
 (-1.1475952655456032, 'not_see'),
 (-1.0912370793678843, 'pay'),
 (-1.082249317364118, 'disconnect'),
 (-0.9931743270264118, 'difficult'),
 (-0.9919062667609163, 'useless'),
 (-0.9357413574576593, 'disappointed')]

In [34]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores", ascending = False).head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
785,1.947916,love
784,1.356121,great
783,1.219154,easy
782,0.930868,search
781,0.877212,good
780,0.806726,far
779,0.771855,best
778,0.76953,lol
777,0.752492,figure
776,0.738025,perfect


In [35]:
stringa = "Bad experience, it's too loud"
clean = tokenize_list_of_text([stringa])[0]
clean

total number of types extracted is: 4


['bad experience too loud']

In [36]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [37]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.46      0.61        41
           1       0.79      0.98      0.87        83

    accuracy                           0.81       124
   macro avg       0.85      0.72      0.74       124
weighted avg       0.83      0.81      0.79       124



In [38]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear', 'poly'],
              'class_weight':['balanced', None]
}

grid = GridSearchCV(SVC(), param_grid, refit ='f1_macro', scoring="f1_macro", cv=kfold, verbose=3)
grid.fit(X_train_vec,Y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.739 total time=   0.0s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.910 total time=   0.0s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.716 total time=   0.0s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=rbf;, score=0.363 total time=   0.0s
[CV 1/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.687 total time=   0.0s
[CV 2/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.677 total time=   0.0s
[CV 3/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.744 total time=   0.0s
[CV 4/5] END C=0.1, class_weight=balanced, gamma=1, kernel=linear;, score=0.646 total time=   0.0s
[CV 5/5] END C=0.1, class_weight=balanced, gamma=1, kernel=lin

In [39]:
print(grid.best_params_)

{'C': 1, 'class_weight': None, 'gamma': 1, 'kernel': 'linear'}


In [40]:
grid_predictions = grid.predict(X_test_vec)

print(metrics.classification_report(Y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71        41
           1       0.85      0.88      0.86        83

    accuracy                           0.81       124
   macro avg       0.79      0.78      0.79       124
weighted avg       0.81      0.81      0.81       124



In [41]:
rep_dict = metrics.classification_report(Y_test, grid_predictions, output_dict=True)

print(pd.DataFrame(rep_dict).T.to_latex(bold_rows = True, float_format="%.2f" ))

\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &  f1-score &  support \\
\midrule
\textbf{0           } &       0.74 &    0.68 &      0.71 &    41.00 \\
\textbf{1           } &       0.85 &    0.88 &      0.86 &    83.00 \\
\textbf{accuracy    } &       0.81 &    0.81 &      0.81 &     0.81 \\
\textbf{macro avg   } &       0.79 &    0.78 &      0.79 &   124.00 \\
\textbf{weighted avg} &       0.81 &    0.81 &      0.81 &   124.00 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame(rep_dict).T.to_latex(bold_rows = True, float_format="%.2f" ))
