In [226]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from nltk.corpus import stopwords
# nltk.download('stopwords')

from mi_helper import *

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

from numpy import array 

In [227]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
# data_test = pd.read_csv("test_set.csv", sep = "\t")
# data_test.drop("Unnamed: 0", axis = 1, inplace = True)
# dataset = pd.concat([data_test, dataset])
# dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3-4 stelle
dataset.drop(dataset[dataset.rating == 4].index, inplace=True) #droppa recensioni con 3-4 stelle
print(dataset["feedback"].value_counts())
dataset.info()

1    2286
0     257
Name: feedback, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2543 entries, 0 to 3148
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            2543 non-null   int64 
 1   date              2543 non-null   object
 2   variation         2543 non-null   object
 3   verified_reviews  2543 non-null   object
 4   feedback          2543 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 119.2+ KB


In [228]:
# levare i numeri
# disambiguare la lemmatizzazione con pos

custom_stopwords = ["echo", "alexa", "dot", "amzon", "prime", "2nd", "generation", "1st", "3rd", "4th"]

def tokenize_list_of_text(list_of_text, custom_stopwords = []):
    """Tokenizza tutte le recensioni, pulisce da stopwords, elimina token <= 2 caratteri e lemmatizza. Ritorna la colonna delle recensioni."""

    tokenizer = nltk.tokenize.TweetTokenizer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokenized_reviews = []
    for review in list_of_text: #pulisce le recensioni
        clean_text = ""
        tokens = tokenizer.tokenize(review)
        tokens = [w.lower() for w in tokens]
        clean_tokens = [w for w in tokens if w not in string.punctuation and len(w)>2 and w not in stopwords.words("english") and w not in custom_stopwords]
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in clean_tokens]
        for t in lemmatized_tokens:
            clean_text += " " + t
        tokenized_reviews.append(clean_text)

    return tokenized_reviews

In [229]:
new_text = tokenize_list_of_text(dataset["verified_reviews"], custom_stopwords)

dataset["verified_reviews"] = new_text

In [230]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# prende le prime n recensioni positive di lunghezza maggiore, dove n è il numero di recensioni negative
def create_balanced_dataset(dataset):
    """Bilancia il dataset uniformando il numero di recensioni negative e positive. Prende in input il dataset"""
    reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
    reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
    reviews_1.sort(key=len, reverse = True)
    sample_1 = reviews_1[:len(reviews_0)]
    verified_reviews_sample = []
    feedback_sample = []
    verified_reviews_sample.extend(sample_1)
    verified_reviews_sample.extend(reviews_0)
    feedback_sample.extend([1 for i in range(len(sample_1))])
    feedback_sample.extend([0 for i in range(len(reviews_0))])
    dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
    print(dataset["feedback"].value_counts())
    return dataset


dataset = create_balanced_dataset(dataset)

1    257
0    257
Name: feedback, dtype: int64


In [231]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(dataset["verified_reviews"]) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10) #divisione in train e test

In [232]:
text_counts

<514x5075 sparse matrix of type '<class 'numpy.int64'>'
	with 18147 stored elements in Compressed Sparse Row format>

In [233]:
len(cv.vocabulary_)

5075

In [234]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'piece': 3097,
 'technology': 4322,
 'right': 3642,
 'center': 676,
 'living': 2424,
 'room': 3658,
 'island': 2109,
 'kitchen': 2174,
 'counter': 923,
 'mic': 2608,
 'speaker': 4069,
 'direction': 1141,
 'quality': 3378,
 'sound': 4010,
 'quite': 3436,
 'good': 1633,
 'connected': 852,
 'bluetooth': 490,
 'soundbar': 4056,
 '360': 24,
 'audiophile': 339,
 'equipment': 1300,
 'range': 3464,
 'decent': 1026,
 'bass': 396,
 'indoor': 2036,
 'entertaining': 1294,
 'loud': 2482,
 'bother': 528,
 'building': 570,
 'work': 4988,
 'great': 1702,
 'volume': 4793,
 'device': 1062,
 'imagine': 2007,
 'button': 586,
 'large': 2219,
 'precise': 3259,
 'recommend': 3554,
 'regular': 3569,
 'look': 2454,
 'weight': 4928,
 'material': 2592,
 'order': 2959,
 'came': 642,
 'free': 1511,
 'philip': 3069,
 'hue': 1949,
 'bulb': 577,
 'installed': 2061,
 'extra': 1363,
 'bought': 532,
 'floor': 1491,
 'lamp': 2207,
 'turned': 4609,
 'light': 2276,
 'say': 3712,
 'connect': 835,
 'default': 1033,
 'second

In [235]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

5075

In [236]:
X_train[0,:] # prima recensione

<1x5075 sparse matrix of type '<class 'numpy.int64'>'
	with 82 stored elements in Compressed Sparse Row format>

In [237]:
print(X_train[0,:])

  (0, 3658)	1
  (0, 4069)	1
  (0, 3712)	1
  (0, 2701)	1
  (0, 2488)	3
  (0, 4779)	1
  (0, 3175)	1
  (0, 2378)	1
  (0, 33)	1
  (0, 3178)	1
  (0, 3271)	1
  (0, 4709)	1
  (0, 3345)	1
  (0, 352)	1
  (0, 119)	1
  (0, 1236)	1
  (0, 2667)	1
  (0, 609)	1
  (0, 4170)	1
  (0, 2269)	1
  (0, 4804)	1
  (0, 689)	1
  (0, 3788)	1
  (0, 1689)	1
  (0, 1820)	1
  :	:
  (0, 691)	1
  (0, 3790)	1
  (0, 45)	1
  (0, 4174)	1
  (0, 2681)	1
  (0, 4806)	1
  (0, 130)	1
  (0, 3181)	1
  (0, 2732)	1
  (0, 1693)	1
  (0, 4719)	1
  (0, 4787)	1
  (0, 3670)	1
  (0, 2391)	1
  (0, 3723)	1
  (0, 2498)	1
  (0, 1240)	1
  (0, 1822)	1
  (0, 3282)	1
  (0, 4119)	1
  (0, 4632)	1
  (0, 613)	1
  (0, 919)	1
  (0, 1839)	1
  (0, 2526)	1


In [238]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['room', 'speaker', 'say', 'music', 'love', 'voice', 'playing',
        'listen', 'able', 'playing music', 'price', 'used', 'purchase',
        'awesome', 'alarm', 'echo', 'morning', 'buying', 'start', 'life',
        'wake', 'changer', 'seriously', 'gotten', 'hesitate',
        'unbelievable', 'cordless', 'holder', 'awesome life',
        'life changer', 'changer seriously', 'seriously able',
        'able start', 'start morning', 'morning wake', 'wake alarm',
        'alarm playing', 'music gotten', 'gotten used', 'used voice',
        'voice room', 'room listen', 'listen say', 'say love', 'love echo',
        'echo hesitate', 'hesitate price', 'price speaker',
        'speaker unbelievable', 'unbelievable buying', 'buying cordless',
        'cordless holder', 'holder love', 'love purchase', 'purchase love',
        'awesome life changer', 'life changer seriously',
        'changer seriously able', 'seriously able start',
        'able start morning', 'start morning wake', 'mo

In [239]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

room 1
speaker 1
say 1
music 1
love 3
voice 1
playing 1
listen 1
able 1
playing music 1
price 1
used 1
purchase 1
awesome 1
alarm 1
echo 1
morning 1
buying 1
start 1
life 1
wake 1
changer 1
seriously 1
gotten 1
hesitate 1
unbelievable 1
cordless 1
holder 1
awesome life 1
life changer 1
changer seriously 1
seriously able 1
able start 1
start morning 1
morning wake 1
wake alarm 1
alarm playing 1
music gotten 1
gotten used 1
used voice 1
voice room 1
room listen 1
listen say 1
say love 1
love echo 1
echo hesitate 1
hesitate price 1
price speaker 1
speaker unbelievable 1
unbelievable buying 1
buying cordless 1
cordless holder 1
holder love 1
love purchase 1
purchase love 1
awesome life changer 1
life changer seriously 1
changer seriously able 1
seriously able start 1
able start morning 1
start morning wake 1
morning wake alarm 1
wake alarm playing 1
alarm playing music 1
playing music gotten 1
music gotten used 1
gotten used voice 1
used voice room 1
voice room listen 1
room listen say 1
l

In [240]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=5000)  # feature selection
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [241]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

5000

In [242]:
X_train_sel

<411x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 14491 stored elements in Compressed Sparse Row format>

In [243]:
print(X_test_sel[0,:])

  (0, 1655)	1
  (0, 1382)	1
  (0, 1791)	1
  (0, 3867)	1
  (0, 2744)	1
  (0, 3876)	1
  (0, 2924)	1
  (0, 3247)	1
  (0, 4651)	1
  (0, 1674)	1
  (0, 4572)	1
  (0, 4576)	1
  (0, 3269)	1


In [244]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['able', 'able start', 'able start morning', 'alarm',
       'alarm playing', 'alarm playing music', 'awesome', 'awesome life',
       'awesome life changer', 'buying', 'buying cordless',
       'buying cordless holder', 'changer', 'changer seriously',
       'changer seriously able', 'cordless', 'cordless holder',
       'cordless holder love', 'echo', 'echo hesitate',
       'echo hesitate price', 'gotten', 'gotten used',
       'gotten used voice', 'hesitate', 'hesitate price',
       'hesitate price speaker', 'holder', 'holder love',
       'holder love purchase', 'life', 'life changer',
       'life changer seriously', 'listen', 'listen say',
       'listen say love', 'love', 'love echo', 'love echo hesitate',
       'love purchase', 'love purchase love', 'morning', 'morning wake',
       'morning wake alarm', 'music', 'music gotten', 'music gotten used',
       'playing', 'playing music', 'playing music gotten', 'price',
       'price speaker', 'price speaker unbelievable'

In [245]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [246]:
print(X_train_vec[0,:])

  (0, 4732)	0.11859142672166798
  (0, 4731)	0.11859142672166798
  (0, 4730)	0.08507142084540026
  (0, 4713)	0.11859142672166798
  (0, 4712)	0.11859142672166798
  (0, 4705)	0.07386557342494168
  (0, 4645)	0.11859142672166798
  (0, 4644)	0.11859142672166798
  (0, 4635)	0.07780453581820634
  (0, 4558)	0.11859142672166798
  (0, 4557)	0.11859142672166798
  (0, 4556)	0.11859142672166798
  (0, 4100)	0.11859142672166798
  (0, 4099)	0.11859142672166798
  (0, 4096)	0.08922922891720263
  (0, 4045)	0.11859142672166798
  (0, 4044)	0.11859142672166798
  (0, 3995)	0.05794775430725784
  (0, 3718)	0.11859142672166798
  (0, 3717)	0.11859142672166798
  (0, 3716)	0.11859142672166798
  (0, 3651)	0.11859142672166798
  (0, 3650)	0.11859142672166798
  (0, 3642)	0.0761348874494738
  (0, 3601)	0.11859142672166798
  :	:
  (0, 1771)	0.11859142672166798
  (0, 1646)	0.11859142672166798
  (0, 1645)	0.11859142672166798
  (0, 1642)	0.11283082904198284
  (0, 1198)	0.11859142672166798
  (0, 1197)	0.11859142672166798
  (

In [247]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['able', 'able start', 'able start morning', 'alarm',
        'alarm playing', 'alarm playing music', 'awesome', 'awesome life',
        'awesome life changer', 'buying', 'buying cordless',
        'buying cordless holder', 'changer', 'changer seriously',
        'changer seriously able', 'cordless', 'cordless holder',
        'cordless holder love', 'echo', 'echo hesitate',
        'echo hesitate price', 'gotten', 'gotten used',
        'gotten used voice', 'hesitate', 'hesitate price',
        'hesitate price speaker', 'holder', 'holder love',
        'holder love purchase', 'life', 'life changer',
        'life changer seriously', 'listen', 'listen say',
        'listen say love', 'love', 'love echo', 'love echo hesitate',
        'love purchase', 'love purchase love', 'morning', 'morning wake',
        'morning wake alarm', 'music', 'music gotten', 'music gotten used',
        'playing', 'playing music', 'playing music gotten', 'price',
        'price speaker', 'price speake

In [248]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[1,:]))[0],X_train_vec[1,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

able 0.1252 2
able download 0.1252 1
able download installing 0.0998 1
able install 0.0796 1
able install son 0.1252 1
app 0.1252 1
app able 0.2289 1
app able download 0.1955 1
download 0.1252 1
download installing 0.1252 1
download installing invasive 0.1252 1
driver 0.1252 1
driver wife 0.1252 1
driver wife retired 0.1045 1
friendly 0.1252 2
friendly i'd 0.1252 1
friendly i'd hoped 0.0684 1
friendly set-up 0.1252 1
friendly set-up user 0.1252 1
hoped 0.1252 1
hoped app 0.1252 1
hoped app able 0.1252 1
i'd 0.0653 1
i'd hoped 0.1252 1
i'd hoped app 0.1252 1
install 0.1252 1
install son 0.1252 1
install son supplying 0.0788 1
installing 0.1252 1
installing invasive 0.1252 1
installing invasive driver 0.1192 1
invasive 0.1252 1
invasive driver 0.1252 1
invasive driver wife 0.1252 1
phone 0.1252 1
phone using 0.1252 1
retired 0.1106 1
retired able 0.1252 1
retired able install 0.1252 1
set 0.1073 1
set user 0.1252 1
set user friendly 0.1252 1
set-up 0.1252 1
set-up user 0.1252 1
set-up us

In [249]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [250]:
scores_df.sort_values(by = "tf_idf_score", ascending = True).tail(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
6539,work,0.5858,1
3339,son,0.5875,1
3824,fast,0.5893,1
3826,middle,0.5893,1
6773,refurbished,0.5914,1
6228,volume low,0.5936,1
5082,volume low,0.5936,1
6388,talking,0.596,2
8069,hardly,0.5962,1
5373,device,0.5985,1


In [251]:
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [252]:
len(predictions), sum(predictions)

(103, 48)

In [253]:
predictions

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [273]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.75      0.85        56
           1       0.77      0.98      0.86        47

    accuracy                           0.85       103
   macro avg       0.87      0.86      0.85       103
weighted avg       0.88      0.85      0.85       103



In [255]:
feature_names = cv.get_feature_names_out()
feature_names

array(['...', '... amazing', '... amazing device', ..., 'z-wave', 'zero',
       'zigbee'], dtype=object)

In [256]:
print(select.scores_)

[0.67899578 0.95714286 0.95714286 ... 2.08955224 2.08955224 4.17910448]


In [257]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

5075

In [258]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(45.221253978554415, True, 'music'),
 (29.498207509667157, True, 'smart'),
 (23.028639963448065, True, 'song'),
 (22.971428571428568, True, 'house'),
 (22.78724946695096, True, 'alarm'),
 (20.44349937061679, True, 'play'),
 (17.448760957119163, True, 'new'),
 (16.37628185602599, True, 'playing'),
 (14.881700194678778, True, 'fun'),
 (14.484643923240938, True, 'good')]

In [259]:
type(svm)

sklearn.svm._classes.LinearSVC

In [260]:
svm.coef_

array([[-0.07769381,  0.05787592,  0.05787592, ..., -0.16633377,
        -0.25752722, -0.25190052]])

In [261]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

4993

In [267]:
feats_w_classifier_weight[-100:] #features positive

[(0.4136543288937374, 'definitely'),
 (0.41497019559998366, 'download'),
 (0.4151701645974032, 'external'),
 (0.42338270218401913, 'year'),
 (0.4274632508940891, 'display'),
 (0.4281113058014978, 'thing'),
 (0.43222683357750447, 'searching'),
 (0.43435891754970246, 'yard'),
 (0.43467133709201977, 'mother'),
 (0.43530546877746185, 'purchase'),
 (0.43748430710097325, 'kitchen'),
 (0.43921887241730917, 'philip'),
 (0.44344239774825167, 'wake'),
 (0.44709950917887237, 'life'),
 (0.4493288397216961, 'playing'),
 (0.4502020486356119, 'feel'),
 (0.4503713061560362, 'wonderful'),
 (0.4545807404550905, 'small'),
 (0.456560555639325, 'spotify'),
 (0.456824649201488, 'list'),
 (0.4569173304822794, 'use'),
 (0.45695512122742704, 'stick'),
 (0.4622418400077822, 'planning'),
 (0.47115629991485536, 'easy set'),
 (0.47357781129173626, 'friend'),
 (0.47451362399670904, 'perfectly'),
 (0.47876892821406664, 'better expected'),
 (0.48423385756400367, 'different'),
 (0.4854672628119097, 'learn'),
 (0.48820

In [263]:
feats_w_classifier_weight[:100] #features negative

[(-0.8504831255854596, 'return'),
 (-0.7615176375166367, 'poor'),
 (-0.750322002765968, 'disappointed'),
 (-0.7472588039203535, 'money'),
 (-0.6829986665969333, 'fix'),
 (-0.6765298714793548, 'try'),
 (-0.6107986089177687, 'terrible'),
 (-0.5821514236611528, 'tried'),
 (-0.5805228728349402, 'useless'),
 (-0.5649185489378937, 'sent'),
 (-0.5543091952921766, 'working'),
 (-0.539044821144899, 'support'),
 (-0.5207499187686023, 'fan'),
 (-0.5188928261624475, 'sonos'),
 (-0.49671545844081455, 'sound terrible'),
 (-0.49428562346505206, 'pair'),
 (-0.48572879568114025, 'unable'),
 (-0.4742019742825234, 'honestly'),
 (-0.47276498256302746, 'allow'),
 (-0.4596937882924174, 'wanted use'),
 (-0.45953734908266114, 'respond'),
 (-0.459317802301629, 'awful'),
 (-0.4590545521830671, 'month'),
 (-0.4578833488938699, 'half'),
 (-0.4525666025025385, 'waste'),
 (-0.44730837351285735, 'sound like'),
 (-0.42558266099497816, 'device tell'),
 (-0.4235802203451097, 'basically'),
 (-0.41897443001119883, "can't

In [270]:
stringa = "the product is disappointing"
clean = tokenize_list_of_text([stringa])
clean

[' product disappointing']

In [271]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [275]:
mnb = MultinomialNB()  # Naive Bayes with default parameters
MNB_clsf = mnb.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.75      0.98      0.85        43
           1       0.98      0.77      0.86        60

    accuracy                           0.85       103
   macro avg       0.86      0.87      0.85       103
weighted avg       0.88      0.85      0.86       103

