In [102]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')

from mi_helper import *

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

from numpy import array 

In [219]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
print(dataset["feedback"].value_counts())
print(dataset.shape)
# data_test = pd.read_csv("test_set.csv", sep = "\t")
# data_test.drop("Unnamed: 0", axis = 1, inplace = True)
# dataset = pd.concat([data_test, dataset])
# dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3-4 stelle
dataset.drop(dataset[dataset.rating == 4].index, inplace=True) #droppa recensioni con 3-4 stelle
print(dataset.shape)
print(dataset["feedback"].value_counts())
dataset.info()

1    2893
0     257
Name: feedback, dtype: int64
(3150, 5)
(2543, 5)
1    2286
0     257
Name: feedback, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2543 entries, 0 to 3148
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            2543 non-null   int64 
 1   date              2543 non-null   object
 2   variation         2543 non-null   object
 3   verified_reviews  2543 non-null   object
 4   feedback          2543 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 119.2+ KB


In [220]:
# levare i numeri
def get_wordnet_pos(treebank_tag):
    """
    return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
    """
    if treebank_tag.startswith('J'):
        return "a"
    elif treebank_tag.startswith('V'):
        return "v"
    elif treebank_tag.startswith('N'):
        return "n"
    elif treebank_tag.startswith('R'):
        return "r"
    else:
        return "n"
        
custom_stopwords = ["echo", "alexa", "dot", "amazon", "prime", "2nd", "generation", "1st", "3rd", "4th", "5th", "google", "netflix", "youtube", "philip", "tp-link"]
pos_list = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

def tokenize_list_of_text(list_of_text, custom_stopwords = [], pos_filter = False, pos_list = []):
    """Tokenizza tutte le recensioni, pulisce da stopwords, elimina token <= 2 caratteri e lemmatizza. Ritorna la colonna delle recensioni."""

    tokenizer = nltk.tokenize.TweetTokenizer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokenized_reviews = []
    sent_tokenized_reviews = []
    for review in list_of_text: #pulisce le recensioni
        clean_text = ""
        tokens = tokenizer.tokenize(review)
        tokens = [w.lower() for w in tokens]
        tokens_pos = pos_tag(tokens)
        lemmatized_tokens = [(lemmatizer.lemmatize(w, get_wordnet_pos(pos)), pos) for w, pos in tokens_pos]
        if pos_filter:
            clean_tokens = [(w, pos) for w, pos in lemmatized_tokens if w not in string.punctuation and len(w)>3 and w not in stopwords.words("english") and w not in custom_stopwords and pos in pos_list]
        else:
            clean_tokens = [(w, pos) for w, pos in lemmatized_tokens if w not in string.punctuation and len(w)>3 and w not in stopwords.words("english") and w not in custom_stopwords]
        sent_tokenized_reviews.append([w for w, pos in clean_tokens])
        for w, pos in clean_tokens:
            clean_text += " " + w
        tokenized_reviews.append(clean_text)
    
    n_tokens = []
    for sent in sent_tokenized_reviews:
        for w in sent:
            n_tokens.append(w)
    print("total number of tokens extracted are:", len(set(n_tokens)))
    return tokenized_reviews,  sent_tokenized_reviews # ritorna una tupla!

In [221]:
tokenized_reviews, sentences = tokenize_list_of_text(dataset["verified_reviews"], custom_stopwords, pos_filter=True, pos_list = pos_list)

dataset["verified_reviews"] = tokenized_reviews

total number of tokens extracted are: 1369


In [222]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# prende le prime n recensioni positive di lunghezza maggiore, dove n è il numero di recensioni negative
def create_balanced_dataset(dataset):
    """Bilancia il dataset uniformando il numero di recensioni negative e positive. Prende in input il dataset"""
    reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
    reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
    reviews_1.sort(key=len, reverse = True)
    sample_1 = reviews_1[:len(reviews_0)]
    verified_reviews_sample = []
    feedback_sample = []
    verified_reviews_sample.extend(sample_1)
    verified_reviews_sample.extend(reviews_0)
    feedback_sample.extend([1 for i in range(len(sample_1))])
    feedback_sample.extend([0 for i in range(len(reviews_0))])
    dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
    print(dataset["feedback"].value_counts())
    return dataset


dataset = create_balanced_dataset(dataset)

1    257
0    257
Name: feedback, dtype: int64


In [223]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(dataset["verified_reviews"]) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10) #divisione in train e test

In [224]:
text_counts

<514x1995 sparse matrix of type '<class 'numpy.int64'>'
	with 7195 stored elements in Compressed Sparse Row format>

In [225]:
len(cv.vocabulary_)

1995

In [226]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'figure': 519,
 'honestly': 708,
 'really': 1357,
 'want': 1884,
 'like': 873,
 'personally': 1180,
 'kinda': 802,
 'love': 957,
 'order': 1143,
 'look': 943,
 'come': 218,
 'watch': 1909,
 'bother': 149,
 'hear': 674,
 'pretty': 1282,
 'favorite': 509,
 'hook': 709,
 'pick': 1186,
 'know': 807,
 'easy': 415,
 'interact': 781,
 'navigate': 1090,
 'multiple': 1085,
 'rotate': 1455,
 'play': 1193,
 'display': 395,
 'listen': 884,
 'clear': 209,
 'enjoy': 439,
 'able': 0,
 'tell': 1730,
 'great': 611,
 'download': 398,
 'different': 352,
 'best': 125,
 'arrive': 77,
 'current': 315,
 'program': 1296,
 'work': 1945,
 'actually': 26,
 'prefer': 1276,
 'light': 867,
 'deliver': 345,
 'recommend': 1407,
 'exactly': 475,
 "can't": 165,
 'command': 234,
 'blue': 141,
 'make': 1023,
 'ring': 1454,
 'yellow': 1993,
 'update': 1845,
 'integrate': 776,
 'smart': 1565,
 'instead': 774,
 'rent': 1424,
 'purchase': 1302,
 'nest': 1114,
 'really want': 1391,
 'want like': 1891,
 'love love': 988,
 're

In [228]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

1995

In [229]:
X_train[0,:] # prima recensione

<1x1995 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [230]:
print(X_train[0,:])

  (0, 1884)	1
  (0, 957)	1
  (0, 611)	1
  (0, 1938)	1
  (0, 685)	1
  (0, 651)	1
  (0, 465)	1
  (0, 933)	1
  (0, 1597)	1
  (0, 175)	1
  (0, 27)	1
  (0, 1829)	1
  (0, 1014)	1
  (0, 1941)	1
  (0, 1598)	1
  (0, 176)	1
  (0, 1894)	1
  (0, 934)	1
  (0, 614)	1
  (0, 30)	1
  (0, 1830)	1
  (0, 688)	1
  (0, 654)	1
  (0, 1015)	1
  (0, 1942)	1
  (0, 1599)	1
  (0, 177)	1
  (0, 1895)	1
  (0, 935)	1
  (0, 615)	1
  (0, 31)	1
  (0, 1831)	1
  (0, 689)	1


In [231]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['want', 'love', 'great', 'wish', 'help', 'happy', 'especially',
        'lock', 'sooner', 'certainly', 'additional', 'unexpected',
        'love wish', 'wish sooner', 'sooner certainly', 'certainly want',
        'want lock', 'lock great', 'great additional',
        'additional unexpected', 'unexpected help', 'help happy',
        'happy especially', 'love wish sooner', 'wish sooner certainly',
        'sooner certainly want', 'certainly want lock', 'want lock great',
        'lock great additional', 'great additional unexpected',
        'additional unexpected help', 'unexpected help happy',
        'help happy especially'], dtype='<U34')]

In [232]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

want 1
love 1
great 1
wish 1
help 1
happy 1
especially 1
lock 1
sooner 1
certainly 1
additional 1
unexpected 1
love wish 1
wish sooner 1
sooner certainly 1
certainly want 1
want lock 1
lock great 1
great additional 1
additional unexpected 1
unexpected help 1
help happy 1
happy especially 1
love wish sooner 1
wish sooner certainly 1
sooner certainly want 1
certainly want lock 1
want lock great 1
lock great additional 1
great additional unexpected 1
additional unexpected help 1
unexpected help happy 1
help happy especially 1


In [233]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=1995)  # feature selection
# select = SelectKBest(chi2, k="all")  # feature selection for balanced with pos filter
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [234]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

1995

In [235]:
X_train_sel

<411x1995 sparse matrix of type '<class 'numpy.int64'>'
	with 5878 stored elements in Compressed Sparse Row format>

In [236]:
print(X_test_sel[0,:])

  (0, 611)	1
  (0, 1565)	1
  (0, 1092)	1
  (0, 1154)	1
  (0, 1854)	1
  (0, 1580)	1
  (0, 1836)	1


In [237]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['additional', 'additional unexpected',
       'additional unexpected help', 'certainly', 'certainly want',
       'certainly want lock', 'especially', 'great', 'great additional',
       'great additional unexpected', 'happy', 'happy especially', 'help',
       'help happy', 'help happy especially', 'lock', 'lock great',
       'lock great additional', 'love', 'love wish', 'love wish sooner',
       'sooner', 'sooner certainly', 'sooner certainly want',
       'unexpected', 'unexpected help', 'unexpected help happy', 'want',
       'want lock', 'want lock great', 'wish', 'wish sooner',
       'wish sooner certainly'], dtype='<U34')]


In [238]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [239]:
print(X_train_vec[0,:])

  (0, 1942)	0.1886531046056112
  (0, 1941)	0.1886531046056112
  (0, 1938)	0.1395836063098342
  (0, 1895)	0.1886531046056112
  (0, 1894)	0.1886531046056112
  (0, 1884)	0.09778508311323401
  (0, 1831)	0.1886531046056112
  (0, 1830)	0.1886531046056112
  (0, 1829)	0.1886531046056112
  (0, 1599)	0.1886531046056112
  (0, 1598)	0.1886531046056112
  (0, 1597)	0.1886531046056112
  (0, 1015)	0.1886531046056112
  (0, 1014)	0.1886531046056112
  (0, 957)	0.07420972727030196
  (0, 935)	0.1886531046056112
  (0, 934)	0.1886531046056112
  (0, 933)	0.1665735214389854
  (0, 689)	0.1886531046056112
  (0, 688)	0.17238121147827667
  (0, 685)	0.12666787812580407
  (0, 654)	0.1886531046056112
  (0, 651)	0.12241435510573381
  (0, 615)	0.1886531046056112
  (0, 614)	0.1886531046056112
  (0, 611)	0.08971169517938503
  (0, 465)	0.15030162831165086
  (0, 177)	0.1886531046056112
  (0, 176)	0.1886531046056112
  (0, 175)	0.1886531046056112
  (0, 31)	0.1886531046056112
  (0, 30)	0.1886531046056112
  (0, 27)	0.166573521

In [240]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['additional', 'additional unexpected',
        'additional unexpected help', 'certainly', 'certainly want',
        'certainly want lock', 'especially', 'great', 'great additional',
        'great additional unexpected', 'happy', 'happy especially', 'help',
        'help happy', 'help happy especially', 'lock', 'lock great',
        'lock great additional', 'love', 'love wish', 'love wish sooner',
        'sooner', 'sooner certainly', 'sooner certainly want',
        'unexpected', 'unexpected help', 'unexpected help happy', 'want',
        'want lock', 'want lock great', 'wish', 'wish sooner',
        'wish sooner certainly'], dtype='<U34')]

In [241]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[1,:]))[0],X_train_vec[1,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

come 0.3632 1
come smart 0.1883 1
connect 0.2688 1
connect come 0.2014 1
connect come smart 0.3632 1
easy 0.1727 1
figure 0.2645 1
great 0.2308 1
online 0.3456 1
smart 0.3456 1
wake 0.2014 1
want 0.3319 1
want wake 0.2126 1


In [242]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [243]:
scores_df.sort_values(by = "tf_idf_score", ascending = False).head(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
2287,return,1.0,1
2421,turn,1.0,1
1410,return,1.0,1
947,order,1.0,1
4643,work,1.0,1
2190,good,1.0,1
2238,know,1.0,1
5096,replace,1.0,1
693,refurbished,1.0,1
2637,work,1.0,1


In [244]:
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [245]:
len(predictions), sum(predictions)

(103, 54)

In [246]:
predictions

array([0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

In [247]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.79      0.84        56
           1       0.78      0.89      0.83        47

    accuracy                           0.83       103
   macro avg       0.84      0.84      0.83       103
weighted avg       0.84      0.83      0.84       103



In [248]:
feature_names = cv.get_feature_names_out()
feature_names

array(['able', 'able download', 'able download instal', ..., 'yell',
       'yellow', 'young'], dtype=object)

In [249]:
print(select.scores_)

[7.11900596 1.91428571 1.91428571 ... 1.04477612 1.91428571 2.87142857]


In [250]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

1995

In [251]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(102.02754615977861, True, 'love'),
 (44.22324613864474, True, 'great'),
 (25.040254472976724, True, 'play'),
 (19.111680170575696, True, 'purchase'),
 (16.271428571428572, True, 'best'),
 (10.012089552238805, True, 'good'),
 (8.538272921108742, True, 'return'),
 (7.9049218194740565, True, 'right'),
 (7.738848614072493, True, 'send'),
 (7.657142857142857, True, 'love play')]

In [252]:
type(svm)

sklearn.svm._classes.LinearSVC

In [253]:
svm.coef_

array([[ 0.01782997,  0.18932864,  0.18932864, ..., -0.10543122,
         0.49871133,  0.72890507]])

In [254]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

1964

In [255]:
feats_w_classifier_weight[-100:] #features positive

[(0.4347389801315153, 'compare'),
 (0.43573582692603147, 'personal'),
 (0.4357631062428481, 'important'),
 (0.43616750113609537, 'amazing'),
 (0.43707434312750676, 'learn tell'),
 (0.4375403078551234, 'provide'),
 (0.44097461208489513, 'connect make'),
 (0.44233596401244735, 'definitely'),
 (0.4436174899600302, 'clean'),
 (0.4486155802057041, 'flash'),
 (0.4488176986026104, 'ring'),
 (0.45111189516733957, 'maintain'),
 (0.45356822667380586, 'tell work'),
 (0.4555931708365171, 'loud'),
 (0.4588809003546769, 'away'),
 (0.45907977237863634, 'satisfy'),
 (0.4617787897109439, 'interested'),
 (0.4655978490392751, 'previously'),
 (0.4662685799315204, 'extremely'),
 (0.46681338819722146, 'song'),
 (0.46718002160718136, 'cool'),
 (0.46886685594569866, 'basic'),
 (0.4725804959528253, 'answer think'),
 (0.4788402270197439, 'plan'),
 (0.47918523807828245, 'certified'),
 (0.47918523807828245, 'certified refurbished'),
 (0.48809371812137653, 'lose'),
 (0.49127594854297296, 'feel'),
 (0.4924898017785

In [256]:
feats_w_classifier_weight[:100] #features negative

[(-1.06719006283224, 'return'),
 (-0.9820147714803191, 'stop'),
 (-0.819964799929493, 'send'),
 (-0.7484952015781708, 'disappointed'),
 (-0.7411285380874648, 'spend'),
 (-0.7385150470511701, 'terrible'),
 (-0.7182199034804666, 'useless'),
 (-0.6644405541741791, 'awful'),
 (-0.6319647622154615, 'unable'),
 (-0.5749851708049162, 'poor'),
 (-0.5705007628380434, 'scroll'),
 (-0.5532139646176081, 'realize'),
 (-0.5296088720217701, 'honestly'),
 (-0.5181438243910282, "can't"),
 (-0.500923919160537, 'attempt'),
 (-0.4999482224895788, 'register'),
 (-0.491203188485386, 'want listen'),
 (-0.49108208948182863, 'purchase work'),
 (-0.4881783865869327, 'occasionally'),
 (-0.4876745259406816, 'unplug'),
 (-0.48170570432997883, 'sonos'),
 (-0.4695132071181086, 'actually'),
 (-0.46682945063835984, 'scar'),
 (-0.45938488135431454, 'communicate'),
 (-0.4585470750900001, 'operate'),
 (-0.43542191772604555, 'half'),
 (-0.4324918056293536, 'stick'),
 (-0.4255769616538777, 'simultaneously'),
 (-0.424049661

In [257]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores", ascending = False).head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
1963,2.90671,love
1962,1.811655,smart
1961,1.630785,great
1960,1.510358,enjoy
1959,1.444977,learn
1958,1.41911,play
1957,1.258304,purchase
1956,1.160715,easy
1955,1.150958,little
1954,1.1421,really


In [216]:
stringa = "the product is disappointing, audio sounds bad"
clean = tokenize_list_of_text([stringa])[0]
clean

total number of tokens extracted are: 4


[' product disappointing audio sound']

In [258]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[0]


In [259]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.64      0.95      0.77        38
           1       0.96      0.69      0.80        65

    accuracy                           0.79       103
   macro avg       0.80      0.82      0.78       103
weighted avg       0.84      0.79      0.79       103

