In [342]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from nltk import pos_tag
from nltk.corpus import stopwords
# nltk.download('stopwords')

from mi_helper import *

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

from numpy import array 

In [343]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
# data_test = pd.read_csv("test_set.csv", sep = "\t")
# data_test.drop("Unnamed: 0", axis = 1, inplace = True)
# dataset = pd.concat([data_test, dataset])
# dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3-4 stelle
dataset.drop(dataset[dataset.rating == 4].index, inplace=True) #droppa recensioni con 3-4 stelle
print(dataset["feedback"].value_counts())
dataset.info()

1    2893
0     257
Name: feedback, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [344]:
# levare i numeri

def get_wordnet_pos(treebank_tag):
    """
    return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
    """
    if treebank_tag.startswith('J'):
        return "a"
    elif treebank_tag.startswith('V'):
        return "v"
    elif treebank_tag.startswith('N'):
        return "n"
    elif treebank_tag.startswith('R'):
        return "r"
    else:
        return "n"
        
custom_stopwords = ["echo", "alexa", "dot", "amzon", "prime", "2nd", "generation", "1st", "3rd", "4th", "5th"]
pos_list = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

def tokenize_list_of_text(list_of_text, custom_stopwords = [], pos_filter = False, pos_list = []):
    """Tokenizza tutte le recensioni, pulisce da stopwords, elimina token <= 2 caratteri e lemmatizza. Ritorna la colonna delle recensioni."""

    tokenizer = nltk.tokenize.TweetTokenizer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokenized_reviews = []
    sent_tokenized_reviews = []
    for review in list_of_text: #pulisce le recensioni
        clean_text = ""
        tokens = tokenizer.tokenize(review)
        tokens = [w.lower() for w in tokens]
        tokens_pos = pos_tag(tokens)
        if pos_filter:
            clean_tokens = [(w, get_wordnet_pos(pos)) for w, pos in tokens_pos if w not in string.punctuation and len(w)>2 and w not in stopwords.words("english") and w not in custom_stopwords and pos in pos_list]
        else:
            clean_tokens = [(w, get_wordnet_pos(pos)) for w, pos in tokens_pos if w not in string.punctuation and len(w)>2 and w not in stopwords.words("english") and w not in custom_stopwords]
        lemmatized_tokens = [lemmatizer.lemmatize(w, pos) for w, pos in clean_tokens]
        sent_tokenized_reviews.append(lemmatized_tokens)
        for t in lemmatized_tokens:
            clean_text += " " + t
        tokenized_reviews.append(clean_text)

    return tokenized_reviews,  sent_tokenized_reviews # ritorna una tupla!

In [345]:
tokenized_reviews, sentences = tokenize_list_of_text(dataset["verified_reviews"], custom_stopwords, pos_filter=False, pos_list = pos_list)

dataset["verified_reviews"] = tokenized_reviews

In [292]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# prende le prime n recensioni positive di lunghezza maggiore, dove n è il numero di recensioni negative
def create_balanced_dataset(dataset):
    """Bilancia il dataset uniformando il numero di recensioni negative e positive. Prende in input il dataset"""
    reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
    reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
    reviews_1.sort(key=len, reverse = True)
    sample_1 = reviews_1[:len(reviews_0)]
    verified_reviews_sample = []
    feedback_sample = []
    verified_reviews_sample.extend(sample_1)
    verified_reviews_sample.extend(reviews_0)
    feedback_sample.extend([1 for i in range(len(sample_1))])
    feedback_sample.extend([0 for i in range(len(reviews_0))])
    dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
    print(dataset["feedback"].value_counts())
    return dataset


dataset = create_balanced_dataset(dataset)

1    257
0    257
Name: feedback, dtype: int64


In [346]:
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 3), tokenizer=tokenizer.tokenize, min_df = 2) #count == frequenza
text_counts = cv.fit_transform(dataset["verified_reviews"]) 
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10) #divisione in train e test

In [347]:
text_counts

<3150x12318 sparse matrix of type '<class 'numpy.int64'>'
	with 55085 stored elements in Compressed Sparse Row format>

In [348]:
len(cv.vocabulary_)

12318

In [349]:
cv.vocabulary_ # A mapping of terms to feature indices.

{'love': 6019,
 'play': 7563,
 'game': 3706,
 'answer': 570,
 'question': 8282,
 'correctly': 2100,
 'say': 8974,
 'wrong': 12275,
 'like': 5479,
 'able': 104,
 'turn': 11091,
 'light': 5417,
 'away': 860,
 'home': 4417,
 'play game': 7586,
 'answer question': 585,
 'wrong answer': 12276,
 'answer like': 582,
 'like able': 5483,
 'able turn': 146,
 'turn light': 11102,
 'light away': 5423,
 'away home': 861,
 'able turn light': 147,
 'lot': 5970,
 'fun': 3607,
 'thing': 10604,
 'old': 7176,
 'learns': 5381,
 'control': 2005,
 'nice': 7063,
 'sound': 9693,
 'music': 6647,
 'lot fun': 5979,
 'fun thing': 3657,
 'control light': 2021,
 'game like': 3709,
 'nice sound': 7089,
 'sound play': 9769,
 'play music': 7612,
 'play game like': 7587,
 'sound play music': 9771,
 'receive': 8540,
 'gift': 3756,
 'need': 6913,
 'bluetooth': 1173,
 'easily': 2750,
 'accessible': 184,
 'smart': 9550,
 'speaker': 9885,
 'wait': 11678,
 'receive gift': 8541,
 'gift need': 3772,
 'need bluetooth': 6919,
 '

In [350]:
#get_feature_names_out([input_features])
#Get output feature names for transformation.

len(cv.get_feature_names_out())

12318

In [351]:
X_train[0,:] # prima recensione

<1x12318 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [352]:
print(X_train[0,:])

  (0, 6019)	1
  (0, 9693)	1
  (0, 3797)	2
  (0, 8193)	1
  (0, 11585)	1
  (0, 9723)	1
  (0, 8222)	1
  (0, 2132)	1
  (0, 6287)	1
  (0, 11603)	1
  (0, 11604)	1


In [353]:
cv.inverse_transform(X_train[0,:]) #0 ==> prima recensione

[array(['love', 'sound', 'good', 'quality', 'video', 'sound good',
        'quality good', 'course', 'love video', 'video quality',
        'video quality good'], dtype='<U35')]

In [354]:
for feat,freq in zip(cv.inverse_transform(X_train[0,:])[0],X_train[0,:].data):
    print(feat,freq)

love 1
sound 1
good 2
quality 1
video 1
sound good 1
quality good 1
course 1
love video 1
video quality 1
video quality good 1


In [355]:
#For classification we'll set 'chi2'  method as a scoring function. The target number of features is defined by k parameter
select = SelectKBest(chi2, k=5000)  # feature selection
# select = SelectKBest(chi2, k="all")  # feature selection for balanced with pos filter
select.fit(X_train,Y_train)
X_train_sel = select.transform(X_train)
X_test_sel = select.transform(X_test)

In [357]:
#We've selected 3 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list. The z object contains selected x data
filter = select.get_support() #filtra le parole selezionando quelle contenenti nei k 5000
sum(filter)

5000

In [358]:
X_train_sel

<2520x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 30244 stored elements in Compressed Sparse Row format>

In [359]:
print(X_test_sel[0,:])

  (0, 1437)	1
  (0, 1737)	1
  (0, 923)	1
  (0, 1063)	1
  (0, 2420)	1
  (0, 1244)	1
  (0, 1500)	1
  (0, 823)	1
  (0, 868)	1
  (0, 2860)	1
  (0, 2052)	2
  (0, 2907)	1
  (0, 479)	1
  (0, 3725)	1
  (0, 94)	1
  (0, 1504)	1
  (0, 4482)	1
  (0, 927)	1
  (0, 2885)	1
  (0, 4796)	1
  (0, 3622)	1
  (0, 1867)	1
  (0, 4440)	1
  (0, 222)	1
  (0, 226)	1
  (0, 1126)	1
  (0, 320)	1
  (0, 890)	1
  (0, 208)	1
  (0, 1020)	1
  (0, 2629)	1
  (0, 1485)	1


In [360]:
print(cv.inverse_transform(select.inverse_transform(X_train_sel[0,:]))) #applichi l'inverse_transform sui vettori che poi hai utilizzato con il Select

[array(['course', 'love', 'love video', 'sound', 'sound good', 'video',
       'video quality', 'video quality good'], dtype='<U35')]


In [361]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [362]:
print(X_train_vec[0,:])

  (0, 4493)	0.45708456288459687
  (0, 4492)	0.44008191767976584
  (0, 4482)	0.2709195306437948
  (0, 3094)	0.3619632313010742
  (0, 3067)	0.18393192613156412
  (0, 1628)	0.42689364146774217
  (0, 1563)	0.1327576969504811
  (0, 594)	0.3991153899940164


In [363]:
cv.inverse_transform(select.inverse_transform(X_train_vec[0,:]))

[array(['course', 'love', 'love video', 'sound', 'sound good', 'video',
        'video quality', 'video quality good'], dtype='<U35')]

In [364]:
for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[1,:]))[0],X_train_vec[1,:].data,X_train_sel[1,:].data):
    print(feat,round(weight,4),freq)

amazing 0.3522 1
daughter 0.4401 1
daughter love 0.5838 1
game 0.4208 1
love 0.4053 2


In [365]:
# create a dataframe with words, tf_idf score and freq
total_scores = []

for i in range(X_train_vec.shape[0]):
    for feat,weight,freq in zip(cv.inverse_transform(select.inverse_transform(X_train_vec[i,:]))[0],X_train_vec[i,:].data,X_train_sel[i,:].data):
        total_scores.append((feat,round(weight,4),freq))

words = []
tf_idf_score = []
freq = []
for i in range(len(total_scores)):
    words.append(total_scores[i][0])
    tf_idf_score.append(total_scores[i][1])
    freq.append(total_scores[i][2])

scores_df = pd.DataFrame()
scores_df["words"] = words
scores_df["tf_idf_score"] = tf_idf_score
scores_df["freq"] = freq

In [366]:
scores_df.sort_values(by = "tf_idf_score", ascending = False).head(50) # sort and show the df

Unnamed: 0,words,tf_idf_score,freq
6793,youtube,1.0,1
11800,entertainment,1.0,1
12738,love,1.0,1
8909,love,1.0,1
8910,perfect,1.0,1
8134,like,1.0,1
27521,amazing,1.0,1
28451,problem,1.0,1
8911,love,1.0,1
23930,excellent,1.0,1


In [367]:
svm = LinearSVC()  # linear svm with default parameters
svm_clf = svm.fit(X_train_vec,Y_train)
predictions = svm_clf.predict(X_test_vec)

In [368]:
len(predictions), sum(predictions)

(630, 611)

In [369]:
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [370]:
print(metrics.classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.33      0.48        52
           1       0.94      1.00      0.97       578

    accuracy                           0.94       630
   macro avg       0.92      0.66      0.72       630
weighted avg       0.94      0.94      0.93       630



In [371]:
feature_names = cv.get_feature_names_out()
feature_names

array(['. .', '...', '... add', ..., 'zero', 'zigbee', 'zigbee hub'],
      dtype=object)

In [372]:
print(select.scores_)

[ 4.69061792 18.52439179  0.08855292 ...  4.69061792 34.55429595
  4.69061792]


In [374]:
feature_names = cv.get_feature_names_out()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(filter,select.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score, reverse = True)
len(feats_w_score)

12318

In [375]:
feats_w_score[:10] # this contains k score, wether it is a selcted feature and the word

[(67.4287520413001, True, 'cycle'),
 (56.3750197545172, True, 'act'),
 (28.143707527788017, True, 'bridge'),
 (22.585365853658537, True, 'everytime ask question'),
 (22.585365853658537, True, 'everytime ask'),
 (22.585365853658537, True, 'everytime'),
 (22.585365853658537, True, 'error know product'),
 (22.585365853658537, True, 'error know'),
 (22.585365853658537, True, 'enter password'),
 (22.585365853658537, True, 'buy bluetooth speaker')]

In [376]:
type(svm)

sklearn.svm._classes.LinearSVC

In [377]:
svm.coef_

array([[-0.27658184,  0.20624674, -0.04842905, ..., -0.11954606,
        -0.467057  , -0.14676024]])

In [378]:
feats_w_classifier_weight = list()
for index,weight in enumerate(select.inverse_transform(svm.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

4479

In [379]:
feats_w_classifier_weight[-100:] #features positive

[(0.36768544754427557, 'life'),
 (0.3686058146422546, 'expectation'),
 (0.37049162394038077, 'grandkids'),
 (0.37223125352824155, 'perfectly'),
 (0.372629514969948, 'version'),
 (0.3740040995805361, 'light bulb'),
 (0.37608121856221716, 'quickly'),
 (0.37628725969619403, 'download app'),
 (0.37670001893466626, 'look brand'),
 (0.3768356018541839, 'tech'),
 (0.3819508434650999, 'inconvenient'),
 (0.3831712503317487, 'tell turn'),
 (0.38341006036149106, 'lamp'),
 (0.3842095822435753, 'worth price'),
 (0.38685290913995224, 'thought'),
 (0.3869246890119576, 'brand'),
 (0.38830778229341584, 'second kitchen'),
 (0.38875207551263646, 'small'),
 (0.39128315731043667, 'turn light'),
 (0.3923572771757467, 'handy'),
 (0.3930226850192461, 'nice product'),
 (0.39534686106129685, 'access'),
 (0.3956289983149196, 'tech support'),
 (0.3956418406984318, 'wake word'),
 (0.4000072285388685, 'expect'),
 (0.4007153778045342, 'work good'),
 (0.4007868119701239, 'useful'),
 (0.40110682324408997, 'dislike'),


In [380]:
feats_w_classifier_weight[:100] #features negative

[(-1.7741937248918018, 'return'),
 (-1.4424777419462906, 'poor'),
 (-1.3417548327696027, 'work time'),
 (-1.2999010221098144, 'send'),
 (-1.23121698079114, 'figure use'),
 (-1.1813550904027847, 'volume low'),
 (-1.1689909624247934, 'load'),
 (-1.1309140537314726, 'realize'),
 (-1.1193256679165757, 'try'),
 (-1.1070041059606472, 'disconnect'),
 (-1.1017801822644788, 'dont'),
 (-1.0905649246483127, 'horrible'),
 (-1.0603151393003387, 'slow'),
 (-1.0569166986435654, 'dumb'),
 (-1.0409203511203582, 'stop'),
 (-1.0338965588481126, 'refurbished'),
 (-1.0235605319738474, 'useless'),
 (-0.9809176988559936, 'buy know'),
 (-0.979507803148406, 'intrusive'),
 (-0.9677965745495639, 'stop work'),
 (-0.9630962545979463, 'speak'),
 (-0.9478472750502197, 'pair'),
 (-0.931581948233383, 'firmware'),
 (-0.9280853642275707, 'roku work'),
 (-0.9143519232516567, 'excitement'),
 (-0.9093860983156559, 'party'),
 (-0.8955651626162723, 'longer work'),
 (-0.8826510772791266, 'feature'),
 (-0.875995911910808, 'spa

In [386]:
df_scores = pd.DataFrame()
scores = []
words = []
for entry in feats_w_classifier_weight:
    scores.append(entry[0])
    words.append(entry[1])

df_scores["scores"] = scores
df_scores["words"] = words

df_scores.sort_values(by = "scores").head(15) # use tail for seeing the positive words

Unnamed: 0,scores,words
0,-1.774194,return
1,-1.442478,poor
2,-1.341755,work time
3,-1.299901,send
4,-1.231217,figure use
5,-1.181355,volume low
6,-1.168991,load
7,-1.130914,realize
8,-1.119326,try
9,-1.107004,disconnect


In [387]:
stringa = "the product is disappointing, audio sounds bad"
clean = tokenize_list_of_text([stringa])[0]
clean

[' product disappointing audio sound bad']

In [383]:
vector = cv.transform(clean)
vector = select.transform(vector)
vector = tfidf.transform(vector)
predicted = svm.predict(vector)
print(predicted)

[1]


In [384]:
MNB = MultinomialNB()  # MNB with default parameters
MNB_clsf = MNB.fit(X_train_vec,Y_train)
predictions = MNB_clsf.predict(X_test_vec)
print(metrics.classification_report(predictions, Y_test))

              precision    recall  f1-score   support

           0       0.08      1.00      0.14         4
           1       1.00      0.92      0.96       626

    accuracy                           0.92       630
   macro avg       0.54      0.96      0.55       630
weighted avg       0.99      0.92      0.95       630

