In [135]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn import metrics
from sklearn.svm import LinearSVC
import string
from mi_helper import *

#fare custom vectorizer
#includere pos tagging per corretta lemmatizzazione di verbi

In [116]:
dataset = pd.read_csv("amazon_alexa.tsv", sep = "\t")
data_test = pd.read_csv("test_set.csv", sep = "\t")
data_test.drop("Unnamed: 0", axis = 1, inplace = True)
# dataset = pd.concat([data_test, dataset])
dataset.dropna(inplace=True)
dataset.drop(dataset[dataset.rating == 3].index, inplace=True) #droppa recensioni con 3-4 stelle
dataset.drop(dataset[dataset.rating == 4].index, inplace=True)
print(dataset["feedback"].value_counts())
dataset.info()

1    2286
0     257
Name: feedback, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2543 entries, 0 to 3148
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            2543 non-null   int64 
 1   date              2543 non-null   object
 2   variation         2543 non-null   object
 3   verified_reviews  2543 non-null   object
 4   feedback          2543 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 119.2+ KB


In [117]:
tokenizer = nltk.tokenize.TweetTokenizer()
lemmatizer = nltk.WordNetLemmatizer()
tokenized_reviews = []
for review in dataset["verified_reviews"]: #pulisce le recensioni
    clean_text = ""
    tokens = tokenizer.tokenize(review)
    clean_tokens = [w.lower() for w in tokens if w not in string.punctuation and len(w)>2]
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in clean_tokens]
    for t in lemmatized_tokens:
        clean_text += " " + t
    tokenized_reviews.append(clean_text)

dataset["verified_reviews"] = tokenized_reviews

In [118]:
# CREAZIONE DI UN SAMPLE DATASET BILANCIATO
# prende le prime n recensioni positive di lunghezza maggiore, dove n è il numero di recensioni negative
reviews_1 = list(dataset[dataset["feedback"] == 1]["verified_reviews"])
reviews_0 = list(dataset[dataset["feedback"] == 0]["verified_reviews"])
reviews_1.sort(key=len, reverse = True)
sample_1 = reviews_1[:len(reviews_0)]
verified_reviews_sample = []
feedback_sample = []
verified_reviews_sample.extend(sample_1)
verified_reviews_sample.extend(reviews_0)
feedback_sample.extend([1 for i in range(len(sample_1))])
feedback_sample.extend([0 for i in range(len(reviews_0))])


dataset = pd.DataFrame({"verified_reviews":verified_reviews_sample, "feedback": feedback_sample})
dataset["feedback"].value_counts()

1    257
0    257
Name: feedback, dtype: int64

In [119]:
##################################################################
# CALCOLO VETTORI COL MIO METODO
# per vedere il mio metodo guardare il file mi_helper.py
X_train, X_test, Y_train, Y_test = train_test_split(dataset["verified_reviews"], dataset["feedback"], test_size = 0.20, random_state=10)

data_dict = {0:{"text":""}, 1:{"text":""}}
for rev, feedback in zip(X_train, Y_train):
    data_dict[feedback]["text"]+= " " + rev

w_vect, bi_vect, tri_vect = vectors_creator(data_dict, normalize=True)
# print({k: v for k, v in sorted(tri_vect[1].items(), key=lambda item: item[1], reverse=True)})
predicted = predict(X_test, w_vect, bi_vect, tri_vect, True, False, True)
print(metrics.classification_report(Y_test, predicted))
#provare ad aggiungere un peso/bias per la lunghezza della recensione
#droppare le recensioni con 3-4 stelle, FATTO

              precision    recall  f1-score   support

           0       0.83      0.77      0.80        56
           1       0.75      0.81      0.78        47

    accuracy                           0.79       103
   macro avg       0.79      0.79      0.79       103
weighted avg       0.79      0.79      0.79       103



In [120]:
#####################################################################
# COUNT VECTOR - MULTI NOMIAL NAIVE BAYES
tokenizer = nltk.tokenize.TweetTokenizer()
cv = CountVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = cv.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
MNB = MultinomialNB(fit_prior=True)
MNB.fit(X_train, Y_train)
predicted = MNB.predict(X_test)
print(metrics.classification_report(Y_test, predicted))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        56
           1       0.91      0.91      0.91        47

    accuracy                           0.92       103
   macro avg       0.92      0.92      0.92       103
weighted avg       0.92      0.92      0.92       103



In [121]:
#####################################################################
# TFIDF - MULTI NOMIAL NAIVE BAYES
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = tfidf.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
predicted = MNB.predict(X_test)
print(metrics.classification_report(Y_test, predicted))


              precision    recall  f1-score   support

           0       0.93      0.66      0.77        56
           1       0.70      0.94      0.80        47

    accuracy                           0.79       103
   macro avg       0.81      0.80      0.79       103
weighted avg       0.82      0.79      0.78       103



In [122]:
#####################################################################
# TFIDF - SVM (support vector machine)

tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = tfidf.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
svc_tfid = LinearSVC(class_weight = "balanced", dual = True, max_iter=1000)
svc_tfid.fit(X_train, Y_train)
pred = svc_tfid.predict(X_test)
print(metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        56
           1       0.93      0.87      0.90        47

    accuracy                           0.91       103
   macro avg       0.92      0.91      0.91       103
weighted avg       0.91      0.91      0.91       103



In [123]:
#####################################################################
# COUNT VECTOR - SVM (support vector machine)
cv = CountVectorizer(stop_words="english", ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
text_counts = cv.fit_transform(dataset["verified_reviews"])
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset["feedback"], test_size = 0.20, random_state=10)
cv_tfid = LinearSVC(class_weight = "balanced")
cv_tfid.fit(X_train, Y_train)
pred = cv_tfid.predict(X_test)
print(metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89        56
           1       0.89      0.85      0.87        47

    accuracy                           0.88       103
   macro avg       0.88      0.88      0.88       103
weighted avg       0.88      0.88      0.88       103



In [136]:
vector = tfidf.transform(["bad product", "Good, love it"])
predicted = svc_tfid.predict(vector)
print(predicted)

[0 1]


In [137]:
predicted = predict(["amazing", "sounds terrible"], w_vect,bi_vect,tri_vect, True, False, True)
print(predicted)

[1, 0]
