In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import json

import os


In [3]:
from sklearn.model_selection import train_test_split

REVIEWS_LIMIT = 100000 #300000

def load_rows(filepath, nrows = None, func = None) -> pd.DataFrame :
    with open(filepath) as json_file:
        count = 0
        objs = []
        line = json_file.readline()
        while (nrows is None or count < nrows) and line:
            count += 1
            obj = json.loads(line)
            if func != None :
                func(obj)
            objs.append(obj)
            line = json_file.readline()
        return pd.DataFrame(objs)
    
# Aggiunge la classe della recensione
def add_sentiment(obj) :
    if (obj["stars"] <= 3):
        obj["label"] = 0
    else:
        obj["label"] = 1
        
reviews = load_rows('input/yelp_academic_dataset_review.json', REVIEWS_LIMIT, add_sentiment)
print('Review objects loaded. Count = {}'.format(reviews.shape[0]))

reviews['text_length'] = reviews['text'].apply(lambda x:len(x.split()))

# 80% train, 20% test
reviews_train, reviews_test = train_test_split(reviews, test_size = 0.2)

# Solo text, label
reviews_train = reviews_train[['text', 'label']]
reviews_test = reviews_test[['text', 'label']]
display(reviews_train.head(2))
display(reviews_test.head(2))

#with pd.option_context('display.max_colwidth', None):
#  display(reviews_train)

Review objects loaded. Count = 100000


Unnamed: 0,text,label
31154,Yummy! And pretty authenic. I'm here from San ...,1
64633,"I've been coming here for years, Donna the man...",1


Unnamed: 0,text,label
27441,Received dead flower on Mother's Day. \nI know...,0
8478,This place has the best patio ever! And the be...,1


In [4]:
from tqdm import tqdm
import re
import copy
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

pd.options.mode.chained_assignment = None  # default='warn'

def contractions(sent):
    sent = re.sub(r"ain't", "am not", sent)
    sent = re.sub(r"aren't", "are not", sent)
    sent = re.sub(r"can't", "can not", sent)
    sent = re.sub(r"can't've", "can not have", sent)
    sent = re.sub(r"'cause", "because", sent)
    sent = re.sub(r"could've", "could have", sent)
    sent = re.sub(r"couldn't", "could not", sent)
    sent = re.sub(r"couldn't've", "could not have", sent)
    sent = re.sub(r"doesn't", "does not", sent)
    sent = re.sub(r"hadn't", "had not", sent)
    sent = re.sub(r"hadn't've", "had not have", sent)
    sent = re.sub(r"hasn't", "has not", sent)
    sent = re.sub(r"haven't", "have not", sent)
    sent = re.sub(r"he'd", "he had", sent)
    sent = re.sub(r"he'd've", "he would have", sent)
    sent = re.sub(r"he'll", "he will", sent)
    sent = re.sub(r"he'll've", "he will have", sent)
    sent = re.sub(r"he's", "he has", sent)
    sent = re.sub(r"how'd", "how did", sent)
    sent = re.sub(r"how'd'y", "how do you", sent)
    sent = re.sub(r"how'll", "how will", sent)
    sent = re.sub(r"how's", "how has", sent)
    sent = re.sub(r"i'd", "i had", sent)
    sent = re.sub(r"i'd've", "i would have", sent)
    sent = re.sub(r"i'll", "i shall", sent)
    sent = re.sub(r"i'll've", "i shall have", sent)
    sent = re.sub(r"i'm", "i am", sent)
    sent = re.sub(r"i've", "i have", sent)
    sent = re.sub(r"isn't", "is not", sent)
    sent = re.sub(r"it'd", "it had", sent)
    sent = re.sub(r"it'd've", "it would have", sent)
    sent = re.sub(r"it'll", "it shall", sent)
    sent = re.sub(r"it'll've", "it shall have", sent)
    sent = re.sub(r"it's", "it is", sent)
    sent = re.sub(r"let's", "let us", sent)
    sent = re.sub(r"ma'am", "madam", sent)
    sent = re.sub(r"mayn't", "may not", sent)
    sent = re.sub(r"might've", "might have", sent)
    sent = re.sub(r"mightn't", "might not", sent)
    sent = re.sub(r"mightn't've", "might not have", sent)
    sent = re.sub(r"must've", "must have", sent)
    sent = re.sub(r"mustn't", "must not", sent)
    sent = re.sub(r"mustn't've", "must not have", sent)
    sent = re.sub(r"needn't", "need not", sent)
    sent = re.sub(r"needn't've", "need not have", sent)
    sent = re.sub(r"o'clock", "of the clock", sent)
    sent = re.sub(r"oughtn't", "ought not", sent)
    sent = re.sub(r"oughtn't've", "ought not have", sent)
    sent = re.sub(r"shan't", "shall not", sent)
    sent = re.sub(r"sha'n't", "shall not", sent)
    sent = re.sub(r"shan't've", "shall not have", sent)
    sent = re.sub(r"she'd", "she had", sent)
    sent = re.sub(r"she'd've", "she would have", sent)
    sent = re.sub(r"she'll", "she shall", sent)
    sent = re.sub(r"she'll've", "she shall have", sent)
    sent = re.sub(r"she's", "she has", sent)
    sent = re.sub(r"should've", "should have", sent)
    sent = re.sub(r"shouldn't", "should not", sent)
    sent = re.sub(r"shouldn't've", "should not have", sent)
    sent = re.sub(r"so've", "so have", sent)
    sent = re.sub(r"so's", "so as", sent)
    sent = re.sub(r"that'd", "that would", sent)
    sent = re.sub(r"that'd've", "that would have", sent)
    sent = re.sub(r"that's", "that has", sent)
    sent = re.sub(r"there'd", "there had", sent)
    sent = re.sub(r"there'd've", "there would have", sent)
    sent = re.sub(r"there's", "there has", sent)
    sent = re.sub(r"they'd", "they had", sent)
    sent = re.sub(r"they'd've", "they would have", sent)
    sent = re.sub(r"they'll", "they shall", sent)
    sent = re.sub(r"they'll've", "they shall have", sent)
    sent = re.sub(r"they're", "they are", sent)
    sent = re.sub(r"they've", "they have", sent)
    sent = re.sub(r"to've", "to have", sent)
    sent = re.sub(r"wasn't", "was not", sent)
    sent = re.sub(r"we'd", "we had", sent)
    sent = re.sub(r"we'd've", "we would have", sent)
    sent = re.sub(r"we'll", "we will", sent)
    sent = re.sub(r"we'll've", "we will have", sent)
    sent = re.sub(r"we're", "we are", sent)
    sent = re.sub(r"we've", "we have", sent)
    sent = re.sub(r"weren't", "were not", sent)
    sent = re.sub(r"what'll", "what shall", sent)
    sent = re.sub(r"what'll've", "what shall have", sent)
    sent = re.sub(r"what're", "what are", sent)
    sent = re.sub(r"what's", "what has", sent)
    sent = re.sub(r"what've", "what have", sent)
    sent = re.sub(r"when's", "when has", sent)
    sent = re.sub(r"when've", "when have", sent)
    sent = re.sub(r"where'd", "where did", sent)
    sent = re.sub(r"where's", "where has", sent)
    sent = re.sub(r"where've", "where have", sent)
    sent = re.sub(r"who'll", "who shall", sent)
    sent = re.sub(r"who'll've", "who shall have", sent)
    sent = re.sub(r"who's", "who has", sent)
    sent = re.sub(r"who've", "who have", sent)
    sent = re.sub(r"why's", "why has", sent)
    sent = re.sub(r"why've", "why have", sent)
    sent = re.sub(r"will've", "will have", sent)
    sent = re.sub(r"won't", "will not", sent)
    sent = re.sub(r"won't've", "will not have", sent)
    sent = re.sub(r"would've", "would have", sent)
    sent = re.sub(r"wouldn't", "would not", sent)
    sent = re.sub(r"wouldn't've", "would not have", sent)
    sent = re.sub(r"y'all", "you all", sent)
    sent = re.sub(r"y'all'd", "you all would", sent)
    sent = re.sub(r"y'all'd've", "you all would have", sent)
    sent = re.sub(r"y'all're", "you all are", sent)
    sent = re.sub(r"y'all've", "you all have", sent)
    sent = re.sub(r"you'd", "you had", sent)
    sent = re.sub(r"you'd've", "you would have", sent)
    sent = re.sub(r"you'll", "you shall", sent)
    sent = re.sub(r"you'll've", "you shall have", sent)
    sent = re.sub(r"how's", "how has", sent)
    sent = re.sub(r"you're", "you are", sent)
    sent = re.sub(r"you've", "you have", sent)
    sent = re.sub(r"didn't", "did not", sent)
    sent = re.sub(r"don't", "do not", sent)
    sent = re.sub(r"'","",sent)
    sent = re.sub(r". . .","",sent)
    return(sent)

## Function for removing unwanted text
def processing(data_1):
 
    for index, row in tqdm(data_1.iterrows()):
        stri = ""
## Code to remove digit with word pattern
        cle = re.sub(r'([\d]+[a-zA-Z]+)|([a-zA-Z]+[\d]+)', "", row["text"])
## Code to remove only digit patter
        cle = re.sub(r"(^|\s)(\-?\d+(?:\.\d)*|\d+|[\d]+[A-Za-z]+)"," ", cle.lower())
## Code to remove every symbols except characters
        cle = re.sub('[^A-Za-z\']+', " ", cle)
## Code for concatinating strings
        stri = stri + cle
## Code for calling contraction function
        stri = contractions(stri)
        data_1["text"][index] = stri
    return(data_1)

## Function for stopwords removal and lemitizing the word
def lema_stopw(data_l):
    var2 = copy.deepcopy(data_l)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) - set(['no', 'not'])
    for index, row in tqdm(var2.iterrows()):
        sent = ''
        for e in row["text"].split():
            if e not in stop_words:
                e = lemmatizer.lemmatize(e, pos ="a")
                sent = ' '.join([sent,e])
        var2["text"][index] = sent
    return(var2)

reviews_train = processing(reviews_train)
reviews_test = processing(reviews_test)
reviews_train.head(5)

80000it [01:50, 722.42it/s]
20000it [00:13, 1481.69it/s]


Unnamed: 0,text,label
31154,yummy and pretty authenim here from san diego ...,1
64633,i have been coming here for years donna the ma...,1
15280,i went last night witriend of mine and did the...,0
36766,took my parents here while they were in town b...,0
73242,came witmall group right at sunday opening tim...,0


In [5]:
# Text vectorization
# Bigram Counts
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects


bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(reviews_train['text'].values)


X_train_bigram = bigram_vectorizer.transform(reviews_train['text'].values)
X_test_bigram = bigram_vectorizer.transform(reviews_test['text'].values)

# Bigram Tf-Idf

bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)

X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)
y_train = reviews_train['label'].values

X_test_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_test_bigram)
y_test = reviews_test['label'].values

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform
import time

# loss, learning rate, initial learning rate, penalty and alpha

params = {
    "n_neighbors" : [3, 5, 10],
    "weights" : ["uniform", "distance"]
}

grid_search_knn_bigram = GridSearchCV(
    estimator= KNeighborsClassifier(n_jobs=-1),
    cv=5,
    param_grid = params,
    scoring='f1',
    verbose=1,
    return_train_score = True
)

grid_search_knn_tf_idf = GridSearchCV(
    estimator= KNeighborsClassifier(n_jobs=-1),
    cv=5,
    param_grid = params,
    verbose=1,
    scoring="f1",
    return_train_score = True
)

start_time = time.time()
grid_search_knn_bigram.fit(X_train_bigram, y_train)
print("--- Ended in %s minutes ---" % ((time.time() - start_time)/60))

start_time = time.time()
grid_search_knn_tf_idf.fit(X_train_bigram_tf_idf, y_train)
print("--- Ended in %s minutes ---" % ((time.time() - start_time)/60))

print(f'\nKNN Bigram')
print(f'Best params: {grid_search_knn_bigram.best_params_}')
print(f'Best score: {grid_search_knn_bigram.best_score_}')

print(f'\nKNN Bigram Tf-Idf')
print(f'Best params: {grid_search_knn_tf_idf.best_params_}')
print(f'Best score: {grid_search_knn_tf_idf.best_score_}')

knn_classifier_bigram = grid_search_knn_bigram.best_estimator_
knn_classifier_tf_idf = grid_search_knn_tf_idf.best_estimator_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
--- Ended in 71.47292774120966 minutes ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits
--- Ended in 71.19827625751495 minutes ---

KNN Bigram
Best params: {'n_neighbors': 3, 'weights': 'distance'}
Best score: 0.8014944130871037

KNN Bigram Tf-Idf
Best params: {'n_neighbors': 3, 'weights': 'distance'}
Best score: 0.6460750009333186


In [7]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def evaluate(y_test,y_pred):
    print("Precision Score of the model:", precision_score(y_test,y_pred)*100)
    print("Recall Score of the model:", recall_score(y_test,y_pred)*100)
    print("Acuracy score of the model:",accuracy_score(y_test,y_pred)*100)
    print("F1 score of the model:",f1_score(y_test,y_pred)*100)
    
def set_labels(cf_matrix):
    group_names = ["True Neg","False Pos","False Neg","True Pos"]
    group_counts = ["{0:0.0f}".format(value) for value in
                    cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
              zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    return labels

In [9]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import plot_precision_recall_curve
import seaborn as sn

print("KNN Bigram")
y_pred = knn_classifier_bigram.predict(X_test_bigram)

print(classification_report(y_test, y_pred))

evaluate(y_test,y_pred)

KNN Bigram
              precision    recall  f1-score   support

           0       0.63      0.33      0.44      6435
           1       0.74      0.91      0.82     13565

    accuracy                           0.72     20000
   macro avg       0.69      0.62      0.63     20000
weighted avg       0.71      0.72      0.69     20000

Precision Score of the model: 74.18887950790013
Recall Score of the model: 90.68927386656837
Acuracy score of the model: 72.285
F1 score of the model: 81.6134275383952
