In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [3]:
# Read the pickled training data from lemm_train_df.pkl (path is relative to
# the notebook's working directory).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('lemm_train_df.pkl', 'rb') as pkl_handle:
    data = pickle.load(pkl_handle)

# Wrap the unpickled object in a DataFrame, then work on an independent copy
# (`df`) so that `hotelreviews` stays exactly as loaded.
hotelreviews = pd.DataFrame(data)
df = hotelreviews.copy()

In [4]:
# Preview the first two rows of the working frame.
df.head(2)

Unnamed: 0,Review,Label
610301,can it think of anything negative to say about...,1
328118,small room the bathroom be open to the room wh...,0


In [5]:
# Show the review stored under index label 0 together with its Python type.
# `.loc` makes the label-based lookup explicit: plain `series[0]` is ambiguous
# between "label 0" and "first row", and label 0 is not the first row here
# (compare with the head() preview).
df['Review'].loc[0], type(df['Review'].loc[0])

('i be so angry that i make this post available via all possible sit i use when plan my trip so no one will make the mistake of book this place i make my book via book com we stay for night in this hotel from to july upon arrival we be place in a small room on the nd floor of the hotel it turn out that this be not the room we book i have specially reserve the level duplex room so that we would have a big window and high ceiling the room itself be ok if you don t mind the break window that can not be close hello rain and a mini fridge that contain some sort of a bio weapon at least i guess so by the smell of it i intimately ask to change the room and after explain time that i book a duplex btw it cost the same a a simple double but get way more volume due to the high ceiling be offer a room but only the next day so i have to check out the next day before o clock in order to get the room i wan to not the best way to begin your holiday so we have to wait till in order to check in my new r

In [6]:
# Features are the review texts; targets are the labels.
X, y = df['Review'], df['Label']

# Hold out 30% of the rows for testing. The split is shuffled with a fixed
# seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

In [7]:
# Size of the training split.
X_train.shape

(434446,)

In [8]:
# Show the feature value stored under index label 0 and its Python type.
# `.loc` makes the label-based lookup explicit; plain `X[0]` is ambiguous
# between "label 0" and "first row" on an integer-labelled Series.
X.loc[0], type(X.loc[0])

('i be so angry that i make this post available via all possible sit i use when plan my trip so no one will make the mistake of book this place i make my book via book com we stay for night in this hotel from to july upon arrival we be place in a small room on the nd floor of the hotel it turn out that this be not the room we book i have specially reserve the level duplex room so that we would have a big window and high ceiling the room itself be ok if you don t mind the break window that can not be close hello rain and a mini fridge that contain some sort of a bio weapon at least i guess so by the smell of it i intimately ask to change the room and after explain time that i book a duplex btw it cost the same a a simple double but get way more volume due to the high ceiling be offer a room but only the next day so i have to check out the next day before o clock in order to get the room i wan to not the best way to begin your holiday so we have to wait till in order to check in my new r

In [9]:
# Text classifier: hashed bag-of-words features fed into multinomial Naive Bayes.
# alternate_sign=False is set on the vectorizer that feeds MultinomialNB.
pipeline_naive_bayes = make_pipeline(
    HashingVectorizer(n_features=10000, alternate_sign=False),
    MultinomialNB(),
)

# Search space; keys follow the `<step name>__<parameter>` convention of the
# step names generated by make_pipeline.
parameters = dict(
    hashingvectorizer__norm=['l1', 'l2'],
    hashingvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    multinomialnb__alpha=[0.1, 1],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored on
# accuracy and run on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline_naive_bayes,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it
print(f"Best Score = {grid_search.best_score_}")
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Score = 0.9248537231167138
Best params = {'hashingvectorizer__ngram_range': (1, 1), 'hashingvectorizer__norm': 'l1', 'multinomialnb__alpha': 0.1}


In [10]:
# Hyperparameters selected by the grid search
best_params = grid_search.best_params_

# Rebuild the same HashingVectorizer + MultinomialNB pipeline and apply every
# tuned parameter in one call. Passing the whole dict through set_params keeps
# this cell in sync with the search grid instead of hand-copying each key
# (a parameter added to the grid later would otherwise be silently ignored).
pipeline_best = make_pipeline(
    HashingVectorizer(n_features=10000, alternate_sign=False),
    MultinomialNB(),
).set_params(**best_params)

# 10-fold cross-validation of the tuned configuration, scored on accuracy.
# NOTE(review): this refits clones of the pipeline on folds of the hold-out
# split (X_test / y_test), so it does not score a model trained on X_train --
# confirm that this is the intended evaluation.
cv_nb_hash = cross_validate(
    pipeline_best,
    X_test,
    y_test,
    scoring="accuracy",
    cv=10,
)

# Print the mean accuracy across the folds, rounded to two decimals
print(f"Mean Accuracy Score: {round(cv_nb_hash['test_score'].mean(),2)}")

Mean Accuracy Score: 0.92


In [11]:
# Fit the tuned pipeline on the training split. fit() returns the estimator
# itself, so `hash_nb_model` is the same (now fitted) pipeline object.
pipeline_best.fit(X_train, y_train)
hash_nb_model = pipeline_best

# Accuracy of the fitted model on the hold-out split.
y_pred = hash_nb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9252760591217668


In [13]:
# Persist the fitted pipeline (`hash_nb_model`) to disk for later reuse.
# `pickle` is already imported in the notebook's import cell, so it is not
# re-imported here.
# NOTE(review): the ../data directory must already exist; open() will not
# create it.
with open('../data/hash_nb_model(92%).pkl', 'wb') as file:
    pickle.dump(hash_nb_model, file)