In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
# Read the hotel-review CSV once and keep the loaded frame untouched;
# all later cells operate on the independent copy `df`.
file_path = '~/code/TechLah/RevuSum/data/lemm_df.csv'
hotelreviews = pd.read_csv(filepath_or_buffer=file_path)
df = hotelreviews.copy(deep=True)

In [12]:
# Features are the review texts, targets are their labels.
X = df['Review']
y = df['Label']

# Shuffled 70/30 split; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

In [13]:
# Hashing vectorizer feeding a multinomial Naive Bayes classifier.
# alternate_sign=False is presumably set because MultinomialNB rejects
# negative feature values -- confirm against the scikit-learn docs.
pipeline_naive_bayes = make_pipeline(
    HashingVectorizer(alternate_sign=False, n_features=10000),
    MultinomialNB(),
)

# Search space; the key prefixes are the step names that make_pipeline
# derives from the lowercased class names.
parameters = dict(
    hashingvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    hashingvectorizer__norm=['l1', 'l2'],
    multinomialnb__alpha=[0.1, 1],
)

# Exhaustive search over the grid: 5-fold CV on the training split,
# scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline_naive_bayes,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the configuration behind it.
print(f"Best Score = {grid_search.best_score_}")
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Score = 0.9247280137230749
Best params = {'hashingvectorizer__ngram_range': (1, 1), 'hashingvectorizer__norm': 'l2', 'multinomialnb__alpha': 1}


In [14]:
# Winning hyper-parameters from the grid search above.
best_params = grid_search.best_params_

# Rebuild an unfitted pipeline with those hyper-parameters; n_features and
# alternate_sign repeat the fixed values used when the search pipeline was
# defined.
pipeline_best = make_pipeline(
    HashingVectorizer(
        n_features=10000,
        alternate_sign=False,
        norm=best_params['hashingvectorizer__norm'],
        ngram_range=best_params['hashingvectorizer__ngram_range'],
    ),
    MultinomialNB(alpha=best_params['multinomialnb__alpha']),
)

# 10-fold cross-validation scored by accuracy.
# NOTE(review): this runs on X_test / y_test, so every fold refits the
# pipeline on part of the held-out split rather than scoring a model trained
# on X_train -- confirm that this is the intended evaluation.
cv_nb_hash = cross_validate(
    estimator=pipeline_best,
    X=X_test,
    y=y_test,
    scoring="accuracy",
    cv=10,
)

# Print the mean accuracy across the folds, rounded to two decimals.
mean_accuracy = cv_nb_hash['test_score'].mean()
print(f"Mean Accuracy Score: {round(mean_accuracy,2)}")

Mean Accuracy Score: 0.92


In [15]:
# Fit the tuned pipeline on the training split, then measure accuracy on
# the held-out split.
pipeline_best.fit(X_train, y_train)
hash_nb_model = pipeline_best  # fit() returns the same estimator object
y_pred = hash_nb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9241129950787613


In [16]:
# Validation data: read the processed review CSV and work on a copy so the
# loaded frame stays untouched.
test_file_path = '~/code/TechLah/RevuSum/data/test_processed_df.csv'
test_hotelreviews = pd.read_csv(filepath_or_buffer=test_file_path)
test_df = test_hotelreviews.copy(deep=True)

In [18]:
# Peek at the first two rows to check which columns the validation file has.
test_df.head(n=2)

Unnamed: 0,Review
0,"['i', 'be', 'so', 'angry', 'that', 'i', 'make'..."
1,"['no', 'negative']"


In [17]:
#Validation data split

test_X = test_df['Review']
test_y = test_df['Label']

KeyError: 'Label'

In [None]:
#Validation data prediction

test_y_pred = hash_nb_model.predict(test_X)
test_accuracy = accuracy_score(test_y, test_y_pred)
print("Test Accuracy:", test_accuracy)

In [None]:
# import pickle

# # Assuming you have a trained model object named 'model'
# # Save the model to a file
# with open('model.pkl', 'wb') as file:
#     pickle.dump(model, file)