In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
file_path = '~/code/TechLah/RevuSum/data/processed_kswdf.csv'
hotelreviews = pd.read_csv(file_path)
df = hotelreviews.copy()

In [4]:
X = df['Review']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True,test_size=0.3)

In [5]:
# Pipeline Hashing vectorizer + Naive Bayes
pipeline_naive_bayes = make_pipeline(
    HashingVectorizer(),
    MultinomialNB()
)

# Define the grid of parameters
parameters = {
    'hashingvectorizer__n_features': [10000],
    'hashingvectorizer__norm': [None,'l1','l2'],
    'hashingvectorizer__alternate_sign': [False],
    'multinomialnb__alpha': [0.1, 1]
}

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_naive_bayes,
    parameters,
    scoring = "recall",
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(df['Review'],df['Label'])

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score = 0.9504380092524775
Best params = {'hashingvectorizer__alternate_sign': False, 'hashingvectorizer__n_features': 10000, 'hashingvectorizer__norm': 'l1', 'multinomialnb__alpha': 1}


In [6]:
# Access the best parameters
best_params = grid_search.best_params_

# Use the best parameters to create a pipeline
pipeline_best = make_pipeline(
    HashingVectorizer(alternate_sign=best_params['hashingvectorizer__alternate_sign'],
                      n_features=best_params['hashingvectorizer__n_features'],
                      norm=best_params['hashingvectorizer__norm']),
    MultinomialNB(alpha=best_params['multinomialnb__alpha'])
)

cv_nb_hash = cross_validate(
    pipeline_best,
    df['Review'],
    df['Label'],
    scoring = "accuracy",
    cv=10
)

# Print the mean recall score
print(f"Mean Accuracy Score: {round(cv_nb_hash['test_score'].mean(),2)}")

Mean Accuracy Score: 0.93


In [7]:
hash_nb_model = pipeline_best.fit(X_train,y_train)
y_pred = hash_nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9299354000575227
