In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
# CSV with the hotel reviews used in this notebook.
# NOTE(review): this is a machine-specific location under the user's home
# directory; consider a configurable data directory for portability.
file_path = '~/code/TechLah/RevuSum/data/processed_kswdf.csv'

# `hotelreviews` keeps the frame exactly as loaded; `df` is an independent
# deep copy, so later edits to `df` cannot alter the loaded data.
hotelreviews = pd.read_csv(filepath_or_buffer=file_path)
df = hotelreviews.copy(deep=True)

In [22]:
# Model input is the 'Review' column, the target is the 'Label' column.
X, y = df['Review'], df['Label']

# Shuffled 70/30 train/test split; the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

In [26]:
# Two-step pipeline: hashed text features -> multinomial Naive Bayes.
# alternate_sign=False keeps the hashed features non-negative, which is what
# MultinomialNB expects as input.
pipeline_naive_bayes = make_pipeline(
    HashingVectorizer(alternate_sign=False, n_features=10000),
    MultinomialNB(),
)

# Search space. Keys follow the "<step name>__<parameter>" convention, where
# the step names are the lowercase class names generated by make_pipeline.
parameters = dict(
    hashingvectorizer__norm=['l1', 'l2'],
    hashingvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    multinomialnb__alpha=[0.1, 1],
)

# Exhaustive search over the grid: 5-fold cross-validation scored on accuracy,
# using all available cores (n_jobs=-1) and printing a progress summary.
grid_search = GridSearchCV(
    estimator=pipeline_naive_bayes,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the best mean cross-validated score and the parameters that reached it.
print(f"Best Score = {grid_search.best_score_}")
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Score = 0.9306375192679065
Best params = {'hashingvectorizer__ngram_range': (1, 1), 'hashingvectorizer__norm': 'l2', 'multinomialnb__alpha': 1}


In [24]:
# Winning hyper-parameter combination found by the grid search.
best_params = grid_search.best_params_

# Build a fresh, unfitted pipeline with the same fixed settings as the one
# that was searched, then apply every tuned value in a single call.
# set_params understands the "<step>__<param>" keys stored in best_params, so
# any parameter added to the grid later is carried over automatically instead
# of being silently dropped by a hand-written constructor call.
pipeline_best = make_pipeline(
    HashingVectorizer(n_features=10000, alternate_sign=False),
    MultinomialNB(),
)
pipeline_best.set_params(**best_params)

# 10-fold cross-validated accuracy of that configuration.
# NOTE(review): this cross-validates on the held-out test split, i.e. the fold
# models are fitted on test data only — confirm this is intended rather than
# cross-validating on X_train / y_train.
cv_nb_hash = cross_validate(
    pipeline_best,
    X_test,
    y_test,
    scoring="accuracy",
    cv=10,
)

# Print the mean accuracy across the folds (the metric here is accuracy, not recall)
print(f"Mean Accuracy Score: {round(cv_nb_hash['test_score'].mean(),2)}")

Mean Accuracy Score: 0.93


In [25]:
# Fit the tuned pipeline on the training split. fit() returns the pipeline
# itself, so `hash_nb_model` and `pipeline_best` refer to the same fitted object.
hash_nb_model = pipeline_best.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
y_pred = hash_nb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9301002123169695
