In [53]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [54]:
file_path = '~/code/TechLah/RevuSum/data/processed_kswdf.csv'
hotelreviews = pd.read_csv(file_path)
df = hotelreviews.copy()

In [55]:
df = df.groupby('Label', group_keys=False).apply(lambda x: x.sample(10000))

df['Label'].value_counts()

Label
0    10000
1    10000
Name: count, dtype: int64

In [56]:
X = df['Review']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=True,test_size=0.3,random_state=1)

## Not In Use as we are using CountVectorizer in Pipeline but this is how a basic countvectorizer works

vectorizer = CountVectorizer()

# Fit and transform the 'clean_text' column
features = vectorizer.fit_transform(df.Review)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame with the features as column names
X_bow = pd.DataFrame(features.toarray(), columns=feature_names)

In [57]:
# Pipeline vectorizer + Naive Bayes
pipeline_naive_bayes = make_pipeline(
    CountVectorizer(),
    MultinomialNB()
)

# Define the grid of parameters
parameters = {
    'countvectorizer__ngram_range': ((1,2), (3,4)),
    'multinomialnb__alpha': (0.1,1)
}

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_naive_bayes,
    parameters,
    scoring = "recall",
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(df['Review'],df['Label'])

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Best Score = 0.9437999999999999
Best params = {'countvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha': 1}


In [58]:
# Access the best parameters
best_params = grid_search.best_params_

# Use the best parameters to create a pipeline
pipeline_best = make_pipeline(
    CountVectorizer(ngram_range=best_params['countvectorizer__ngram_range']),
    MultinomialNB(alpha=best_params['multinomialnb__alpha'])
)

cv_nb = cross_validate(
    pipeline_best,
    df['Review'],
    df['Label'],
    scoring = "accuracy",
    cv=10
)

# Print the mean recall score
print(f"Mean Accuracy Score: {round(cv_nb['test_score'].mean(),2)}")

Mean Accuracy Score: 0.94


In [59]:
nb_model = pipeline_best.fit(X_train,y_train)
y_pred = nb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9415
