In [9]:
import pandas as pd
import numpy as np
import matplotlib as plt
import re
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
filename = 'clustered_books.csv'
df = pd.read_csv(filename)
df['processed_description'] = df['processed_description'].fillna('')
print(df.head(10))

                     title                                        description  \
0  The Old Man and the Sea  Librarian's note: An alternate cover edition c...   
1       The Vampire Lestat  Lestat. The vampire hero of Anne Rice's enthra...   
2     The Poisonwood Bible  The Poisonwood Bible is a story told by the wi...   
3        Different Seasons  Includes the stories “The Body” and “Rita Hayw...   
4            Invisible Man  First published in 1952 and immediately hailed...   
5            Battle Royale  Koushun Takami's notorious high-octane thrille...   
6    I'll Give You the Sun  At first, Jude and her twin brother Noah, are ...   
7    Because of Winn-Dixie  The summer Opal and her father, the preacher, ...   
8            Lover Avenged  Rehvenge has always kept his distance from the...   
9                Ficciones  The seventeen pieces in Ficciones demonstrate ...   

                               processed_description  cluster  
0  librarian note alternate cover edition fo

In [3]:
# Separate processed description and cluster columns into separate dataframes to be used for data split later
description = pd.DataFrame()
description = df['processed_description']
cluster = pd.DataFrame()
cluster = df['cluster']

In [4]:
# Split into train, validation, test sets
X_train, X_temp, y_train, y_temp = train_test_split(description, cluster, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [5]:
# Check shapes of the train, validation, test sets
print(X_train.shape[0])
print(X_val.shape[0])
print(X_test.shape[0])

6979
1496
1496


In [6]:
# Calculate BoW for train, val, test sets to be used as input for Random Forest Classifier
vectorizer = CountVectorizer(binary=False, min_df=2)

X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)
X_test_bow = vectorizer.transform(X_test)

print('Number of unigram', len(vectorizer.get_feature_names_out()))

Number of unigram 21605


In [7]:
# Fit Random Forest Regressor on train data

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train_bow, y_train)

RandomForestClassifier(random_state=42)

In [17]:
# Predict on the train set
y_train_pred = rf_model.predict(X_train_bow)


# Evaluate the model
accuracy_train = accuracy_score(y_train, y_train_pred)
print(f"Accuracy of the Random Forest Classifier for Train: {accuracy_train:.2f}")

print("\n")

# Predict on the test set
y_val_pred = rf_model.predict(X_val_bow)


# Evaluate the model
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f"Accuracy of the Random Forest Classifier for Validation: {accuracy_val:.2f}")

print("\n")

# Predict on the test set
y_test_pred = rf_model.predict(X_test_bow)

# Evaluate the model
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"Accuracy of the Random Forest Classifier for Test: {accuracy_test:.2f}")

Accuracy of the Random Forest Classifier for Train: 1.00


Accuracy of the Random Forest Classifier for Validation: 0.37


Accuracy of the Random Forest Classifier for Test: 0.37


### Tuning Hyperparameters for a better accuracy
#### 1) Randomized Search CV

In [16]:
# We use Randomized Search with cross-validation

from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid or distributions for random search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}

# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, 
                                   cv=5, scoring = 'accuracy', random_state=42)

random_search.fit(X_train_bow, y_train)

# Get the best hyperparameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score (accuracy):", best_score)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
test_accuracy = best_model.score(X_test_bow, y_test)
print("Test Set Accuracy:", test_accuracy)

Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 12}
Best Score (accuracy): 0.3225390516683612
Test Set Accuracy: 0.3074866310160428
