In [41]:
import pandas as pd
import numpy as np
import matplotlib as plt
import re
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
filename = 'clustered_books.csv'
df = pd.read_csv(filename)
df['processed_description'] = df['processed_description'].fillna('')
print(df.head(10))

                     title                                        description  \
0  The Old Man and the Sea  Librarian's note: An alternate cover edition c...   
1       The Vampire Lestat  Lestat. The vampire hero of Anne Rice's enthra...   
2     The Poisonwood Bible  The Poisonwood Bible is a story told by the wi...   
3        Different Seasons  Includes the stories “The Body” and “Rita Hayw...   
4            Invisible Man  First published in 1952 and immediately hailed...   
5            Battle Royale  Koushun Takami's notorious high-octane thrille...   
6    I'll Give You the Sun  At first, Jude and her twin brother Noah, are ...   
7    Because of Winn-Dixie  The summer Opal and her father, the preacher, ...   
8            Lover Avenged  Rehvenge has always kept his distance from the...   
9                Ficciones  The seventeen pieces in Ficciones demonstrate ...   

                               processed_description  cluster  
0  librarian note alternate cover edition fo

In [16]:
# Separate processed description and cluster columns into separate dataframes to be used for data split later
description = pd.DataFrame()
description = df['processed_description']
cluster = pd.DataFrame()
cluster = df['cluster']

In [17]:
# Split into train, validation, test sets
X_train, X_temp, y_train, y_temp = train_test_split(description, cluster, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [18]:
# Check shapes of the train, validation, test sets
print(X_train.shape[0])
print(X_val.shape[0])
print(X_test.shape[0])

6979
1496
1496


In [24]:
# Calculate BoW for train, val, test sets to be used as input for Random Forest Classifier
vectorizer = CountVectorizer(binary=False, min_df=2)

X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)
X_test_bow = vectorizer.transform(X_test)

print('Number of unigram', len(vectorizer.get_feature_names_out()))

Number of unigram 21605


In [31]:
# Fit Random Forest Regressor on train data

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train_bow, y_train)

RandomForestRegressor(random_state=42)

In [59]:
from sklearn.metrics import mean_squared_error

# Predict on the test set
y_val_pred = rf_model.predict(X_val_bow)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error on the Validation set: {mse:.2f}")

# Evaluate the model
accuracy_val = rf_model.score(X_val_bow, y_val)
print(f"Accuracy of the Random Forest Regressor for Validation: {accuracy_val:.2f}")

# Evaluate the model using R^2 Score
val_score = rf_model.score(X_val_bow, y_val)
print("Validation Score:", val_score)

print("\n")

# Predict on the test set
y_pred = rf_model.predict(X_test_bow)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on the test set: {mse:.2f}")

# Evaluate the model
accuracy_test = rf_model.score(X_val_bow, y_val)
print(f"Accuracy of the Random Forest Regressor for Test: {accuracy_test:.2f}")

# Evaluate the model using R^2 Score
test_score = rf_model.score(X_test_bow, y_test)
print("Test Score:", test_score)

Mean Squared Error on the Validation set: 13.95
Accuracy of the Random Forest Regressor for Validation: 0.03
Validation Score: 0.03383545491195383


Mean Squared Error on the test set: 12.45
Accuracy of the Random Forest Regressor for Test: 0.03
Test Score: 0.014676289388211194


### Trying to tune Hyperparameters for a better accuracy NEED TO FIGURE THIS OUT:
#### 1) Randomized Search CV
#### 2) Grid Search CV

In [56]:
# We use Randomized Search with cross-validation

from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid or distributions for random search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}

# Create a Random Forest regressor
rf = RandomForestRegressor(n_estimators=100)

# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train_bow, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

KeyboardInterrupt: 

In [55]:
# Evaluate the model
best_rf = random_search.best_estimator_

validation_score = best_rf.score(X_val_bow, y_val)
print("Validation Score:", validation_score)


test_score = best_rf.score(X_test_bow, y_test)
print("Test Score:", test_score)

Test Score: 0.11776064426748822
Validation Score: 0.015517665180930229
Test Score: 0.016692736693864396


In [35]:
# We use Grid Search on validation set to find best parameters for the model

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_leaf': [1, 5, 10],
    'min_samples_split': [2, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize GridSearchCV with the Random Forest classifier and the parameter grid
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on the validation set
# X_val_bow is the transformed validation data, y_val are the clusters
grid_search.fit(X_val_bow, y_val)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_score = grid_search.best_score_

KeyboardInterrupt: 