In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [2]:
# Load the cleaned book dataset from a CSV file in the working directory.
BOOKS_CSV_PATH = 'Books_Data_Clean.csv'
data = pd.read_csv(BOOKS_CSV_PATH)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,index,Publishing Year,Book Name,Author,language_code,Author_Rating,Book_average_rating,Book_ratings_count,genre,gross sales,publisher revenue,sale price,sales rank,Publisher,units sold
0,0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000
1,1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250
2,2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500
3,3,2008.0,When You Are Engulfed in Flames,David Sedaris,en-US,Intermediate,4.04,150898,fiction,41250.0,24750.0,7.5,3,Hachette Book Group,5500
4,4,2011.0,Daughter of Smoke & Bone,Laini Taylor,eng,Intermediate,4.04,198283,genre fiction,37952.5,22771.5,7.99,4,Penguin Group (USA) LLC,4750


In [4]:
# Replace the space-separated column names used later in the notebook with
# underscore-separated ones, in a single in-place pass.
column_renames = {
    'sale price': 'sale_price',
    'Book Name': 'Book_Name',
    'Publishing Year': 'Publishing_Year',
}
data.rename(columns=column_renames, inplace=True)

In [5]:
# Count how many books share each average rating, ordered by rating value.
average_ratings = data['Book_average_rating']
rating_counts = average_ratings.value_counts().sort_index()

In [6]:
# One line per distinct average rating, in ascending rating order.
for score, n_books in rating_counts.items():
    print(f"Rating {score}: {n_books} data points")

Rating 2.97: 1 data points
Rating 3.1: 1 data points
Rating 3.21: 2 data points
Rating 3.22: 1 data points
Rating 3.23: 1 data points
Rating 3.28: 1 data points
Rating 3.29: 1 data points
Rating 3.3: 1 data points
Rating 3.31: 1 data points
Rating 3.32: 1 data points
Rating 3.33: 1 data points
Rating 3.35: 1 data points
Rating 3.37: 1 data points
Rating 3.39: 1 data points
Rating 3.4: 1 data points
Rating 3.41: 3 data points
Rating 3.42: 3 data points
Rating 3.43: 1 data points
Rating 3.45: 1 data points
Rating 3.46: 2 data points
Rating 3.47: 2 data points
Rating 3.48: 4 data points
Rating 3.49: 2 data points
Rating 3.5: 1 data points
Rating 3.51: 3 data points
Rating 3.52: 3 data points
Rating 3.53: 1 data points
Rating 3.54: 1 data points
Rating 3.55: 3 data points
Rating 3.56: 6 data points
Rating 3.57: 4 data points
Rating 3.58: 2 data points
Rating 3.59: 3 data points
Rating 3.6: 5 data points
Rating 3.62: 4 data points
Rating 3.63: 2 data points
Rating 3.64: 5 data points
Rating

In [7]:
def map_rating(value):
    """Bucket an average rating into an integer class from 1 to 5.

    Buckets are half a point wide and start at 2.5:
    [2.5, 3.0) -> 1, [3.0, 3.5) -> 2, [3.5, 4.0) -> 3, [4.0, 4.5) -> 4
    and [4.5, 5.0] -> 5 (the top bucket includes 5.0).

    Returns None for any value outside [2.5, 5.0], including NaN.
    """
    # Guard clause: NaN fails every comparison, so it is rejected here
    # together with values below 2.5 or above 5.0.
    if not (2.5 <= value <= 5.0):
        return None
    if value < 3.0:
        return 1
    if value < 3.5:
        return 2
    if value < 4.0:
        return 3
    if value < 4.5:
        return 4
    return 5


In [8]:
# Derive the integer rating class of every book from its average rating.
rating_classes = data['Book_average_rating'].apply(map_rating)
data['Rating'] = rating_classes

In [9]:
# Spot-check the bucketing: raw average rating next to its derived class.
rating_preview_columns = ['Book_average_rating', 'Rating']
print(data[rating_preview_columns].head())

   Book_average_rating  Rating
0                 3.42       2
1                 4.23       4
2                 3.31       2
3                 4.04       4
4                 4.04       4


In [10]:
# Preview the first five rows again, now including the renamed columns and
# the derived Rating column.
data.head(n=5)

Unnamed: 0,index,Publishing_Year,Book_Name,Author,language_code,Author_Rating,Book_average_rating,Book_ratings_count,genre,gross sales,publisher revenue,sale_price,sales rank,Publisher,units sold,Rating
0,0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000,2
1,1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250,4
2,2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500,2
3,3,2008.0,When You Are Engulfed in Flames,David Sedaris,en-US,Intermediate,4.04,150898,fiction,41250.0,24750.0,7.5,3,Hachette Book Group,5500,4
4,4,2011.0,Daughter of Smoke & Bone,Laini Taylor,eng,Intermediate,4.04,198283,genre fiction,37952.5,22771.5,7.99,4,Penguin Group (USA) LLC,4750,4


In [11]:
# Report the frame's dimensions as a (rows, columns) tuple.
frame_shape = data.shape
print(frame_shape)

(1070, 16)


In [12]:
# Discard, in place, every row that lacks a publishing year or a book name.
required_columns = ['Publishing_Year', 'Book_Name']
data.dropna(subset=required_columns, inplace=True)

In [13]:
# Replace missing language codes with the placeholder 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# mutates an intermediate object, emits a FutureWarning in recent pandas,
# and leaves `data` unchanged once Copy-on-Write is enabled.
data['language_code'] = data['language_code'].fillna('unknown')

In [14]:
# Count the missing values remaining in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

index                  0
Publishing_Year        0
Book_Name              0
Author                 0
language_code          0
Author_Rating          0
Book_average_rating    0
Book_ratings_count     0
genre                  0
gross sales            0
publisher revenue      0
sale_price             0
sales rank             0
Publisher              0
units sold             0
Rating                 0
dtype: int64


In [15]:
# Features: the book title (text) and its sale price. Target: rating class.
feature_columns = ['Book_Name', 'sale_price']
target_column = 'Rating'
X = data[feature_columns]
y = data[target_column]

In [16]:
# Two-stage split with a fixed seed: 70% training / 30% hold-out, then the
# hold-out is halved into validation and test sets (15% of the data each).
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SPLIT_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [17]:
# Token-count encoding of the book titles. The vocabulary is learned from
# the training titles only; validation and test titles reuse it.
vectorizer = CountVectorizer()
train_titles = X_train['Book_Name']
val_titles = X_val['Book_Name']
test_titles = X_test['Book_Name']
X_train_name = vectorizer.fit_transform(train_titles)
X_val_name = vectorizer.transform(val_titles)
X_test_name = vectorizer.transform(test_titles)


In [18]:
def _with_sale_price(name_matrix, features):
    """Return the dense title counts with the sale price appended as the last column."""
    title_counts = pd.DataFrame(name_matrix.toarray())
    # Drop the original row labels so the prices line up positionally with
    # the freshly built count frame, which has a default 0..n-1 index.
    sale_price = features['sale_price'].reset_index(drop=True)
    return pd.concat([title_counts, sale_price], axis=1)


X_train_combined = _with_sale_price(X_train_name, X_train)
X_val_combined = _with_sale_price(X_val_name, X_val)
X_test_combined = _with_sale_price(X_test_name, X_test)

In [19]:
# The combined frames carry integer labels for the count columns plus the
# string label 'sale_price'; cast every label to str so each frame has a
# single label type (presumably what scikit-learn expects - confirm).
for combined in (X_train_combined, X_val_combined, X_test_combined):
    combined.columns = combined.columns.astype(str)

In [None]:
# Hyperparameter search space for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the tree so that the grid search, the cross-validation scores and the
# final model are reproducible from run to run; an unseeded tree may break
# ties between equally good splits differently on every fit.
decision_tree = DecisionTreeClassifier(random_state=42)

# %%
# Exhaustive search over param_grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(decision_tree, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_combined, y_train)

# %%
# Display the best hyperparameters found by Grid Search
print("Best Hyperparameters:", grid_search.best_params_)

# %%
# Take the Decision Tree model with the best hyperparameters.
# GridSearchCV (refit=True by default) has already refitted this estimator
# on the whole training set, so no further .fit() call is needed.
best_decision_tree = grid_search.best_estimator_

# %%
# Using Cross-Validation with the best model
# cross_val_score fits clones of the estimator, so the fitted
# best_decision_tree itself is left untouched.
cv_scores = cross_val_score(best_decision_tree, X_train_combined, y_train, cv=5, scoring='accuracy')

print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())



In [None]:
# Array form of the validation features, kept only for any other cell that
# still refers to it.
X_val_combined_array = X_val_combined.to_numpy()
# Predict on the DataFrame itself rather than on the bare array: the tree was
# fitted on a DataFrame with string column names, so passing the frame lets
# scikit-learn verify the feature names instead of warning that X has none.
y_val_pred = best_decision_tree.predict(X_val_combined)

In [149]:
# Score the validation predictions: overall accuracy and confusion matrix.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_conf_matrix = confusion_matrix(y_true=y_val, y_pred=y_val_pred)

In [150]:
# Report both validation metrics; the matrix starts on its own line.
print(
    f"Validation Accuracy: {val_accuracy}",
    f"Validation Confusion Matrix:\n{val_conf_matrix}",
    sep="\n",
)

Validation Accuracy: 0.5668789808917197
Validation Confusion Matrix:
[[ 0  3  0  0]
 [ 2 29 34  1]
 [ 0 26 60  1]
 [ 0  1  0  0]]


In [153]:
user_input_book_name = 'Jon Ronson'  # Replace with the actual user input
user_input_sale_price = 3.79

# %%
# Encode the text with the vocabulary learned from the training titles.
user_input_name = vectorizer.transform([user_input_book_name])

# %%
# One-row feature frame: token counts followed by the sale price, i.e. the
# same column order that was used to build X_train_combined.
user_input_combined = pd.concat(
    [pd.DataFrame(user_input_name.toarray()), pd.Series([user_input_sale_price])],
    axis=1,
)

# %%
# Reuse the training column labels so the row carries the feature names the
# model was fitted with.
user_input_combined.columns = X_train_combined.columns

# %%
# Array form kept only for any other cell that still refers to it.
user_input_combined_array = user_input_combined.to_numpy()

# %%
# Predict on the labelled DataFrame rather than the bare array so that
# scikit-learn can check the feature names instead of warning that X has none.
prediction = best_decision_tree.predict(user_input_combined)

print(f"Predicted Rating for '{user_input_book_name}': {prediction[0]}")

Predicted Rating for 'Jon Ronson': 3


