In [1]:
# Import libraries we want to use
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV



In [2]:
# Load the book dataset into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
data = pd.read_csv('Books_Data_Clean.csv')

In [3]:
# Preview the first five rows of the raw data
data.head(n=5)

Unnamed: 0,index,Publishing Year,Book Name,Author,language_code,Author_Rating,Book_average_rating,Book_ratings_count,genre,gross sales,publisher revenue,sale price,sales rank,Publisher,units sold
0,0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000
1,1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250
2,2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500
3,3,2008.0,When You Are Engulfed in Flames,David Sedaris,en-US,Intermediate,4.04,150898,fiction,41250.0,24750.0,7.5,3,Hachette Book Group,5500
4,4,2011.0,Daughter of Smoke & Bone,Laini Taylor,eng,Intermediate,4.04,198283,genre fiction,37952.5,22771.5,7.99,4,Penguin Group (USA) LLC,4750


In [4]:
# Give the space-containing columns used later in the notebook identifier-style
# names. rename() returns a new frame, so the result is assigned back instead of
# mutating the existing frame with inplace=True.
data = data.rename(columns={
    'sale price': 'sale_price',
    'Book Name': 'Book_Name',
    'Publishing Year': 'Publishing_Year',
})

In [5]:
# Tally how many books share each distinct average rating, ordered by rating value
rating_counts = (
    data['Book_average_rating']
    .value_counts(sort=False)
    .sort_index(ascending=True)
)

# Report the distribution, one rating value per line
for rating, count in rating_counts.items():
    print(f"Rating {rating}: {count} data points")

Rating 2.97: 1 data points
Rating 3.1: 1 data points
Rating 3.21: 2 data points
Rating 3.22: 1 data points
Rating 3.23: 1 data points
Rating 3.28: 1 data points
Rating 3.29: 1 data points
Rating 3.3: 1 data points
Rating 3.31: 1 data points
Rating 3.32: 1 data points
Rating 3.33: 1 data points
Rating 3.35: 1 data points
Rating 3.37: 1 data points
Rating 3.39: 1 data points
Rating 3.4: 1 data points
Rating 3.41: 3 data points
Rating 3.42: 3 data points
Rating 3.43: 1 data points
Rating 3.45: 1 data points
Rating 3.46: 2 data points
Rating 3.47: 2 data points
Rating 3.48: 4 data points
Rating 3.49: 2 data points
Rating 3.5: 1 data points
Rating 3.51: 3 data points
Rating 3.52: 3 data points
Rating 3.53: 1 data points
Rating 3.54: 1 data points
Rating 3.55: 3 data points
Rating 3.56: 6 data points
Rating 3.57: 4 data points
Rating 3.58: 2 data points
Rating 3.59: 3 data points
Rating 3.6: 5 data points
Rating 3.62: 4 data points
Rating 3.63: 2 data points
Rating 3.64: 5 data points
Rating

In [6]:
# Define a function to map ratings into categories
def map_rating(value):
    """Map a numeric average rating onto a coarse quality label.

    Buckets are half-open on the right, except the last one which also
    includes 5.0:

        [2.5, 3.0) -> "very bad"
        [3.0, 3.5) -> "bad"
        [3.5, 4.0) -> "good"
        [4.0, 4.5) -> "very good"
        [4.5, 5.0] -> "excellent"

    Parameters
    ----------
    value : float
        Average rating of a book.

    Returns
    -------
    str or None
        The label of the bucket containing ``value``; ``None`` when the value
        is below 2.5, above 5.0, or NaN (every comparison with NaN is false).
    """
    if 2.5 <= value < 3.0:
        # Label spelling fixed (was "verry bad") so the typo does not end up
        # as a class name in the training target.
        return "very bad"
    elif 3.0 <= value < 3.5:
        return "bad"
    elif 3.5 <= value < 4.0:
        return "good"
    elif 4.0 <= value < 4.5:
        return "very good"
    elif 4.5 <= value <= 5.0:
        return "excellent"
    else:
        # Outside every bucket (or NaN): no category.
        return None

In [7]:
# Derive the categorical target: one label per book, computed element-wise
# from its average rating by map_rating.
data['Rating'] = data['Book_average_rating'].map(map_rating)

In [8]:
# Spot-check the mapping: numeric rating next to its derived label (first five rows)
print(data.head()[['Book_average_rating', 'Rating']])

   Book_average_rating     Rating
0                 3.42        bad
1                 4.23  very good
2                 3.31        bad
3                 4.04  very good
4                 4.04  very good


In [9]:
# Preview the first five rows again, now including the renamed columns and Rating
data.head(n=5)

Unnamed: 0,index,Publishing_Year,Book_Name,Author,language_code,Author_Rating,Book_average_rating,Book_ratings_count,genre,gross sales,publisher revenue,sale_price,sales rank,Publisher,units sold,Rating
0,0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000,bad
1,1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250,very good
2,2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500,bad
3,3,2008.0,When You Are Engulfed in Flames,David Sedaris,en-US,Intermediate,4.04,150898,fiction,41250.0,24750.0,7.5,3,Hachette Book Group,5500,very good
4,4,2011.0,Daughter of Smoke & Bone,Laini Taylor,eng,Intermediate,4.04,198283,genre fiction,37952.5,22771.5,7.99,4,Penguin Group (USA) LLC,4750,very good


In [10]:
# Count missing values per column before cleaning
print(data.isna().sum())

index                   0
Publishing_Year         1
Book_Name              23
Author                  0
language_code          53
Author_Rating           0
Book_average_rating     0
Book_ratings_count      0
genre                   0
gross sales             0
publisher revenue       0
sale_price              0
sales rank              0
Publisher               0
units sold              0
Rating                  0
dtype: int64


In [11]:
# Discard every row that lacks a publishing year or a book name
data.dropna(how='any', subset=['Publishing_Year', 'Book_Name'], inplace=True)

In [12]:
# Replace missing language codes with the placeholder 'unknown'.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on the data['language_code'] selection is chained
# in-place assignment, which pandas 2.x deprecates and which does not update
# `data` once Copy-on-Write is enabled.
data['language_code'] = data['language_code'].fillna('unknown')

In [13]:
# Re-count missing values per column to confirm the cleaning steps worked
print(data.isna().sum())

index                  0
Publishing_Year        0
Book_Name              0
Author                 0
language_code          0
Author_Rating          0
Book_average_rating    0
Book_ratings_count     0
genre                  0
gross sales            0
publisher revenue      0
sale_price             0
sales rank             0
Publisher              0
units sold             0
Rating                 0
dtype: int64


In [14]:
# Feature matrix: the numeric sale price plus one-hot indicator columns for the
# categorical Author_Rating (get_dummies leaves the numeric column untouched).
X = pd.get_dummies(data.loc[:, ['sale_price', 'Author_Rating']])

# Target vector: the rating category derived from Book_average_rating
y = data['Rating']

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [16]:
# Baseline decision tree with default hyperparameters.
# random_state is pinned (same seed as the train/test split) so that the fitted
# tree, and therefore the reported accuracy, is reproducible across runs.
model = DecisionTreeClassifier(random_state=42)

In [17]:
# Train the baseline tree on the training split
model.fit(X=X_train, y=y_train)

In [18]:
# Predict rating categories for the held-out rows with the baseline tree
pred = model.predict(X_test)

In [19]:
# Peek at the first five true labels of the held-out set
y_test.head(n=5)

360     very good
571     very good
893     very good
1002         good
32      very good
Name: Rating, dtype: object

In [20]:
# Share of held-out rows whose predicted category matches the true one.
# accuracy_score is already imported in the notebook's import cell, so it is
# not re-imported here.
accuracy_score(y_test, pred)

0.8285714285714286

In [21]:
# Hyperparameter values to try in the grid search (2 * 4 * 3 * 3 = 72 combinations)
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [22]:
# Create the Decision Tree classifier that the grid search will tune.
# random_state is pinned so that every candidate fit, and the best
# hyperparameters selected from them, are reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)

In [23]:
# Exhaustively evaluate every combination in param_grid with 5-fold
# cross-validation on the training split, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)



In [24]:
# Show the hyperparameter combination with the best cross-validated accuracy
print(f"Best Hyperparameters: {grid_search.best_params_}")

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [25]:
# Keep a handle on the estimator that the grid search selected as best
best_decision_tree = grid_search.best_estimator_

In [26]:
# Fit the selected tree on the full training split.
# NOTE(review): GridSearchCV is created without a `refit` argument, and its
# default (refit=True) already fits best_estimator_ on the whole training set,
# so this call repeats that fit.
best_decision_tree.fit(X=X_train, y=y_train)

In [27]:
# Predict rating categories for the held-out rows with the tuned tree
# (this overwrites the baseline predictions stored in `pred`)
pred = best_decision_tree.predict(X_test)

In [28]:
# Held-out accuracy of the tuned tree.
# accuracy_score is already imported in the notebook's import cell, so it is
# not re-imported here.
accuracy_score(y_test, pred)

0.8380952380952381

In [None]:
# Predict the rating category for a book described interactively by the user.

# The training matrix was built with pd.get_dummies, which names each indicator
# column '<source column>_<value>'. Deriving the accepted values from those
# columns guarantees that only categories the model was trained on are offered.
rating_prefix = 'Author_Rating_'
allowed_ratings = [
    col[len(rating_prefix):]
    for col in X_train.columns
    if col.startswith(rating_prefix)
]

# Get user input for Author Rating and Book Price
rating_choices = ', '.join(repr(rating) for rating in allowed_ratings)
user_input_authour_rating = input(f"Enter Author Rating ({rating_choices}): ").strip()
sale_price_text = input("Enter Book Price: ")

# Parse the price defensively so a typo produces a message instead of a traceback
try:
    user_input_sale_price = float(sale_price_text)
except ValueError:
    user_input_sale_price = None

if user_input_authour_rating not in allowed_ratings:
    print(f"Invalid Author Rating. Allowed values are: {', '.join(allowed_ratings)}")
elif user_input_sale_price is None:
    print(f"Invalid Book Price: {sale_price_text!r} is not a number.")
else:
    # Build a single-row frame with exactly the training columns, in the same
    # order: all indicators start at 0, then the price and the chosen
    # Author_Rating indicator are filled in. (Encoding the bare value with
    # get_dummies yields a column named e.g. 'Novice' and an unnamed price
    # column, neither of which matches a training column, so the model would
    # only ever see an all-zero row.)
    user_input_combined = pd.DataFrame(0.0, index=[0], columns=X_train.columns)
    user_input_combined.loc[0, 'sale_price'] = user_input_sale_price
    user_input_combined.loc[0, rating_prefix + user_input_authour_rating] = 1.0

    # Pass the DataFrame itself (not a bare array) so the feature names match
    # the ones the model was fitted with.
    prediction = best_decision_tree.predict(user_input_combined)

    # The model outputs the rating category label directly
    print(f"Predicted Rating for 'your book is': {prediction[0]}")

