In [167]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import io
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

Using a variable for the filepath to ease location and change if needed

In [35]:
filepath="C:/Users/itrem/OneDrive/Dev/dsti/S23/Python Project/Goodreads-books-rating/data/books.csv"

Importing CSV with warning for incorrect data import to review whether it will have an impact on the data set information. Only 4 lines were skipped, which given the dataset volume are deprecable

In [135]:
dataset = pd.read_csv(filepath, on_bad_lines='warn')
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7     num_pages         11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'


Reviwing basic stats of the dataset:

In [51]:
# Basic stats
print("# Number of rows : {}".format(dataset.shape[0]))
print()

print("# Display of dataset: ")
print("---------------------")
display(dataset.head())
print()

print("# Basics statistics: ")
print("--------------------")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("# Percentage of missing values: ")
print("-------------------------------")
display(100*dataset.isnull().sum()/dataset.shape[0])

print("# Feature valUe types:")
print("----------------------")
display(dataset.dtypes)

# Number of rows : 11123

# Display of dataset: 
---------------------


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic



# Basics statistics: 
--------------------


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
count,11123.0,11123,11123,11123.0,11123.0,11123.0,11123,11123.0,11123.0,11123.0,11123,11123
unique,,10348,6639,,11123.0,,27,,,,3679,2290
top,,The Iliad,Stephen King,,439785960.0,,eng,,,,10/1/2005,Vintage
freq,,9,40,,1.0,,8908,,,,56,318
mean,21310.856963,,,3.934075,,9759880000000.0,,336.405556,17942.85,542.048099,,
std,13094.727252,,,0.350485,,442975800000.0,,241.152626,112499.2,2576.619589,,
min,1.0,,,0.0,,8987060000.0,,0.0,0.0,0.0,,
25%,10277.5,,,3.77,,9780345000000.0,,192.0,104.0,9.0,,
50%,20287.0,,,3.96,,9780582000000.0,,299.0,745.0,47.0,,
75%,32104.5,,,4.14,,9780872000000.0,,416.0,5000.5,238.0,,



# Percentage of missing values: 
-------------------------------


bookID                0.0
title                 0.0
authors               0.0
average_rating        0.0
isbn                  0.0
isbn13                0.0
language_code         0.0
  num_pages           0.0
ratings_count         0.0
text_reviews_count    0.0
publication_date      0.0
publisher             0.0
dtype: float64

# Feature valUe types:
----------------------


bookID                  int64
title                  object
authors                object
average_rating        float64
isbn                   object
isbn13                  int64
language_code          object
  num_pages             int64
ratings_count           int64
text_reviews_count      int64
publication_date       object
publisher              object
dtype: object

After reviewing of the stats we notice a few things:
- There are a few columns that do not provide any valuable information that could be used in a model: bookID (it's an index), isbn (book internal reference) and isbn13 (book internal reference). These can be removed
- publication_date should be imported as a date and is processed as an object
- num_pages has a weird space on front

In [136]:
# Drop useless columns
useless_cols = ['bookID', 'isbn', 'isbn13']

print("Dropping useless columns...")
dataset = dataset.drop(useless_cols, axis=1) # axis = 1 indicates that we are dropping along the column axis

print("Converting date columns to the right format...")
dataset['publication_date']= pd.to_datetime(dataset['publication_date'], errors='coerce', format='%m/%d/%Y')
dataset.dtypes

print("Renaming columns...")
dataset.rename(columns={'  num_pages': 'num_pages'}, inplace=True)

print("...Done.")
display(dataset.head())
display(dataset.dtypes)


Dropping useless columns...
Converting date columns to the right format...
Renaming columns...
...Done.


Unnamed: 0,title,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,eng,652,2095690,27591,2006-09-16,Scholastic Inc.
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,eng,870,2153167,29221,2004-09-01,Scholastic Inc.
2,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,eng,352,6333,244,2003-11-01,Scholastic
3,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,eng,435,2339585,36325,2004-05-01,Scholastic Inc.
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,eng,2690,41428,164,2004-09-13,Scholastic


title                         object
authors                       object
average_rating               float64
language_code                 object
num_pages                      int64
ratings_count                  int64
text_reviews_count             int64
publication_date      datetime64[ns]
publisher                     object
dtype: object

After this, we can continue with two data engineering splits:
- We noticed that books that are from a series are within parenthesis. We can extract this information to a new column and remove the series number
- Splitting authors into several columns could be cumbersome and provide little information, however knowing whether the book was single written or co-authored could have a significant meaning, so we can create a column that will contain the first author only, who is generally the main author

In [137]:
# Creating a new column for the in_series data
dataset.insert(1,"is_series", False)

# Creating a new column for the main_author data
dataset.insert(3,"main_author", False)
    

In [138]:
# Stripping the series data from the title to the newly created column
for i in range(len(dataset.title)) :
    if dataset.title[i].find("#") != -1: # All series have a volume number with # before. Flagging those help strip the series name
        dataset.is_series[i] = dataset.title[i][dataset.title[i].find("(")+1:dataset.title[i].find(")")].strip()[:dataset.title[i][dataset.title[i].find("(")+1:dataset.title[i].find(")")].find(" #")]
        dataset.title[i] = dataset.title[i][:dataset.title[i].find("(")-1] # to remove the series name and number from the book title
    
# Copying the main author name to the newly created column
for i in range(len(dataset.authors)):
    if dataset.authors[i].find("/") != -1:
        dataset.main_author[i] = dataset.authors[i][:dataset.authors[i].find("/")]
    
    else:
       dataset.main_author[i] = dataset.authors[i] 

dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.is_series[x] = dataset.title[x][dataset.title[x].find("(")+1:dataset.title[x].find(")")].strip()[:dataset.title[x][dataset.title[x].find("(")+1:dataset.title[x].find(")")].find(" #")]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.title[x] = dataset.title[x][:dataset.title[x].find("(")-1]


Unnamed: 0,title,is_series,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,Harry Potter and the Half-Blood Prince,Harry Potter,J.K. Rowling/Mary GrandPré,4.57,eng,652,2095690,27591,2006-09-16,Scholastic Inc.
1,Harry Potter and the Order of the Phoenix,Harry Potter,J.K. Rowling/Mary GrandPré,4.49,eng,870,2153167,29221,2004-09-01,Scholastic Inc.
2,Harry Potter and the Chamber of Secrets,Harry Potter,J.K. Rowling,4.42,eng,352,6333,244,2003-11-01,Scholastic
3,Harry Potter and the Prisoner of Azkaban,Harry Potter,J.K. Rowling/Mary GrandPré,4.56,eng,435,2339585,36325,2004-05-01,Scholastic Inc.
4,Harry Potter Boxed Set Books 1-5,Harry Potter,J.K. Rowling/Mary GrandPré,4.78,eng,2690,41428,164,2004-09-13,Scholastic


In [164]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = "average_rating"

X = dataset.drop(target_variable, axis = 1)
Y = dataset.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    4.57
1    4.49
2    4.42
3    4.56
4    4.78
Name: average_rating, dtype: float64

X :
                                       title      is_series  \
0     Harry Potter and the Half-Blood Prince  Harry Potter    
1  Harry Potter and the Order of the Phoenix  Harry Potter    
2    Harry Potter and the Chamber of Secrets  Harry Potter    
3   Harry Potter and the Prisoner of Azkaban  Harry Potter    
4          Harry Potter Boxed Set  Books 1-5  Harry Potter    

                      authors   main_author language_code  num_pages  \
0  J.K. Rowling/Mary GrandPré  J.K. Rowling           eng        652   
1  J.K. Rowling/Mary GrandPré  J.K. Rowling           eng        870   
2                J.K. Rowling  J.K. Rowling           eng        352   
3  J.K. Rowling/Mary GrandPré  J.K. Rowling           eng        435   
4  J.K. Rowling/Mary GrandPré  J.K. Rowling           eng       2690   

   ratings_count  text_reviews_count publicati

In [169]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['num_pages', 'ratings_count', 'text_reviews_count']
Found categorical features  ['title', 'is_series', 'authors', 'main_author', 'language_code', 'publication_date', 'publisher']


In [170]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)
print("...Done.")
print()

Dividing into train and test sets...


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# Create pipeline for numeric features
numeric_transformer = SimpleImputer(strategy='mean') # missing values will be replaced by columns' mean

# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first') # no missing values in categorical data, so we only need the OHE


# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_train.head())
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()
# Label encoding
print("Encoding labels...")
print(Y_test[0:5])
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

In [None]:
# Perform grid search
print("Grid search...")
classifier = DecisionTreeClassifier()

# Grid of values to be tested
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8]
}
gridsearch = GridSearchCV(classifier, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)

In [None]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on training set...")
Y_train_proba = gridsearch.predict_proba(X_train)
print("...Done.")
print(Y_train_proba)
print()

In [None]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# It's also possible to get the probabilities estimated by the model:
print("Probabilities on test set...")
Y_test_proba = gridsearch.predict_proba(X_test)
print("...Done.")
print(Y_test_proba)
print()

In [None]:
# Print scores
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))
print()

print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))
print()

In [None]:
# Visualize confusion matrices
_ , ax = plt.subplots() # Get subplot from matplotlib
ax.set(title="Confusion Matrix on Train set") # Set a title that we will add into ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(gridsearch, X_train, Y_train, ax=ax) # ConfusionMatrixDisplay from sklearn
plt.show()

_ , ax = plt.subplots() # Get subplot from matplotlib
ax.set(title="Confusion Matrix on Test set") # Set a title that we will add into ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(gridsearch, X_test, Y_test, ax=ax) # ConfusionMatrixDisplay from sklearn
plt.show()

In [None]:
# Visualize ROC curves
_ , ax = plt.subplots() # Get subplot from matplotlib
ax.set(title="ROC Curve on Train set") # Set a title that we will add into ConfusionMatrixDisplay
RocCurveDisplay.from_estimator(gridsearch, X_train, Y_train, ax=ax) # RocCurveDisplay from sklearn
plt.show()

_ , ax = plt.subplots() # Get subplot from matplotlib
ax.set(title="ROC Curve on Test set") # Set a title that we will add into ConfusionMatrixDisplay
RocCurveDisplay.from_estimator(gridsearch, X_test, Y_test, ax=ax) # RocCurveDisplay from sklearn
plt.show()