In [None]:
#Import Libraries

import pandas as pd
import numpy as np
import os
import glob
import csv
#import Selenium for webscraping
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn look for every later plot: 'darkgrid' style, then the 'deep' palette.
sns.set(style= 'darkgrid')
sns.set_palette('deep')


#transformations
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import metrics

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score, recall_score

In [None]:
#FEATURE SELECTION AND ENGINEERING

In [None]:
# Load the cleaned books dataset from disk into the feature-engineering frame.
# (Alternative left by the author: df1_featuring = df1.copy() when df1 is still in memory.)
cleaned_books_csv = 'books_file_cleaned.csv'
df1_featuring = pd.read_csv(cleaned_books_csv)

# Print column names, dtypes and non-null counts.
df1_featuring.info()

In [None]:
# Split the column names by dtype: 'object' columns are treated as categorical,
# 'int64' / 'float64' columns as numerical.
object_columns = df1_featuring.select_dtypes(include=['object']).columns
numeric_columns = df1_featuring.select_dtypes(include=['int64', 'float64']).columns

categorical_features = list(object_columns)
numerical_features = list(numeric_columns)

print('categorical column of the dataset are : \n', categorical_features)
print('\n numerical column of the dataset are : \n', numerical_features)

In [None]:
# Pairwise correlation of the first five numerical columns, drawn as an annotated heatmap.
heatmap_columns = numerical_features[:5]
correlations = df1_featuring[heatmap_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='PiYG')

In [None]:
# Remove from the feature lists the columns that are not used for modelling.
# Written as filters rather than list.remove so the cell can be re-run safely:
# list.remove raises ValueError once the name is already gone.
#
# Numerical: author_occurrence and publisher_occurrence were excluded by the author after
# inspecting the correlation heatmap; text_reviews_count is excluded too because the
# next cell drops it from the frame, so keeping it here would leave a dangling name.
excluded_numerical = {'text_reviews_count', 'author_occurrence', 'publisher_occurrence'}
numerical_features = [name for name in numerical_features if name not in excluded_numerical]

# Categorical: isbn13 and categories are not used as predictors.
excluded_categorical = {'isbn13', 'categories'}
categorical_features = [name for name in categorical_features if name not in excluded_categorical]

In [None]:
# Drop from the frame the columns that were excluded from the feature lists:
# text_reviews_count, author_occurrence, publisher_occurrence, isbn13 and categories.
# The drop is non-mutating and only targets columns that are still present, so
# re-running the cell does not raise KeyError.
columns_to_drop = ['text_reviews_count', 'author_occurrence', 'publisher_occurrence', 'isbn13', 'categories']
df1_featuring = df1_featuring.drop(
    columns=[name for name in columns_to_drop if name in df1_featuring.columns]
)

In [None]:
# Distribution of the first three numerical features: one histogram (with KDE) per row,
# with a dashed vertical line at the column mean.
plot_columns = numerical_features[0:3]
# squeeze=False always returns a 2-D array of Axes, so the loop also works when only
# one column is plotted (a bare Axes object would not be iterable).
fig, axes = plt.subplots(len(plot_columns), 1, figsize=(20, 15), squeeze=False)
for axe, col in zip(axes.ravel(), plot_columns):
    sns.histplot(df1_featuring[col], ax=axe, color='darkturquoise', kde=True)
    # `c` is an alias of `color`; the original passed both with different values, which is
    # contradictory. Only the explicit `color='red'` is kept.
    axe.axvline(df1_featuring[col].mean(), color='red', linestyle='--')
    axe.set_title(col)
plt.tight_layout()
plt.show()





In [None]:
def remove_outliers(df1_featuring, col, factor=1.5):
    """Return the rows whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``. Values lying exactly
    on a fence are kept, so a column whose IQR is zero (e.g. a constant column) keeps its
    rows instead of being emptied. Rows where ``col`` is NaN are dropped, because NaN
    fails both comparisons.

    Parameters
    ----------
    df1_featuring : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows inside the fences, with the original index preserved.
    """
    values = df1_featuring[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (factor * iqr)
    upper_bound = q3 + (factor * iqr)
    # Inclusive comparison: a point on the fence is not an outlier.
    within_fences = (values >= lower_bound) & (values <= upper_bound)
    return df1_featuring[within_fences]

In [None]:

# Filter outliers on a copy (df2) so df1_featuring keeps all of its rows. The IQR filter is
# applied one column after another, printing the row count before and after each pass.
# NOTE(review): the columns are chosen by position (numerical_features[1:3]) - confirm
# that these are the intended ones.
df2 = df1_featuring.copy()
outlier_columns = numerical_features[1:3]
for col in outlier_columns:
    print(f'Before removing outliers from {col} : {df2.shape[0]}')
    df2 = remove_outliers(df2, col)
    print(f'After removing outliers from {col} : {df2.shape[0]}')

In [None]:

# Min-max scale ratings_count and num_pages to the [0, 1] range.
# NOTE(review): min and max are computed on the whole frame, before the train/test split,
# so the test rows influence the scaling - confirm this is acceptable.
for v in ['ratings_count', 'num_pages']:
    column_min = df2[v].min()
    value_range = df2[v].max() - column_min
    # A constant column has zero range; set it to 0.0 rather than dividing by zero,
    # which would fill the column with NaN.
    df2[v] = (df2[v] - column_min) / value_range if value_range else 0.0

# Distributions after outlier removal and scaling, with a dashed line at each mean.
plot_columns = numerical_features[0:3]
# squeeze=False keeps `axes` iterable even when a single column is plotted.
fig, axes = plt.subplots(len(plot_columns), 1, figsize=(20, 15), squeeze=False)
for axe, col in zip(axes.ravel(), plot_columns):
    sns.histplot(df2[col], ax=axe, color='darkblue', kde=True)
    # `c` is an alias of `color`; the original passed both with different values, which is
    # contradictory. Only the explicit `color='red'` is kept.
    axe.axvline(df2[col].mean(), color='red', linestyle='--')
    axe.set_title(col)
plt.tight_layout()
plt.show()

In [None]:

# One-hot encode every remaining categorical feature. drop_first=True omits the first
# level of each feature, so k levels become k-1 indicator columns.
# NOTE(review): re-running this cell fails, because the encoded columns replace the originals.
df2 = pd.get_dummies(
    data=df2,
    columns=categorical_features,
    drop_first=True,
)

In [None]:

# Separate the target (average_rating) from the predictors, then hold out 20% of the
# rows as a test set with a fixed seed.
target_column = 'average_rating'
y = df2[target_column]
X = df2.drop(columns=target_column)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Row counts, in order: the full encoded frame, the test predictors, the training target.
for row_count in (len(df2), len(X_test), len(y_train)):
    print(row_count)

In [None]:
# Show only the columns that contain at least one missing value.
has_missing = df2.isnull().any()
has_missing[has_missing]

In [None]:
#MACHINE LEARNING : REGRESSION

In [None]:

# Fit a linear regression baseline and report error metrics on both splits.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the held-out and the training rows, to compare generalisation with fit.
y_test_pred_lr = lr.predict(X_test)
y_train_pred_lr = lr.predict(X_train)


def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R^2, or NaN when n_samples - n_features - 1 <= 0.

    With at least as many features as samples minus one (possible after wide one-hot
    encoding) the denominator is zero or negative and the statistic is undefined.
    """
    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom <= 0:
        return float('nan')
    return 1 - (1 - r2) * (n_samples - 1) / degrees_of_freedom


# Compute each metric once per split; RMSE is derived from the MSE.
mse_train_lr = mean_squared_error(y_train, y_train_pred_lr)
mse_test_lr = mean_squared_error(y_test, y_test_pred_lr)
r2_train_lr = r2_score(y_train, y_train_pred_lr)
r2_test_lr = r2_score(y_test, y_test_pred_lr)

#print the MAE, MSE, R^2, adjusted R^2 and RMSE for the train and test set
print('MAE in Linear Regression train: %.3f, test: %.3f' % (
        mean_absolute_error(y_train, y_train_pred_lr),
        mean_absolute_error(y_test, y_test_pred_lr)))
print('MSE in Linear Regression train: %.3f, test: %.3f' % (mse_train_lr, mse_test_lr))
print('R^2 in Linear Regression train: %.3f, test: %.3f' % (r2_train_lr, r2_test_lr))
print('Adjusted R^2 in Linear Regression train: %.3f, test: %.3f' % (
        _adjusted_r2(r2_train_lr, len(y_train), X_train.shape[1]),
        _adjusted_r2(r2_test_lr, len(y_test), X_test.shape[1])))
print('RMSE in Linear Regression train: %.3f, test: %.3f' % (
        np.sqrt(mse_train_lr),
        np.sqrt(mse_test_lr)))

In [None]:
# Predicted (x) against actual (y) ratings on the test split for the linear model.
scatter_ax = plt.gca()
scatter_ax.scatter(y_test_pred_lr, y_test, alpha=0.7, color='b')
scatter_ax.set_xlabel(' Book Ratings prediction')
scatter_ax.set_ylabel('Actual Book Ratings')
scatter_ax.set_title('linear regression')
plt.tight_layout()
scatter_ax.set_xlim(0, 5)
plt.show()

In [358]:
# Fit a random forest regressor and report error metrics on both splits.
# random_state makes the forest reproducible between runs (same seed as the split);
# n_jobs=-1 builds the trees on all available cores, since the fit is slow on a wide
# one-hot-encoded matrix.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Predictions on the held-out and the training rows, to compare generalisation with fit.
y_test_pred_rf = rf.predict(X_test)
y_train_pred_rf = rf.predict(X_train)

# Compute the MSE once per split; RMSE is derived from it.
mse_train_rf = mean_squared_error(y_train, y_train_pred_rf)
mse_test_rf = mean_squared_error(y_test, y_test_pred_rf)

#print the MAE, MSE, R^2, RMSE for the train and test set
print('MAE in Random Forest train: %.3f, test: %.3f' % (
        mean_absolute_error(y_train, y_train_pred_rf),
        mean_absolute_error(y_test, y_test_pred_rf)))
print('MSE in Random Forest train: %.3f, test: %.3f' % (mse_train_rf, mse_test_rf))
print('R^2 in Random Forest train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred_rf),
        r2_score(y_test, y_test_pred_rf)))
print('RMSE in Random Forest train: %.3f, test: %.3f' % (
        np.sqrt(mse_train_rf),
        np.sqrt(mse_test_rf)))

KeyboardInterrupt: 

In [None]:
# Predicted (x) against actual (y) ratings on the test split for the random forest.
scatter_ax = plt.gca()
scatter_ax.scatter(y_test_pred_rf, y_test, alpha=0.7, color='b')
scatter_ax.set_xlabel(' Book Ratings prediction')
scatter_ax.set_ylabel('Actual Book Ratings')
scatter_ax.set_title('Random Forest')
plt.tight_layout()
scatter_ax.set_xlim(0, 5)
plt.show()

In [None]:
# Fit a ridge regression (L2 penalty, alpha=1.0) and report error metrics on both splits.
# Ridge is already imported in the imports cell at the top of the notebook, so the
# duplicate import that used to sit here has been removed.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Predictions on the held-out and the training rows, to compare generalisation with fit.
y_test_pred_ridge = ridge.predict(X_test)
y_train_pred_ridge = ridge.predict(X_train)

# Compute the MSE once per split; RMSE is derived from it.
mse_train_ridge = mean_squared_error(y_train, y_train_pred_ridge)
mse_test_ridge = mean_squared_error(y_test, y_test_pred_ridge)

#print the MAE, MSE, R^2, RMSE for the train and test set
print('MAE in Ridge train: %.3f, test: %.3f' % (
        mean_absolute_error(y_train, y_train_pred_ridge),
        mean_absolute_error(y_test, y_test_pred_ridge)))
print('MSE in Ridge train: %.3f, test: %.3f' % (mse_train_ridge, mse_test_ridge))
print('R^2 in Ridge train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred_ridge),
        r2_score(y_test, y_test_pred_ridge)))
print('RMSE in Ridge train: %.3f, test: %.3f' % (
        np.sqrt(mse_train_ridge),
        np.sqrt(mse_test_ridge)))



In [None]:
# Predicted (x) against actual (y) ratings on the test split for the ridge model.
scatter_ax = plt.gca()
scatter_ax.scatter(y_test_pred_ridge, y_test, alpha=0.7, color='b')
scatter_ax.set_xlabel(' Book Ratings prediction')
scatter_ax.set_ylabel('Actual Book Ratings')
scatter_ax.set_title('Ridge Regression')
plt.tight_layout()
scatter_ax.set_xlim(0, 5)
plt.show()

In [None]:
# Side-by-side bars of actual and ridge-predicted ratings for the first 15 test rows.
pred = pd.DataFrame({'Actual': y_test.tolist(), 'Predicted': y_test_pred_ridge.tolist()}).head(15)
# The former mid-cell `pred.head(10)` was removed: its value was neither displayed nor stored.
pred.plot(kind='bar', figsize=(15, 5))
plt.title('Ridge model with predicted and real values')
plt.xlabel('Number of examples')
plt.ylabel('Average Ratings')
plt.ylim(0,5)
# Render the figure explicitly so the cell does not end by echoing the ylim tuple.
plt.show()