# Book-Oracle

Book-Oracle is a book recommendation app that suggests books based on a reader's preferences.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np

#modelling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from scipy.stats import loguniform
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from transformers import pipeline
from tqdm import tqdm
import nltk


#plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Notebook-wide figure defaults: 8x5 figures, white backgrounds, black axes border
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.facecolor": "w",
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
    }
)
# Register pandas' formatters and converters with matplotlib
pd.plotting.register_matplotlib_converters()
# Display floats with a thousands separator and two decimals (e.g. 1,234.57).
# The option is set exactly once so the effective format is unambiguous.
pd.options.display.float_format = "{:,.2f}".format

import warnings
# Suppress every warning for the rest of the session (no category or module filter is given)
warnings.filterwarnings('ignore')

# Seed constant; not referenced by the cells shown here - presumably meant for later modelling steps (TODO confirm)
RSEED = 42

## Import Data

In [None]:
# Load the raw ratings table from its CSV file
df_ratings = pd.read_csv(filepath_or_buffer="data/Ratings.csv")

In [None]:
# Dimensions of the ratings DataFrame as (rows, columns)
df_ratings.shape

In [None]:
# Descriptive statistics of the ratings DataFrame
df_ratings.describe()

In [None]:
# Column dtypes and non-null counts of the ratings DataFrame
df_ratings.info()

An ISBN (International Standard Book Number) is a 10-digit (assigned before 2007) or 13-digit identifier that uniquely identifies books and book-like products published internationally.

In [None]:
# Character length of every ISBN string in the ratings table
ISBNCount = df_ratings.loc[:, 'ISBN'].str.len()
print(ISBNCount)

In [None]:
# Distinct ISBN string lengths found in the data (7 different lengths were noted here)
ISBNCount.unique()

In [None]:
# ISBN length breakdown recorded during exploration:
#   all rows          = 1149780
#   ISBN length == 13 = 1740
#   ISBN length == 10 = 1139363
#   ISBN length == 9  = 5140
# Keep the rows whose ISBN is 13 characters long, as the name df_13 says.
# The filter previously selected length 9, which contradicted the variable name.
# NOTE(review): if the 9-character rows were the real target, rename the variable instead.
df_13 = df_ratings[df_ratings['ISBN'].str.len() == 13]

In [None]:
# Number of rows that are exact repeats of an earlier row (all columns identical)
df_ratings.duplicated().sum()

In [None]:
# Distinct users, then distinct books, one number per line
# (values noted during exploration: 105283 users, 340556 ISBNs)
print(df_ratings['User-ID'].nunique(), df_ratings['ISBN'].nunique(), sep='\n')

In [None]:
# Number of rows recorded for each (User-ID, ISBN) pair.

# The same User-ID shows up next to many different ISBNs, i.e. one user rates more than one book.

df2 = pd.pivot_table(df_ratings, index=['User-ID', 'ISBN'], aggfunc='size')
print(df2)

NOTE: One user can rate multiple books (ISBNs), and likewise a book (ISBN) can have ratings from multiple users.

It is a many-to-many (M:N) relationship.


So there are no duplicate entries in the Ratings table.

In [None]:
# Count how often each fully identical row (all columns together) occurs

df2 = df_ratings.groupby(list(df_ratings.columns), as_index=False).size()

# Rows printed here occur more than once; an empty frame means there are no exact duplicates
print(df2.loc[df2['size'].gt(1)])


In [None]:
# Find the ISBNs that were rated more than once

# Number of rating rows per ISBN
isbn_counts = df_ratings.groupby('ISBN').size()

# Keep only the ISBNs that occur in more than one row
multiple_userids_isbn = isbn_counts.loc[isbn_counts.gt(1)]

# Show those ISBNs together with their row counts
print(multiple_userids_isbn)



In [None]:
# Switch the hyphenated CSV headers to snake_case names

column_name_mapping = dict([
    ('User-ID', 'user_id'),
    ('Book-Rating', 'book_rating'),
    # further (old, new) pairs can be appended here
])

# Apply the renaming directly on df_ratings (the frame is modified in place)
df_ratings.rename(column_name_mapping, axis='columns', inplace=True)

Python function for data cleaning of Ratings csv file

- Renaming Columns
- Checking for duplicates if any
- Checking missing / null values

In [None]:
#Perform data cleaning operations on a DataFrame using pandas built-in functions.

def dataclean_rating(df_ratings):
    """Rename the ratings columns and print a duplicate / missing-value report.

    The frame passed in is modified in place: 'User-ID' becomes 'user_id' and
    'Book-Rating' becomes 'book_rating'. The function then prints the column
    index, every row that has an exact duplicate (all occurrences are listed),
    and every row containing at least one missing value.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Ratings table to rename and inspect.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the renamed columns.
    """
    renames = {'User-ID': 'user_id', 'Book-Rating': 'book_rating'}

    # In-place on purpose: callers may ignore the return value
    df_ratings.rename(columns=renames, inplace=True)
    print(df_ratings.columns)

    # keep=False flags every member of a duplicate group, not only the repeats
    is_repeated = df_ratings.duplicated(keep=False)
    print("Duplicate Rows:")
    print(df_ratings[is_repeated])

    # A row counts as incomplete as soon as one of its cells is NaN
    has_gap = df_ratings.isna().any(axis=1)
    print("\nRows with Missing Values:")
    print(df_ratings[has_gap])

    return df_ratings

In [None]:
# Run the cleaning report on the ratings table; the returned frame is the cell's output
dataclean_rating(df_ratings)

## Data Cleaning for Each Table

## Exploratory Data Analysis for Each Table

### Statistical Summary

In [None]:
# Summary statistics for every column, regardless of dtype
df_ratings.describe(include='all')

- Data visualization based on book ratings (0 ratings included)

In [None]:
# Number of rows per rating value, rating 0 included
rating_axes = plt.figure(figsize=(6, 4)).add_subplot()
sns.countplot(data=df_ratings, x="book_rating", ax=rating_axes)

In [None]:
# Same count plot, restricted to explicit ratings (rows with rating 0 are left out)
data = df_ratings.loc[df_ratings['book_rating'].ne(0)]
plt.figure(figsize=(8, 6))
sns.countplot(data=data, x="book_rating")
plt.title("Explicit Ratings")

In [None]:
# Summary statistics of the book_rating column
df_ratings['book_rating'].describe()

### Observation - Book-Ratings are  distributed with median rating of 3

In [None]:
# Pairwise relationships between all columns except ISBN.
# sns.pairplot builds its own figure, so no plt.figure(...) call precedes it:
# such a call would not resize the plot and would only emit an extra empty figure.
# Use pairplot's `height` / `aspect` arguments to change the plot size.
sns.pairplot(data=df_ratings.drop(['ISBN'], axis=1))
plt.show()

In [None]:
# Per ISBN, count the ratings whose value is at least 5 and list the most frequently rated books first.

# Note: the 'book_rating' column of the result holds that count, not a rating value.
isbn_rating_desc = (
    df_ratings.loc[df_ratings['book_rating'].ge(5)]
    .groupby(['ISBN'])['book_rating']
    .count()
    .reset_index()
    .sort_values('book_rating', ascending=False)
)
print(isbn_rating_desc)

In [None]:
# Pair plot coloured by book, limited to the most frequently rated titles.
# Using every ISBN as hue would create one colour and legend entry per distinct
# book, which is not renderable at this cardinality, so only the TOP_N_ISBN
# most-rated ISBNs are plotted.
TOP_N_ISBN = 10
top_isbns = df_ratings['ISBN'].value_counts().index[:TOP_N_ISBN]

# Work on a copy so df_ratings keeps its ISBN dtype for every other cell;
# ISBN becomes a category (unused levels dropped) for the plot only.
ratings_top = df_ratings[df_ratings['ISBN'].isin(top_isbns)].copy()
ratings_top['ISBN'] = ratings_top['ISBN'].astype('category').cat.remove_unused_categories()

# Create a pair plot
sns.pairplot(ratings_top, hue='ISBN', markers='o', palette='viridis')

plt.show()

## Merge Tables

## Data Cleaning

## Exploratory Data Analysis

## Pipeline Architecture

## Sample Size

## Modelling

## Evaluation

## Error Analysis