# Book-Oracle: Data Cleaning

- Clean three datasets using the functions in utils/data_cleaning and save the merged dataset
- 26.11.2023
- Janina, Oliwia, Neha, Nina

## Import Libraries

In [None]:
import pandas as pd
import numpy as np


#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Global plot styling: 8x5 figures, white figure/axes backgrounds, black axes border.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.facecolor": "w",
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
    }
)
# Register pandas' formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Render floats with a thousands separator and two decimals (e.g. 1,234.57).
# Only one formatter can be active at a time; a '%.3f' formatter set just
# before this one would be overwritten immediately, so only this one is kept.
pd.options.display.float_format = "{:,.2f}".format

#Custom functions
from utils.data_cleaning import clean_ratings, clean_users, clean_books

# Presumably a random seed for reproducibility.
# NOTE(review): RSEED is not referenced in the visible part of this notebook — confirm it is still needed.
RSEED = 42

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning messages
# that flag behavior changes; consider narrowing the filter.
warnings.filterwarnings('ignore')

## Import Data

In [None]:
# Load the three raw tables from CSV files under data/.
ratings_df = pd.read_csv('data/ratings.csv')
books_df = pd.read_csv('data/books.csv')
users_df = pd.read_csv('data/users.csv')

# Report the dimensions of each table; DataFrame.shape is (rows, columns).
print("Ratings table has {} rows and {} columns".format(*ratings_df.shape))
print("Books table has {} rows and {} columns".format(*books_df.shape))
print("Users table has {} rows and {} columns".format(*users_df.shape))

## Data cleaning

#### Clean Books Dataset

In [None]:
# clean_books returns the cleaned books table plus a lookup object; the lookup
# is applied later to the ratings table's 'isbn' column.
books_df, common_identifier_dict = clean_books(books_df)

# Dimensions of the books table after cleaning.
print("Books table has {} rows and {} columns".format(*books_df.shape))

In [None]:
# Lift the row-display limit so the filtered preview below is shown in full.
pd.set_option('display.max_rows', None)

# Decode the HTML entity "&amp;" to a plain ampersand in author names.
# regex=False: the pattern is a literal string, not a regular expression.
# NOTE(review): if clean_books changes the casing of author names (e.g. to
# "&Amp;"), this case-sensitive literal will not match — confirm.
books_df['book_author'] = books_df['book_author'].str.replace('&amp;', '&', regex=False)

# Show rows with round brackets in the title column of books_df.
# The cell must end in a valid expression to display anything; a bare
# undefined name here would raise NameError.
# na=False treats missing titles as "no match" so the boolean mask stays valid.
books_df[books_df['book_title'].str.contains(r"\(.*\)", regex=True, na=False)]



In [None]:
# Remove everything between round brackets in book_title.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave every title unchanged.
# NOTE(review): `.*` is greedy, so a title with several bracketed parts loses
# everything from the first "(" to the last ")" — confirm this is intended.
books_df['book_title'] = books_df['book_title'].str.replace(r"\(.*\)", "", regex=True)

# Collapse runs of whitespace (e.g. the double space left where a bracketed
# part was removed mid-title) and trim leading/trailing whitespace.
books_df['book_title'] = (
    books_df['book_title']
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [None]:
# Preview the first three rows of books_df.
books_df.head(3)

#### Clean Ratings Dataset

TODO: change the last print statement in the cell below.

In [None]:
# Run the project's ratings-cleaning routine.
ratings_df = clean_ratings(ratings_df)

# Translate each rating's ISBN into the shared book identifier using the lookup
# returned by clean_books. Assuming the lookup is a plain dict, ISBNs missing
# from it map to NaN (no integer conversion is performed here).
ratings_df['common_identifier'] = ratings_df['isbn'].map(common_identifier_dict)

print("Ratings table has {} rows and {} columns".format(*ratings_df.shape))

# Count the ratings whose identifier does not occur in books_df.
print("")
print("There are {} rows in ratings_df that do not have a common identifier in books_df".format((~ratings_df['common_identifier'].isin(books_df['common_identifier'])).sum()))

# Cell output: the three ratings with the smallest common identifiers.
ratings_df.sort_values(by="common_identifier").head(3)

#### Clean Users Dataset

In [None]:
# Run the project's user-cleaning routine, then report the resulting dimensions.
users_df = clean_users(users_df)

print("Users table has {} rows and {} columns".format(*users_df.shape))

# Cell output: first three rows of the users table.
users_df.head(3)

## Merge tables

For content-based filtering (NLP) we will need ALL books, regardless of whether they have a rating; in that case, do a LEFT merge.

Users: also consider a RIGHT merge on users, to capture all users regardless of whether there is metadata.

In [None]:
# Inner-join books -> ratings -> users: only ratings that have both a matching
# book (via common_identifier) and a matching user (via user_id) are kept.
df = (
    books_df
    .merge(ratings_df, on='common_identifier', how='inner')
    .merge(users_df, on='user_id', how='inner')
)

# Total number of missing cells across all columns.
print("There are {} missing values in df".format(df.isna().sum().sum()))

# Number of rows that are exact duplicates of an earlier row.
print("There are {} duplicates in df".format(df.duplicated().sum()))

# Cell output: first three rows of the merged table.
df.head(3)

In [None]:
# Print a summary of df (columns, non-null counts, dtypes).
df.info()

## Save cleaned data

In [None]:
# Write df to CSV; index=False leaves the row index out of the file.
df.to_csv('data/kaggle_full_df.csv', index=False)