In [None]:
import ast
import os

import numpy as np
import pandas as pd

In [None]:
## reading the two dataset files from the kaggle input directory:
##   user_df  <- Books_rating.csv
##   book_df  <- books_data.csv
## the paths are absolute kaggle paths, so this cell only works where that input is mounted
user_df = pd.read_csv('/kaggle/input/amazon-books-reviews/Books_rating.csv')
book_df = pd.read_csv('/kaggle/input/amazon-books-reviews/books_data.csv')

In [None]:
## showing the user dataframe
user_df.head()

In [None]:
## printing the shape of user dataframe
user_df.shape

In [None]:
## give the review columns short snake_case names
## (rename returns a new frame, which is bound back to user_df)
user_df = user_df.rename(
    columns={
        "Title": "title",
        "User_id": "user_id",
        "review/score": "rating",
        "review/helpfulness": "rating_review",
        "review/summary": "review_summary",
        "review/text": "review",
    }
)

In [None]:
## keep only the columns needed for the analysis
unused_user_columns = ["Id", "Price", "profileName", "review/time"]
user_df = user_df.drop(columns=unused_user_columns)

In [None]:
## now showing the user dataframe again
user_df.head()

In [None]:
## printing the shape again
user_df.shape

In [None]:
## showing the book dataframe 
book_df.head()

In [None]:
## printing the shape of book dataframe
book_df.shape

In [None]:
## align the book column names with the ones used in user_df
book_df = book_df.rename(
    columns={
        "ratingsCount": "ratings",
        "categories": "genre",
        "Title": "title",
    }
)

In [None]:
## keep only the columns needed for the analysis
unused_book_columns = ["description", "previewLink", "publishedDate", "infoLink", "image"]
book_df = book_df.drop(columns=unused_book_columns)

In [None]:
## showing the book dataframe again
book_df.head()

In [None]:
## printing the shape again
book_df.shape

# Handling The NULL Values

In [None]:
## checking for the empty fields for user_df 
user_df.isna().sum()

In [None]:
## removing the rows containing empty fields on title
user_df = user_df.dropna(subset=['title'])

In [None]:
## checking for the empty fields for user_df again
user_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on user_id
user_df = user_df.dropna(subset=['user_id'])

In [None]:
## checking for the empty fields for user_df again
user_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on review_summary
user_df = user_df.dropna(subset=['review_summary'])

In [None]:
## checking for the empty fields for user_df again
user_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on review
user_df = user_df.dropna(subset=['review'])

In [None]:
## checking for the empty fields for user_df again
user_df.isna().sum()

In [None]:
## now printing the shape again for user dataframe 
user_df.shape

In [None]:
## checking for the empty fields for book_dataframe
book_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on title
book_df = book_df.dropna(subset=['title'])

In [None]:
## checking for the empty fields for book_dataframe again
book_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on ratings
book_df = book_df.dropna(subset=['ratings'])

In [None]:
## checking for the empty fields for book_dataframe again
book_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on genre
book_df = book_df.dropna(subset=['genre'])

In [None]:
## checking for the empty fields for book_dataframe again
book_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on authors
book_df = book_df.dropna(subset=['authors'])

In [None]:
## checking for the empty fields for book_dataframe again
book_df.isna().sum()

In [None]:
## now removing the rows containing empty fields on publisher
book_df = book_df.dropna(subset=['publisher'])

In [None]:
## checking for the empty fields for book_dataframe again
book_df.isna().sum()

In [None]:
## now printing the shape again for book dataframe
book_df.shape

# Checking For Uncertainty In Dataset After Handling Null Values

In [None]:
## user_ids that appear on more than 1000 review rows (value_counts keeps them ordered by count)
user_df_id_counts = user_df['user_id'].value_counts()
user_ids_g1000 = user_df_id_counts.loc[lambda counts: counts > 1000].index

In [None]:
user_ids_g1000

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid", palette="rainbow")

In [None]:
## one scatter panel per heavy reviewer: rating (y) against dataframe row index (x)
## squeeze=False keeps `axes` two-dimensional, so indexing also works when only one user qualifies
fig, axes = plt.subplots(nrows=len(user_ids_g1000), ncols=1, figsize=(6, 30),
                         squeeze=False, constrained_layout=True)

for panel, user_id in zip(axes[:, 0], user_ids_g1000):
    ## dedicated name: the loop must not overwrite `user_info`, which other cells use
    ## for the per-user mode table
    user_ratings = user_df.loc[user_df['user_id'] == user_id, 'rating']
    ## plot the rating column explicitly instead of handing the whole frame to seaborn
    sns.scatterplot(ax=panel, x=user_ratings.index.to_numpy(), y=user_ratings.to_numpy(),
                    color='skyblue')
    panel.set_ylim(0, 6)
    panel.set_title(f'Ratings for user_id = {user_id}')
    panel.set_xlabel('Row index')
    panel.set_ylabel('Rating')
plt.show()

# Applying Fuzzy Logic For Removing Uncertainty

## Create A DataFrame based on the mode rating of each user according to the rating given to the books

In [None]:
user_info = user_df.groupby('user_id')

In [None]:
## most frequent rating of every user, computed straight from user_df so that this cell can be
## re-run on its own (it does not need `user_info` to still hold the GroupBy object)
user_info = (
    user_df.groupby('user_id')['rating']
    .apply(lambda ratings: ratings.mode().iloc[0])  ## mode() is sorted, so ties resolve to the lowest rating
    .reset_index()
)

In [None]:
user_info.columns = ['user_id','mode_rating']

In [None]:
user_info.head()

In [None]:
user_info.shape

## Create A DataFrame based on the mode rating of each book globally

In [None]:
book_info = user_df.groupby('title')

In [None]:
## most frequent rating of every book title, computed straight from user_df so that this cell can be
## re-run on its own (it does not need `book_info` to still hold the GroupBy object)
book_info = (
    user_df.groupby('title')['rating']
    .apply(lambda ratings: ratings.mode().iloc[0])  ## mode() is sorted, so ties resolve to the lowest rating
    .reset_index()
)

In [None]:
book_info.columns = ['title','mode_rating']

In [None]:
book_info.shape

In [None]:
book_info.head()

In [None]:
user_df.shape

## Applying the Fuzzy Logic

In [None]:
merged_user_df = user_df

In [None]:
## attach the per-user and the per-book mode rating to every review row.
## pandas applies suffixes only to overlapping column names:
##   - first merge: no overlap, so the user's mode stays 'mode_rating'
##   - second merge: both sides have 'mode_rating'; the left one keeps its name (empty suffix)
##     and the book's mode becomes 'mode_rating_book_mode'
merged_user_df = merged_user_df.merge(user_info, on='user_id', suffixes=('', '_user_mode'))
merged_user_df = merged_user_df.merge(book_info, on='title', suffixes=('', '_book_mode'))

In [None]:
merged_user_df.shape

In [None]:
def update_rating(row):
    """Return the adjusted rating for one merged review row.

    Reads three fields from ``row``: 'rating' (the rating as given),
    'mode_rating' (the user's most frequent rating) and
    'mode_rating_book_mode' (the book's most frequent rating).

    * rating equal to or below the user's mode: the result is the larger of
      the given rating and the book's mode.
    * rating above the user's mode: the result is the book's mode when that
      is higher than the given rating, otherwise the user's mode.
    """
    own_mode = row['mode_rating']
    title_mode = row['mode_rating_book_mode']
    given = row['rating']

    # the user rated exactly at their usual level
    if own_mode == given:
        return given if title_mode <= given else title_mode

    # the user rated below their usual level
    if own_mode > given:
        return title_mode if title_mode >= given else given

    # the user rated above their usual level
    return own_mode if title_mode <= given else title_mode
    


In [None]:
## overwrite every rating with its adjusted value.
## axis=1 hands the rows to update_rating one at a time (one Python call per row)
merged_user_df['rating'] = merged_user_df.apply(update_rating, axis=1)

In [None]:
## the two helper mode columns have served their purpose, so they are removed again
helper_columns = ['mode_rating', 'mode_rating_book_mode']
merged_user_df = merged_user_df.drop(columns=helper_columns)

# Checking after applying the Fuzzy logic

In [None]:
user_ids_g1000

In [None]:
## same per-user scatter panels as before the adjustment, now drawn from merged_user_df
## squeeze=False keeps `axes` two-dimensional, so indexing also works when only one user qualifies
fig, axes = plt.subplots(nrows=len(user_ids_g1000), ncols=1, figsize=(6, 30),
                         squeeze=False, constrained_layout=True)

for panel, user_id in zip(axes[:, 0], user_ids_g1000):
    ## dedicated name: assigning to `user_info` here would overwrite the per-user mode table
    ## and break a later re-run of the merge cell
    user_ratings = merged_user_df.loc[merged_user_df['user_id'] == user_id, 'rating']
    ## plot the rating column explicitly instead of handing the whole frame to seaborn
    sns.scatterplot(ax=panel, x=user_ratings.index.to_numpy(), y=user_ratings.to_numpy(),
                    color='skyblue')
    panel.set_ylim(0, 6)
    panel.set_title(f'Ratings for user_id = {user_id}')
    panel.set_xlabel('Row index')
    panel.set_ylabel('Rating')
plt.show()

In [None]:
user_df = merged_user_df

In [None]:
## saving the adjusted ratings to the kaggle working directory
## NOTE(review): `index` is left at its default, so the row index is also written as an
## unnamed first column - confirm this is wanted
user_df.to_csv('/kaggle/working/book_ratings.csv')

# EDA 

In [None]:
user_df.head()

In [None]:
user_count = user_df['user_id'].nunique()

In [None]:
book_df.head()

In [None]:
book_count = book_df['title'].nunique()

In [None]:
## counting unique authors.
## every 'authors' cell read from the csv is a string such as "['A', 'B']"; it must be parsed
## first, because iterating the raw string yields single characters instead of names
def _author_names(raw_cell):
    """Return the author names held in one raw 'authors' cell as a list."""
    if isinstance(raw_cell, (list, tuple)):
        return list(raw_cell)  # already parsed
    try:
        parsed = ast.literal_eval(raw_cell)
    except (ValueError, SyntaxError, TypeError):
        return [raw_cell]  # not a Python literal: keep the whole cell as one name
    return list(parsed) if isinstance(parsed, (list, tuple)) else [raw_cell]

all_authors = [author for raw_cell in book_df['authors'] for author in _author_names(raw_cell)]
author_count = len(set(all_authors))

In [None]:
## counting unique genres.
## every 'genre' cell read from the csv is a string such as "['Fiction']"; it must be parsed
## first, because iterating the raw string yields single characters instead of genre names
def _genre_names(raw_cell):
    """Return the genre names held in one raw 'genre' cell as a list."""
    if isinstance(raw_cell, (list, tuple)):
        return list(raw_cell)  # already parsed
    try:
        parsed = ast.literal_eval(raw_cell)
    except (ValueError, SyntaxError, TypeError):
        return [raw_cell]  # not a Python literal: keep the whole cell as one genre
    return list(parsed) if isinstance(parsed, (list, tuple)) else [raw_cell]

all_genres = [genre for raw_cell in book_df['genre'] for genre in _genre_names(raw_cell)]
genre_count = len(set(all_genres))

In [None]:

## bar chart (log-scaled y axis) of the number of distinct users, books, authors and genres
plot_data = pd.DataFrame({
    'Category': ['Users','Books', 'Authors', 'Genres'],
    'Unique Count': [user_count, book_count, author_count, genre_count]
})

plt.figure(figsize=(10, 4))
ax = sns.barplot(data=plot_data, x='Category', y='Unique Count', palette='viridis')
ax.set_yscale('log')
ax.set_title('Total Unique Users, Books, Authors, and Genres')
ax.set_xlabel('Category')
ax.set_ylabel('Unique Count')
## write the exact count at the top of each bar; bar i sits at x position i
for bar_position, unique_count in enumerate(plot_data['Unique Count']):
    ax.text(bar_position, unique_count, f'{unique_count:,}', color='black', ha="center")
plt.show()

In [None]:
user_df.head()

In [None]:
# Preprocessing functions 
def clean_text(text):
    """Return ``text`` lowercased; values that are not strings are returned unchanged.

    Parameters
    ----------
    text : str or any
        One cell of a text column.

    Returns
    -------
    The lowercased string, or ``text`` itself when it is not a ``str``
    (for example ``None`` or a float NaN from a missing cell).
    """
    # Missing cells have no .lower(); pass them through instead of raising AttributeError
    # so the function can be applied to a column that still contains gaps.
    if not isinstance(text, str):
        return text
    # Further cleaning steps (stopword removal, etc.) can be added here.
    return text.lower()


In [None]:
# Run clean_text over both free-text review columns
for text_column in ("review", "review_summary"):
    user_df[text_column] = user_df[text_column].apply(clean_text)

In [None]:
user_df.head()

In [None]:
data = user_df.merge(book_df, on="title")

In [None]:
# Run clean_text over the text columns that came from book_df
for text_column in ("publisher", "authors", "genre"):
    data[text_column] = data[text_column].apply(clean_text)

In [None]:
data.head()

In [None]:
## saving the merged and cleaned review/book table to the kaggle working directory
## NOTE(review): `index` is left at its default, so the row index is also written as an
## unnamed first column - confirm this is wanted
data.to_csv('/kaggle/working/clean_data_amazon_book_review.csv')