<div>
 <img src="https://media4.giphy.com/media/l0DEKnWCEnW5FeyJi/giphy.gif?cid=ecf05e4746d3984002acd52b8f247fcf58cca1a853f0f0fe&rid=giphy.gif" >

<h1><center>📚Book-Crossing Analysis📚</center></h1>
<br>
This dataset contains 278,858 users (anonymized but with demographic information) providing 1,149,780 ratings (explicit / implicit) about 271,379 books. 
<br>
 <br>
</div>

> 📖**Dataset description**:
* `BX-Books.csv` has 8 columns which include information about books. 
 <ul style="list-style-type:circle;">
    <li>We can uniquely identify each book with the help of its ISBN (International Standard Book Number). </li>
    <li>The title, author, publisher and year of publication of each book have been listed. </li>
    <li>The last three columns include the URLs for different sizes of images.</li>
    </ul>
* `BX-Users.csv` lists the age and location of a user along with their User-ID.
* `BX-Book-Ratings.csv` contains book ratings given by users. The User-ID and ISBN have also been provided.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div class="alert alert-warning" role="alert">
  <h3><strong>Imports</strong></h3>
</div>

In [None]:
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
import requests

from PIL import Image as im
from wordcloud import WordCloud,STOPWORDS
from IPython.core.display import Image
from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL

In [None]:
# Hex colours shared by every chart in this notebook.
custom_colors = ['#48bfe3', '#56cfe1', '#64dfdf', '#72efdd', '#80ffdb']
customPalette = sns.color_palette(custom_colors)

# Preview the palette as a strip of swatches, reusing the palette object built above.
sns.palplot(customPalette, size=1)

# "poster" context scales up fonts and line widths for all later plots.
sns.set_context("poster")

<div class="alert alert-warning" role="alert">
  <h3><strong>Reading the 3 csv files</strong></h3>
</div>

In [None]:
# All three files live in the same folder and share the same parsing options.
data_dir = '../input/bookcrossing-dataset/Book reviews/'
read_opts = dict(sep=';', encoding='latin-1', low_memory=False)

# Column names are supplied explicitly via `names=`; no `header=` is given, so the
# first line of each file is read as an ordinary data row.

# Users
u_cols = ['user_id', 'location', 'age']
users = pd.read_csv(data_dir + 'BX-Users.csv', names=u_cols, **read_opts)

# Books
i_cols = ['isbn', 'book_title', 'book_author', 'year_of_publication',
          'publisher', 'img_s', 'img_m', 'img_l']
items = pd.read_csv(data_dir + 'BX_Books.csv', names=i_cols, **read_opts)

# Ratings
r_cols = ['user_id', 'isbn', 'rating']
ratings = pd.read_csv(data_dir + 'BX-Book-Ratings.csv', names=r_cols, **read_opts)

<div class="alert alert-warning" role="alert">
  <h3><strong>What does the data look like?</strong></h3>
</div>

In [None]:
users.head(5)

In [None]:
users.describe()

In [None]:
print(f"{y_}{users.dtypes}\n") 

In [None]:
items.head(5)

In [None]:
items.describe()

In [None]:
print(f"{y_}{items.dtypes}\n") 

In [None]:
ratings.head(5)

In [None]:
ratings.describe()

In [None]:
print(f"{y_}{ratings.dtypes}\n") 

In [None]:
def _drop_header_row(frame, numeric_col):
    """Return ``frame`` without its first row if that row is a leftover CSV header.

    The files were read with explicit ``names=`` and no ``header=``, so the
    header line of each file arrives as the first data row. That row is
    recognised by the value in ``numeric_col`` not being parseable as a number.

    Dropping only when the first row really is a header makes this cell safe
    to re-run: an unconditional ``drop(index[0])`` would silently remove one
    more genuine record on every execution.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose first row may be the header line.
    numeric_col : str
        Column that holds numeric values in every genuine record.

    Returns
    -------
    pandas.DataFrame
        ``frame`` itself when nothing needs dropping, otherwise a copy
        without the first row.
    """
    if frame.empty:
        return frame
    first_value = pd.to_numeric(frame[numeric_col].iloc[:1], errors='coerce')
    if first_value.isna().all():
        return frame.drop(frame.index[0])
    return frame


users = _drop_header_row(users, 'user_id')
items = _drop_header_row(items, 'year_of_publication')
ratings = _drop_header_row(ratings, 'user_id')

<div class="alert alert-warning" role="alert">
  <h3><strong>Changing datatypes and replacing nan values</strong></h3>
</div>

In [None]:
# Every column was read as text (the header line was part of the data), so the
# numeric columns are cast explicitly, one frame at a time.
users = users.astype({'age': float, 'user_id': int})
ratings = ratings.astype({'user_id': int, 'rating': int})
items = items.astype({'year_of_publication': int})

In [None]:
users.isnull().sum()

In [None]:
users['age'].describe()

In [None]:
# Keep ages in the inclusive range 5..99; everything else becomes missing and is
# then replaced by the mean of the remaining ages.
age = users['age']
plausible_age = age.where(age.between(5, 99))
users['age'] = plausible_age.fillna(plausible_age.mean())

In [None]:
ratings.isnull().sum()

In [None]:
items.isnull().sum()

In [None]:
items.loc[items.publisher.isnull(),:]

In [None]:
# Fill in the publisher by hand for two specific ISBNs.
# NOTE(review): presumably these are the rows surfaced by the null-publisher
# listing in the previous cell — confirm against its output.
items.loc[items.isbn=='193169656X','publisher']='Mundania Press LLC'
items.loc[items.isbn=='1931696993','publisher']='Novelbooks Incorporated'

In [None]:
items.loc[items.book_author.isnull(),:]

In [None]:
# Fill in the author by hand for one specific ISBN.
# NOTE(review): presumably the row surfaced by the null-author listing in the
# previous cell — confirm against its output.
items.loc[items.isbn=='9627982032','book_author']='Larissa Anne Downe'

In [None]:
print(sorted(items['year_of_publication'].unique()))

In [None]:
# Treat year 0 and any year after 2008 as missing, then impute the rounded mean
# of the remaining years.
# NOTE(review): 2008 is the cut-off used by the original analysis — confirm it is
# the intended upper bound for this dataset.
year = items['year_of_publication']
implausible_year = (year == 0) | (year > 2008)

# Series.where builds a new float column holding NaN for the masked rows.
# Writing np.nan into the integer column through .loc relies on silent
# upcasting, which recent pandas versions deprecate.
year = year.where(~implausible_year)
items['year_of_publication'] = year.fillna(round(year.mean()))

<div class="alert alert-warning" role="alert">
  <h3><strong>Merging the dataframes using User-ID and ISBN columns</strong></h3>
</div>

In [None]:
# Inner joins (the default): users -> ratings on user_id, then -> books on isbn.
# Ratings whose user or book is absent from the other tables are dropped.
df = users.merge(ratings, on='user_id').merge(items, on='isbn')
df.head(5)

In [None]:
df.shape

<div class="alert alert-warning" role="alert">
  <h3><strong>Splitting location column into newer columns</strong></h3>
</div>

In [None]:
# Split "city, state, country" on the first two ", " separators only (n=2), so
# any further separators stay inside the third part.
location = (
    df['location']
    .str.split(', ', n=2, expand=True)
    .set_axis(['city', 'state', 'country'], axis=1)
)

# Attach the three parts to the merged frame as new columns.
df = df.assign(
    city=location['city'],
    state=location['state'],
    country=location['country'],
)

<div class="alert alert-warning" role="alert">
  <h3><strong>Fetching images of different sizes from the URLs</strong></h3>
</div>

In [None]:
def images(col, i):
    """Return an IPython ``Image`` for the cover URL stored in ``df``.

    Parameters
    ----------
    col : str
        Name of the URL column in ``df`` ('img_s', 'img_m' or 'img_l').
    i : int
        Row label in ``df`` of the book whose cover should be shown.

    Returns
    -------
    IPython display ``Image``
        Built directly from the URL string, exactly as before.

    Notes
    -----
    Reads the notebook-level ``df``; the image columns are dropped further
    down, so this must be called before that cell runs.
    The earlier ``requests.get(url)`` call was removed: its response was never
    used, so it only added a second network round trip (with no timeout) that
    could hang or raise.
    """
    # Single .loc lookup instead of chained df[col][i] indexing.
    url = df.loc[i, col]
    return Image(url)

In [None]:
images('img_s',0)

In [None]:
images('img_m',0)

In [None]:
images('img_l',0)

<div class="alert alert-warning" role="alert">
  <h3><strong>Dropping columns we won't be needing</strong></h3>
</div>

In [None]:
# Remove the raw location string and the three image-URL columns.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['location', 'img_s', 'img_m', 'img_l'], errors='ignore')

In [None]:
df.dtypes

<div class="alert alert-warning" role="alert">
  <h3><strong>Pandas Profiling</strong></h3>
</div>

In [None]:
# profile = pandas_profiling.ProfileReport(df)
# profile

<div class="alert alert-warning" role="alert">
  <h3><strong>Rating Distribution</strong></h3>
</div>

> 📌Note: This dataset contains both **explicit** ratings, on a 1–10 scale and **implicit** actions of unspecified nature. 

> 0 values indicate all interactions without rating values.

In [None]:
# Count of every rating value, including 0 (the implicit interactions).
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='rating', data=df, palette=customPalette, ax=ax)
ax.set_title('Rating Distribution', size=20)
plt.show()

In [None]:
df['rating'].value_counts()

> Analysing the rating distribution and the value counts of the `rating` column in the dataframe, we notice that the number of implicit ratings (rating 0) is significantly high!

<div class="alert alert-warning" role="alert">
  <h3><strong>Explicit Rating Distribution</strong></h3>
</div>

In [None]:
# Keep only explicit ratings (1-10); 0 marks an implicit interaction.
# One .loc selection replaces the earlier copy-then-filter, and the stray
# `df_v.dtypes` statement (which displayed nothing mid-cell) is gone.
df_v = df.loc[df['rating'] != 0, ['rating']]

plt.figure(figsize=(10,8))
sns.countplot(x='rating',data=df_v,palette=customPalette)
plt.title('Explicit Rating Distribution',size=20)
plt.show()

Users generally give higher ratings to books, as per the above distribution.

<div class="alert alert-warning" role="alert">
  <h3><strong>Distribution of age of users</strong></h3>
</div>

In [None]:
# Histogram of user ages in the merged frame (one entry per rating, not per user).
# sns.distplot is deprecated in seaborn; histplot is its replacement for
# histograms.
# NOTE(review): bins=50 is meant to mirror the upper limit distplot applied to
# its automatic bin count — confirm the figure looks as intended.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(df['age'], bins=50, kde=False, ax=ax)
ax.set_xlabel('Age')
ax.set_ylabel('count')
ax.set_title('Age Distribution', size=20)
plt.show()

<div class="alert alert-warning" role="alert">
  <h3><strong>Top 25 Years of Publication</strong></h3>
</div>

In [None]:
# Count rated books per publication year and keep the 25 most frequent years.
# Years are cast to int and then to str so they become categorical labels.
year_counts = (
    df['year_of_publication']
    .astype(int)
    .astype(str)
    .value_counts()
    .head(25)
)
df_v = pd.DataFrame({
    'year': 'Year ' + year_counts.index,
    'count': year_counts.to_numpy(),
})

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='count', y='year', data=df_v, palette=customPalette, ax=ax)
ax.set_ylabel('Year Of Publication')
ax.tick_params(axis='y', labelsize=12)
ax.set_title('Years of Publication', size=20)
plt.show()

In [None]:
def barplot(df, col, l, top_n=25):
    """Draw a horizontal bar chart of the most frequent values in ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col : str
        Column whose values are counted.
    l : str
        Text used both as the y-axis label and as the chart title.
    top_n : int, default 25
        Number of most frequent values to show. The default reproduces the
        previously hard-coded top 25.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    df_v = df[col].value_counts().head(top_n).reset_index()
    # Name the columns explicitly; what reset_index calls them varies between
    # pandas versions.
    df_v.columns = [col, 'count']

    fig, ax = plt.subplots(figsize=(10, 12))
    sns.barplot(x='count', y=col, data=df_v, palette=customPalette, ax=ax)
    ax.set_ylabel(l)
    ax.set_title(l, size=20)
    plt.show()

<div class="alert alert-warning" role="alert">
  <h3><strong>Top 25 Books</strong></h3>
</div>

In [None]:
barplot(df,'book_title','Book Title')

<div class="alert alert-warning" role="alert">
  <h3><strong>Top 25 Authors</strong></h3>
</div>

In [None]:
barplot(df,'book_author','Book Author')

<div class="alert alert-warning" role="alert">
  <h3><strong>Top 25 Publishers</strong></h3>
</div>

In [None]:
barplot(df,'publisher','Book publisher')

<div class="alert alert-warning" role="alert">
  <h3><strong>Wordcloud of book titles</strong></h3>
</div>

In [None]:
def color_func(word=None, font_size=None, position=None,  orientation=None, font_path=None, random_state=None):
    """Return an HSL colour string with fixed hue/saturation and random lightness.

    The signature matches the callback passed to ``WordCloud(color_func=...)``
    in this notebook; only ``random_state`` is used.

    Parameters
    ----------
    word, font_size, position, orientation, font_path
        Accepted for the callback signature and ignored.
    random_state : object with ``randint(a, b)``, optional
        Source of randomness for the lightness. When omitted, a fresh
        ``random.Random()`` is used; previously the ``None`` default raised
        ``AttributeError``.

    Returns
    -------
    str
        ``"hsl(188, 100%, L%)"`` where ``L`` is derived from a random integer
        drawn from 60..160 and rescaled from a 0-255 range to a percentage.
    """
    if random_state is None:
        import random
        random_state = random.Random()

    h = 188
    s = int(100.0 * 255 / 255.0)
    l = int(100.0 * float(random_state.randint(60, 160)) / 255.0)

    return "hsl({}, {}%, {}%)".format(h, s, l)

In [None]:
# Build one long string from every book title in the merged frame, so titles
# that were rated more often contribute more words.
all_titles = ' '.join(df['book_title'])

fig, ax = plt.subplots(figsize=(10, 10))
wc = WordCloud(
    background_color="white",
    max_words=100,
    stopwords=STOPWORDS,
    max_font_size=256,
    random_state=42,  # fixed seed keeps the layout reproducible
    width=500,
    height=500,
    color_func=color_func,
)
wc.generate(all_titles)

ax.imshow(wc, interpolation="bilinear")
ax.axis('off')
plt.show()