<a href="https://colab.research.google.com/github/SanketJ29/SanketJ29/blob/main/Recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /gdrive (Colab-only); the CSV files read later live under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')
# IPython magic: make the mount point the working directory.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
import re
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Folder holding the three Book-Crossing CSV dumps (Drive is mounted at /gdrive).
DATA_DIR = "/gdrive/MyDrive"

# Shared parser settings. `on_bad_lines='skip'` silently drops malformed rows; it replaces the
# `error_bad_lines=False, warn_bad_lines=False` pair, which was deprecated in pandas 1.3 and
# removed in pandas 2.0.
CSV_OPTIONS = dict(delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')

books = pd.read_csv(f"{DATA_DIR}/BX-Books.csv", **CSV_OPTIONS)
users = pd.read_csv(f"{DATA_DIR}/BX-Users.csv", **CSV_OPTIONS)
ratings = pd.read_csv(f"{DATA_DIR}/BX-Book-Ratings.csv", **CSV_OPTIONS)

print("Books Data:    ", books.shape)
print("Users Data:    ", users.shape)
print("Books-ratings: ", ratings.shape)


In [None]:
## Books Dataset Preprocessing
# List the column names, then preview the first rows.
print("Columns: ", books.columns.tolist())
books.head()

In [None]:
## Drop URL columns
# Reassign rather than drop in place, and tolerate already-missing columns, so that
# re-running this cell does not raise KeyError.
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')
books.head()

In [None]:
## Checking for null values
# Number of missing entries in each column.
books.isna().sum()

In [None]:
# Rows whose author is missing.
books[books['Book-Author'].isna()]

In [None]:
# Rows whose publisher is missing.
books[books['Publisher'].isna()]

In [None]:
# Replace every missing author/publisher with the placeholder 'Other'.
# Filling by column instead of by hard-coded row label keeps this correct even if the
# row labels shift (e.g. when a different set of malformed lines is skipped on load).
books[['Book-Author', 'Publisher']] = books[['Book-Author', 'Publisher']].fillna('Other')

In [None]:
## Checking for column Year-of-publication
# Distinct raw values; any non-year entries will show up here.
pd.unique(books['Year-Of-Publication'])

In [None]:
# Show full cell contents without truncation. `None` is the supported "no limit" value;
# the former `-1` is deprecated and rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

In [None]:
# Rows where the year column holds the text 'DK Publishing Inc'.
books[books['Year-Of-Publication'].eq('DK Publishing Inc')]

In [None]:
# Rows where the year column holds the text 'Gallimard'.
books[books['Year-Of-Publication'].eq('Gallimard')]

In [None]:
# Manual corrections for rows whose 'Year-Of-Publication' held a publisher name.
# Each row label maps to the complete set of corrected fields, so every value is written
# to its own row. Previously the title/author for 221678 and 220731 were written to
# 209538 by mistake, overwriting its values and leaving the other two rows unfixed.
# NOTE(review): row labels are presumably those shown by the two lookups above — confirm.
corrections = {
    209538: {
        'Publisher': 'DK Publishing Inc',
        'Year-Of-Publication': 2000,
        'Book-Title': 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)',
        'Book-Author': 'Michael Teitelbaum',
    },
    221678: {
        'Publisher': 'DK Publishing Inc',
        'Year-Of-Publication': 2000,
        'Book-Title': 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)',
        'Book-Author': 'James Buckley',
    },
    220731: {
        'Publisher': 'Gallimard',
        'Year-Of-Publication': 2003,
        'Book-Title': 'Peuple du ciel - Suivi de Les bergers ',
        'Book-Author': 'Jean-Marie Gustave Le ClÃ?Â©zio',
    },
}

for row_label, fields in corrections.items():
    for column, value in fields.items():
        books.at[row_label, column] = value

In [None]:
## Converting year of publication in Numbers
# Cast the year column to integers; all other columns keep their dtypes.
books = books.astype({'Year-Of-Publication': int})

In [None]:
# Summarise column dtypes and non-null counts of the books table.
books.info()

In [None]:
# Preview the first rows of the books table.
books.head()

In [None]:
## Users Dataset Preprocessing
# List the column names, then preview the first rows.
print("Columns: ", users.columns.tolist())
users.head()

In [None]:
## Checking null values
# Number of missing entries in each users column.
print(users.isnull().sum())

In [None]:
## Check for all values present in Age column
# `sorted` accepts the array of distinct ages directly.
print(sorted(users['Age'].unique()))

In [None]:
# Keep only users whose age lies in the inclusive range 10..80 (missing ages are excluded).
required = users[users['Age'].between(10, 80)]

In [None]:
# Mean age over the rows kept in `required`, rounded to a whole number.
mean = round(required['Age'].mean())   
mean

In [None]:
# Ages above 80, ages below 10 and missing ages are all substituted with the mean;
# the column is then stored as int.
users['Age'] = users['Age'].where(users['Age'].between(10, 80), mean).astype(int)

In [None]:
# Summarise column dtypes and non-null counts of the users table.
users.info()

In [None]:
# Preview the first rows of the users table.
users.head()

In [None]:
## Books rating dataset Preprocessing
# List the column names, then preview the first rows.
print("Columns: ", ratings.columns.tolist())
ratings.head()

In [None]:
## Checking for null values
# Number of missing entries in each ratings column.
ratings.isna().sum()

In [None]:
## checking all ratings number or not
# True when the rating column has a numeric dtype.
print(pd.api.types.is_numeric_dtype(ratings['Book-Rating']))

In [None]:
# Summarise column dtypes and non-null counts of the ratings table.
ratings.info()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
## Merging all three tables
# Inner joins: books -> ratings on ISBN, then -> users on User-ID.
dataset = (
    books
    .merge(ratings, on='ISBN', how='inner')
    .merge(users, on='User-ID', how='inner')
)
dataset.info()

In [None]:
## Explicit Ratings Dataset
# Rows with a non-zero rating, re-indexed from 0.
dataset1 = dataset.loc[dataset['Book-Rating'].ne(0)].reset_index(drop=True)
dataset1.shape

In [None]:
## Implicit Ratings Dataset
# Rows whose rating is exactly 0, re-indexed from 0.
dataset2 = dataset.loc[dataset['Book-Rating'].eq(0)].reset_index(drop=True)
dataset2.shape

In [None]:
# Preview the first rows of the explicit-ratings table.
dataset1.head()

In [None]:
## Representation of Top book authors
# Horizontal bar chart of the 15 authors with the most rows in `books`.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(
    data=books,
    y="Book-Author",
    order=books['Book-Author'].value_counts().head(15).index,
    ax=ax,
)
ax.set_title("No of books by an author (Top 15)")

In [None]:
## Representation of Top rated books
# Horizontal bar chart of the 15 titles with the most rows in the merged dataset.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(
    data=dataset,
    y="Book-Title",
    order=dataset['Book-Title'].value_counts().head(15).index,
    ax=ax,
)
ax.set_title("Number of Ratings for a book (Top 15)")

In [None]:
## RECOMMENDATION SYSTEM
# Title to look up; it is compared for exact equality against 'Book-Title' further down.
bookName = input("Enter a book name: ")
# How many titles to list per category; int() raises ValueError on non-numeric input.
number = int(input("Enter number of books to recommend: "))

In [None]:
def printBook(k, n):
    """Print up to ``n`` distinct book titles from ``k``, one per line.

    Titles are printed in order of first appearance in ``k['Book-Title']``.

    Parameters
    ----------
    k : DataFrame
        Must contain a 'Book-Title' column.
    n : int
        Maximum number of titles to print. Values <= 0 print nothing
        (previously one title was still printed in that case).
    """
    if n <= 0:
        return
    for title in k['Book-Title'].unique()[:n]:
        print(title)

In [None]:
def get_books(dataframe, name, n):
    """Print other books by the same author, then by the same publisher.

    Candidates come from the notebook-level ``dataset1`` frame, excluding every row
    whose title equals ``name``. Within each section the highest-rated rows come
    first, and at most ``n`` distinct titles are printed via ``printBook``.

    Parameters
    ----------
    dataframe : DataFrame
        Rows of the queried book; the first distinct 'Book-Author' / 'Publisher'
        value is used as the reference.
    name : str
        Title of the queried book (excluded from the suggestions).
    n : int
        Maximum number of titles to print per section.
    """
    # Candidate pool: everything except the queried title itself.
    data = dataset1[dataset1['Book-Title'] != name]

    sections = (
        ("\nBooks by same Author:\n", 'Book-Author'),
        ("\n\nBooks by same Publisher:\n", 'Publisher'),
    )
    for heading, column in sections:
        print(heading)

        reference_values = dataframe[column].unique()
        if len(reference_values) == 0:
            # Nothing to compare against (empty input frame).
            continue

        # A plain filter yields an empty frame when nothing matches, so the section is
        # simply left empty. The old code hit an unbound `k2` for authors and re-printed
        # the author's books under the publisher heading.
        matches = data[data[column] == reference_values[0]]

        # Highest ratings first: these are recommendations, so the best-rated come on top.
        matches = matches.sort_values(by=['Book-Rating'], ascending=False)
        printBook(matches, n)

In [None]:
# Only recommend when the exact title occurs among the explicitly rated books.
if not (dataset1['Book-Title'] == bookName).any():
    print("Book Not Found!!")
else:
    d = dataset1.loc[dataset1['Book-Title'] == bookName]
    get_books(d, bookName, number)

In [None]:
## Books popular yearly
# Count explicit ratings per ISBN, most-rated first, then attach the book details.
data = pd.DataFrame(dataset1.groupby('ISBN')['Book-Rating'].count()).sort_values('Book-Rating', ascending=False)
data = pd.merge(data, books, on='ISBN', left_index = False)

# Keep only the first row for each publication year. Rows are still ordered by rating
# count (descending), so the survivor is the most-rated book of that year. This replaces
# a manual iterrows() loop that collected the labels of later duplicates and dropped them.
data = data.drop_duplicates(subset='Year-Of-Publication', keep='first')
data = data.drop('Book-Rating', axis = 1)
data = data.sort_values('Year-Of-Publication')

# Lift the display limits so the whole per-year table is rendered.
pd.set_option("display.max_rows", None, "display.max_columns", None)
data


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
199,0099771519,Memoirs of a Geisha Uk,Arthur Golden,0,Trafalgar Square
50958,964442011X,Tasht-i khun,IsmaÂ°il Fasih,1376,Nashr-i Alburz
51049,9643112136,Dalan-i bihisht (Dastan-i Irani),Nazi Safavi,1378,Intisharat-i Quqnus
130403,0781228956,"Complete Works 10 Volumes [2,6,7,8,9] (Notable American Authors)",Benjamin Franklin,1806,Reprint Services Corp
92641,1551103982,The Cycling Adventures of Coconut Head: A North American Odyssey,Ted Schredd,1900,Graphic Arts Center Pub Co
122731,0671825356,W D HSE PLANTS,Jd Hersey,1901,Simon &amp; Schuster
45038,0373226888,Tommy's Mom,Linda O. Johnston,1902,Harlequin
104397,038528120X,CATCH 22,JOSEPH HELLER,1904,Delta
103215,0404089119,Charlotte Bronte and Her Sisters,Clement K. Shorter,1906,Ams Pr
25641,0911662251,Kybalion: A Study of the Hermetic Philosophy of Ancient Egypt and Greece,Three Initiates,1908,Yoga Publication Society
