# Problem statement.
# Build a recommender system by using cosine simillarties score.

In [25]:
import pandas as pd
import numpy as np

# Loading The data

In [26]:
book = pd.read_csv(r'C:\Users\mukes\OneDrive\Desktop\Data Science\Assignments\Recommendation System\book.csv', encoding ='latin1')
book.head()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6


In [27]:
book.shape

(10000, 4)

In [28]:
book.isnull().sum()

Unnamed: 0     0
User.ID        0
Book.Title     0
Book.Rating    0
dtype: int64

No null values found

In [29]:
# number of unique users in the dataset
book["User.ID"].nunique()

2182

In [30]:
# Unique titles
book["Book.Title"].nunique()

9659

In [31]:
# Dropping Duplicates
book_no_duplicates = book.drop_duplicates(subset=['User.ID', 'Book.Title'])
book_no_duplicates.shape

(9993, 4)

# Reshaping the DataFrame using Pivot()

In [32]:
user_book = book_no_duplicates.pivot(index='User.ID', columns='Book.Title', values='Book.Rating').reset_index(drop=True)
user_book

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,,,,,,,,,,,...,,,,,,,,,,
2178,,,,,,,,,,,...,,,,,,,,,,
2179,,,,,,,,,,,...,,,,,,,,7.0,,
2180,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# Impute those NaNs with 0 values
user_book.fillna(0, inplace=True)
user_book

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
2180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Calculating Cosine Similarity between Users

In [34]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [35]:
# Using pairwise_distances function to calculate the cosine similarity between users based on the book ratings in the user_book.
user_book = user_book.fillna(0)   # filling NAN withb 'o'
user_sim = 1 - pairwise_distances( user_book.values,metric='cosine')
user_sim

# pairwise_distances: This function calculates the pairwise distances or similarities between rows in the matrix.
#  Cosine similarity is calculated as 1 - Cosine Distance
# The resulting user_sim matrix will have cosine similarities between users

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [36]:
#Storing the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)
user_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,2179,2180,2181
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [37]:
# Setting the index and column names to user ids
user_sim_df.index = book['User.ID'].unique()
user_sim_df.columns = book['User.ID'].unique()

In [38]:
# Selecting only 5 rows and 5 columns
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,276726,276729,276736,276737,276744
276726,1.0,0.0,0.0,0.0,0.0
276729,0.0,1.0,0.0,0.0,0.0
276736,0.0,0.0,1.0,0.0,0.0
276737,0.0,0.0,0.0,1.0,0.0
276744,0.0,0.0,0.0,0.0,1.0


In [39]:
# filling the diagonal elements of the user_sim matrix with zeros. This is a common practice in similarity matrices to set the
# self-similarity to zero, as users are identical to themselves.
np.fill_diagonal(user_sim, 0)
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,276726,276729,276736,276737,276744
276726,0.0,0.0,0.0,0.0,0.0
276729,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,0.0


# Checking Similar Users

In [40]:
user_sim_df.idxmax(axis=1)[0:10]

276726    276726
276729    276726
276736    276726
276737    276726
276744    276726
276745    276726
276747    276726
276748    161677
276751    276726
276754    276726
dtype: int64

In [41]:
book[(book['User.ID']==276729) | (book['User.ID']==276726)]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6


In [42]:
book[(book['User.ID']==276737) | (book['User.ID']==276726)]

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
4,5,276737,The Mummies of Urumchi,6


In [43]:
user_1=book[book['User.ID']==276729]
user_1

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6


In [44]:
user_2=book[book['User.ID']==276726]
user_2

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5


# Recommendations

In [45]:
sorted_book_desc = book.sort_values(by='User.ID', ascending=False)

# Print the sorted DataFrame in descending order
print(sorted_book_desc)

      Unnamed: 0  User.ID                                         Book.Title  \
2398        2399   278854  Celtic Mythology (Library of the World's Myths...   
2397        2398   278854     A Treasury of Irish Myth, Legend, and Folklore   
2396        2397   278854                                Blast From the Past   
2395        2396   278854      La crÃ³nica del PerÃº (CrÃ³nicas de AmÃ©rica)   
2394        2395   278854                                      As valkÃ­rias   
...          ...      ...                                                ...   
2403        2404        8                Keepers of the Earth Teachers Guide   
2401        2402        8                                              Wings   
2405        2406        8                                  The Art Of Celtia   
2399        2400        8                            Ancient Celtic Romances   
2400        2401        8  The Western way: A practical guide to the West...   

      Book.Rating  
2398            8  

# 1. Highly Rated Books

In [46]:
high_rated_books = book[book['Book.Rating'] == 10]
high_rated_books

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
6,7,276745,What If?: The World's Foremost Military Histor...,10
16,17,276760,More Cunning Than Man: A Social History of Rat...,10
23,24,276772,Mary-Kate &amp; Ashley Switching Goals (Mary-K...,10
24,25,276772,Tell Me This Isn't Happening,10
34,35,276788,Sturmzeit. Roman.,10
...,...,...,...,...
9958,9959,162052,HOUSE OF MIRTH,10
9967,9968,162052,The Man Who Ate the 747,10
9968,9969,162052,Women of Mystery,10
9969,9970,162054,The Maggody Militia: An Arly Hanks Mystery (Ar...,10


Books rated as 10 can be recommended more and more to the existing and new users as well if any.

In [52]:
count_of_10_ratings = (book['Book.Rating'] == 10).sum()

# Print the count
print(f"The count of rating 10 is: {count_of_10_ratings}")

The count of rating 10 is: 1732


# 2. Most Popular Book

In [48]:
most_popular_book = book['Book.Title'].value_counts().idxmax()

print(f"The most popular book is: {most_popular_book}")

The most popular book is: Fahrenheit 451


# 3. Number of Books mostly rated by a User

In [49]:
common_user_Id_book = book['User.ID'].value_counts().idxmax()

print(f"The most common Id is: {common_user_Id_book}")

The most common Id is: 3757


In [50]:
user_Id=book[book['User.ID']==3757]
user_Id

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
6998,6999,3757,The Strange Case of Dr. Jekyll and Mr. Hyde an...,7
6999,7000,3757,Almost Vegetarian : A Primer for Cooks Who Are...,8
7000,7001,3757,The Adventures of Buckaroo Banzai : Across the...,6
7001,7002,3757,A General Theory of Love,7
7002,7003,3757,"The Best American Essays, 1989 (Best American ...",7
...,...,...,...,...
7516,7517,3757,What Happened When: A Chronology of Life and E...,9
7517,7518,3757,Catching Alice,7
7518,7519,3757,Ella Minnow Pea: A Novel in Letters,7
7519,7520,3757,Dave's Way: A New Approach to Old-Fashioned Su...,8


More books can be recommended to this User Id 3757 for promotion

# 4. Least Rated Books

In [59]:
least_rated_books = book[book['Book.Rating'] <= 4]
least_rated_books

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
1,2,276729,Clara Callan,3
19,20,276762,Beloved (Plume Contemporary Fiction),3
20,21,276762,Our Dumb Century: The Onion Presents 100 Years...,4
21,22,276768,New Vegetarian: Bold and Beautiful Recipes for...,4
115,116,276853,A Kiss Remembered,1
...,...,...,...,...
9942,9943,162052,The Concrete Enema: And Other News of the Weir...,2
9943,9944,162052,Empire Falls,4
9947,9948,162052,Cloudsplitter,4
9948,9949,162052,Leaving Home,1


Conclusion:
1. There are 1732 books rated as 10 which is 17% of the total books, so can hike up the prices of those books.
2. The most popular book is Fahrenheit 451, which can be recommended more to the users.
3. User Id 3757 seems to have rated most number of books. So, if any new books comes to the market, then can be recommend to him.
4. There are totally 489 books rated very low. So, can reduce the price of those books.