## Introduction to a project

### Build a model to provide recommendations to the user based on the views of other users who share the same preferences.

### Show 5 recommendations for people who watched the movie Moana.



In [9]:
#install libraries that are not installed by default
!pip install pyxlsb



In [10]:
# import required libraries
import pandas as pd
import pyxlsb

The dataset consists of details about each customer and the movies and/or TV shows they watched, along with the genre of each program.

In [13]:
# Read the Excel workbook into a DataFrame, using the first spreadsheet
# column as the row index.
data = pd.read_excel(
    io='/content/stc TV Data Set_T3 (1).xlsx',
    index_col=0,
)

In [14]:
# preview the first rows of the dataset (head() returns 5 rows by default)
data.head()

Unnamed: 0,user_id_maped,program_name,rating,date_,program_genre
0,26138,100 treets,1,2017-05-27,Drama
1,7946,Moana,1,2017-05-21,Animation
2,7418,The Mermaid Princess,1,2017-08-10,Animation
3,19307,The Mermaid Princess,2,2017-07-26,Animation
4,15860,Churchill,2,2017-07-07,Biography


In [15]:
# show the dataset dimensions as a (rows, columns) tuple
data.shape

(1048575, 5)

In [17]:
# print each column's name, non-null count and dtype, plus total memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   user_id_maped  1048575 non-null  int64         
 1   program_name   1048575 non-null  object        
 2   rating         1048575 non-null  int64         
 3   date_          1048575 non-null  datetime64[ns]
 4   program_genre  1048575 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 48.0+ MB


In [18]:
# summary statistics (count, mean, min, quartiles, max, std) for the numeric
# columns; the datetime column date_ is summarized as well
data.describe()

Unnamed: 0,user_id_maped,rating,date_
count,1048575.0,1048575.0,1048575
mean,17092.66,2.497283,2017-10-04 00:23:20.346183936
min,1.0,1.0,2017-03-14 00:00:00
25%,8253.0,1.0,2017-06-10 00:00:00
50%,17149.0,2.0,2017-10-14 00:00:00
75%,25665.0,3.0,2018-01-21 00:00:00
max,34280.0,4.0,2018-04-30 00:00:00
std,10035.13,1.119837,


In [20]:
# Flag, per column, whether at least one value is missing.
data.isna().any()

Unnamed: 0,0
user_id_maped,False
program_name,False
rating,False
date_,False
program_genre,False


In [21]:
# import visualization libraries
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go


In [36]:
# Create the program-by-user pivot table: one row per program, one column per
# user, each cell holding that user's mean rating for the program
# (0 where the user has no rating for it).
data_feature = data.pivot_table(
    values='rating',
    index='program_name',
    columns='user_id_maped',
    aggfunc='mean',
    fill_value=0,
)
data_feature.head()

user_id_maped,1,5,9,11,15,17,20,26,28,30,...,34259,34261,34263,34265,34267,34269,34271,34273,34277,34280
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#FollowFriday,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Days in a Madhouse,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100 treets,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102 Dalmatians,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from scipy.sparse import csr_matrix  # Used to save space and speed up the process.
from sklearn.neighbors import NearestNeighbors  # Used to find items/movies that are closest to each other (similarity)


In [38]:
# Convert the pivot table values to a compressed sparse row matrix.
movie_features = csr_matrix(data_feature.values)

# Build the KNN model: brute-force search with cosine distance between rows.
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(movie_features)

In [42]:
# Turn the program_name index into a regular column, then list the first
# ten program names alongside their positional row numbers.
movie_selection = data_feature.reset_index()
movie_selection.head(10)[['program_name']]

user_id_maped,program_name
0,#FollowFriday
1,10 Days in a Madhouse
2,100 treets
3,101 Dalmatians
4,102 Dalmatians
5,11.6
6,12 Monkeys Arms Of Mine
7,12 Monkeys Atari
8,12 Monkeys Blood Washed Away
9,12 Monkeys Bodies Of Water


In [47]:
# Find the nearest 5 movies to "Moana".
# Six neighbours are requested because the queried row is itself part of the
# fitted data, so it is expected to come back among its own neighbours.
movie_index = data_feature.index.get_loc("Moana")
distances, indices = model.kneighbors(
    X=movie_features[movie_index],
    n_neighbors=6,
)

In [51]:
# show the result
# Exclude the queried program by its row position rather than assuming it is
# always the first neighbour: programs with identical rating vectors tie at
# distance 0, and the order among tied neighbours is not guaranteed.
neighbour_rows = indices.flatten()
neighbour_distances = distances.flatten()
recommendations = [
    (row, dist)
    for row, dist in zip(neighbour_rows, neighbour_distances)
    if row != movie_index
][:len(neighbour_rows) - 1]  # keep the original count (neighbours requested minus one)

for rank, (row, dist) in enumerate(recommendations, start=1):
    print(f"{rank}: {data_feature.index[row]} "
          f"(distances= {dist:.2f})")

1: Trolls (distances= 0.43)
2: Surf's Up : WaveMania (distances= 0.47)
3: The Mermaid Princess (distances= 0.51)
4: The Boss Baby (distances= 0.55)
5: The Jetsons & WWE: Robo-WrestleMania! (distances= 0.56)
