# Objective


Use machine learning on the ratings data to recommend the best TV shows and movies to users, based on their own rating history and viewing behaviour patterns.

In [1]:
# Import all the libraries and modules we need

In [2]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
sns.set_style("darkgrid")

In [3]:
#Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data

# !pip install scikit-surprise

In [3]:
# Load the first ratings file. Only the first two columns are read (named
# Customer_Id and Rating); the remaining column(s), i.e. the date, are skipped.
# Rating is cast to float as part of the same expression.
df1 = pd.read_csv(
    '../Resources/combined_data_1.txt',
    header=None,
    names=['Customer_Id', 'Rating'],
    usecols=[0, 1],
).astype({'Rating': float})

# Report the frame size and print every 2,000,000th row as a quick sample.
print(f'Dataset 1 shape: {df1.shape}')
print('-Dataset examples-')
print(df1.iloc[::2000000])

Dataset 1 shape: (24058263, 2)
-Dataset examples-
         Customer_Id  Rating
0                 1:     NaN
2000000      1910904     5.0
4000000       496631     3.0
6000000      1091776     4.0
8000000      1177065     2.0
10000000     2271935     2.0
12000000      700005     2.0
14000000     1947241     5.0
16000000      956220     4.0
18000000      460528     3.0
20000000     1933327     3.0
22000000     1878057     3.0
24000000      517002     4.0


In [4]:
# Load the remaining three ratings files. Each read keeps only the first two
# columns (Customer_Id, Rating) and casts Rating to float in the same expression.
df2 = pd.read_csv(
    '../Resources/combined_data_2.txt',
    header=None,
    names=['Customer_Id', 'Rating'],
    usecols=[0, 1],
).astype({'Rating': float})
df3 = pd.read_csv(
    '../Resources/combined_data_3.txt',
    header=None,
    names=['Customer_Id', 'Rating'],
    usecols=[0, 1],
).astype({'Rating': float})
df4 = pd.read_csv(
    '../Resources/combined_data_4.txt',
    header=None,
    names=['Customer_Id', 'Rating'],
    usecols=[0, 1],
).astype({'Rating': float})

# Report the size of each loaded frame.
print(f'Dataset 2 shape: {df2.shape}')
print(f'Dataset 3 shape: {df3.shape}')
print(f'Dataset 4 shape: {df4.shape}')

Dataset 2 shape: (26982302, 2)
Dataset 3 shape: (22605786, 2)
Dataset 4 shape: (26851926, 2)


In [5]:
# Stack the four partial rating frames row-wise into a single frame.
# ignore_index=True relabels the rows 0..n-1 during the concat itself, so the
# index does not have to be rebuilt afterwards from a separately allocated
# integer array of the same length as the frame.
df_full = pd.concat([df1, df2, df3, df4], axis=0, ignore_index=True)

# Report the combined size and print every 10,000,000th row as a quick sample.
print(f'Full dataset shape: {df_full.shape}')
print('-Dataset examples-')
print(df_full.iloc[::10000000, :])

Full dataset shape: (100498277, 2)
-Dataset examples-
          Customer_Id  Rating
0                  1:     NaN
10000000      2271935     2.0
20000000      1933327     3.0
30000000       961023     4.0
40000000       854274     5.0
50000000       768483     3.0
60000000      1609324     2.0
70000000      1776418     4.0
80000000       932047     4.0
90000000       932191     4.0
100000000      872339     4.0


## Exploratory Data Analysis

In [None]:
# Plot the distribution of rating values in df1 as a horizontal bar chart,
# with summary counts in the title and a percentage label on each bar.
# NOTE(review): the statistics below are computed on df1 only, although the
# title says 'Total pool' and a combined df_full exists -- confirm which frame
# was intended.

# Number of rows per rating value (rows whose Rating is NaN are not grouped).
p = df1.groupby('Rating')['Rating'].agg(['count'])

# get movie count: rows with a missing Rating (e.g. the '1:' row shown in the
# sample output) are counted as movie headers. The column is selected by name
# rather than by integer position on the label-indexed result.
movie_count = df1['Rating'].isnull().sum()

# get customer count: distinct Customer_Id values minus the header rows,
# which also occupy the Customer_Id column.
cust_count = df1['Customer_Id'].nunique() - movie_count

# get rating count: non-null Customer_Id rows minus the header rows.
rating_count = df1['Customer_Id'].count() - movie_count

ax = p.plot(kind = 'barh', legend = False, figsize = (15,10))
ax.set_title('Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(movie_count, cust_count, rating_count), fontsize=20)

# Label each bar with its share of all ratings. Iterating over the rows that
# are actually present (instead of a fixed 1..5 range) keeps the label aligned
# with its bar even if some rating value is absent from the data.
rating_counts = p['count']
total_ratings = rating_counts.sum()  # loop-invariant, computed once
for bar_pos, (rating, count) in enumerate(rating_counts.items()):
    ax.text(count/4, bar_pos, 'Rating {}: {:.0f}%'.format(int(rating), count*100 / total_ratings), color = 'white', weight = 'bold')