In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datetime import datetime
import os
import random
import matplotlib
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
#from sklearn.metrics.pariwise import cosine_similarity

In [None]:
# Merge the four raw rating files into one flat CSV with rows of
# "<movie>,<user>,<rating>,<date>". In the raw files a line ending in ':'
# carries the movie id that applies to every following rating line.
start = datetime.now()
if not os.path.isfile('data.csv'):
    files = ['../input/netflix-prize-data/combined_data_1.txt', '../input/netflix-prize-data/combined_data_2.txt',
            '../input/netflix-prize-data/combined_data_3.txt', '../input/netflix-prize-data/combined_data_4.txt']

    # Write under a temporary name and rename only after everything succeeded,
    # so an interrupted run never leaves a truncated 'data.csv' that the
    # isfile() guard above would silently accept on the next run.
    tmp_path = 'data.csv.tmp'
    with open(tmp_path, mode='w') as data:
        for file in files:
            print('reading ratings from {}...'.format(file))
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if line.endswith(':'):
                        # Header line: remember the movie id for the rows below it.
                        movie_id = line.replace(':', '')
                    else:
                        # Rating line: prefix it with the current movie id.
                        data.write('{},{}\n'.format(movie_id, line))
            print('Done.\n')
    os.replace(tmp_path, 'data.csv')
print('time taken:', datetime.now() - start)

In [None]:
# Load the merged ratings file; it has no header row, so the column names
# are supplied explicitly.
print('creating the dataframe from data.csv file..')
rating_columns = ['movie','user','rating','date']
df = pd.read_csv('data.csv', sep=',', names=rating_columns)

# Convert the date strings to real timestamps so they sort chronologically.
df['date'] = pd.to_datetime(df['date'])
print('Done.\n')

# Arrange the ratings in time order.
print('sorting the dataframe by date..')
df = df.sort_values(by='date')
print('sorting done.')

In [None]:
# Preview the first rows of the ratings frame.
df.head()

In [None]:
# Summary statistics for the columns of the ratings frame.
df.describe()

In [None]:
# Same summary, restricted to the 'rating' column.
df.describe()['rating']

**Checking NaN values**

In [None]:
# Count every missing cell in the frame: the first .sum() gives a per-column
# count and the second totals them. Using .any() here would only count how
# many *columns* contain a NaN, which is not what the label says.
print('number of NaN values in our dataset:', df.isnull().sum().sum())

Check and Remove Duplicate

In [None]:
# Flag rows that repeat an earlier (movie, user, rating) combination.
dup = df.duplicated(['movie','user','rating'])
# Vectorised count of the True flags; the builtin sum() would iterate the
# whole Series element by element in Python.
dups = dup.sum()
print('there are {} duplicate rating entries in the data.....'.format(dups))

Basic Stats

In [None]:
# Headline counts for the complete dataset.
total_ratings = df.shape[0]
total_users = len(np.unique(df.user))
total_movies = len(np.unique(df.movie))

print('Total Data')
print("-"*60)
print('\nTotal number of rating:', total_ratings)
print('Total number of users:', total_users)
print('total number of movie:', total_movies)

**Split the Dataset**

In [None]:
# Positional 80/20 split of df: the first 80% of rows become the training
# set and the remaining 20% the test set. The earlier sort-by-date cell is
# what makes this a chronological split.
split_at = int(df.shape[0]*0.80)

if not os.path.isfile('train.csv'):
    df.iloc[:split_at].to_csv("train.csv", index=False)
if not os.path.isfile('test.csv'):
    # The trailing colon matters: iloc[split_at] without it selects a single
    # row, not the tail of the frame.
    df.iloc[split_at:].to_csv("test.csv", index=False)

# NOTE: a 'test.csv' left over from a run that wrote only one row must be
# deleted first, otherwise the cached (wrong) file is reused here.
train_df = pd.read_csv('train.csv', parse_dates=['date'])
test_df = pd.read_csv('test.csv', parse_dates=['date'])

In [None]:
# (rows, columns) of the test frame as read back from test.csv.
test_df.shape

Basic Statistics on Train and Test Data

In [None]:
# Headline counts for the training split.
train_rating_count = train_df.shape[0]
train_user_count = len(np.unique(train_df.user))
train_movie_count = len(np.unique(train_df.movie))

print('Total number of rating:', train_rating_count)
print('Total number of users:', train_user_count)
print('Total number of movies:', train_movie_count)

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Number of rows in the test split.
test_rating_count = len(test_df)
print('Total number of rating:', test_rating_count)
#print('Total number of users:', len(np.unique(test_df.user)))
#print('Total number of movies:', len(np.unique(test_df.movie)))

**EDA on Train Data**

In [None]:
def human(num, units='M'):
    """Render ``num`` scaled to thousands, millions or billions.

    Parameters
    ----------
    num : value accepted by ``float()``
        The quantity to format.
    units : str, default 'M'
        Case-insensitive scale selector: 'K' (10**3), 'M' (10**6) or
        'B' (10**9).

    Returns
    -------
    str or None
        The scaled number followed by the upper-case unit letter, e.g.
        ``'2.5M'``; ``None`` when ``units`` is not one of the three
        recognised letters.
    """
    units = units.lower()
    num = float(num)

    # unit letter -> (divisor, suffix appended to the scaled value)
    scales = {
        'k': (10**3, "K"),
        'm': (10**6, "M"),
        'b': (10**9, "B"),
    }
    if units not in scales:
        return None

    divisor, suffix = scales[units]
    return str(num/divisor) + suffix


**Distribution**

In [None]:
# Bar chart of how often each rating value occurs in the training data.
fig, ax = plt.subplots()
# Pass the Series as the keyword `x`: newer seaborn releases bind the first
# positional argument to `data`, so a positional Series is not treated as the
# variable to count. `ax=ax` draws explicitly on the axes created above.
sns.countplot(x=train_df.rating, ax=ax)
ax.set_title('Distribution of rating over training dataset', fontsize=10)
# Relabel the y ticks in millions so the large counts stay readable.
ax.set_yticklabels([human(item,'M') for item in ax.get_yticks()])
ax.set_ylabel('No. of Ratings (Million)')

plt.show()

From the above distribution we see that most people gave a rating of 4, and few people gave a rating of 1.

Now add a 'week day' column for Data Analysis 

In [None]:
#It is used to skip the warnings
#pd.options.mode.chained_assignment = None
#train_df['day_of_week'] = train_df.date.dt.weekday_name

#train_df.head()

No. of Rating per Month

In [None]:
# Count the ratings in each calendar-month bin, then draw them as a line chart.
ratings_per_month = train_df.resample('m', on='date')['rating'].count()

ax = ratings_per_month.plot()
ax.set_title('No. of ratings per month (Training Data)')
ax.set_xlabel('Month')
ax.set_ylabel('No. of Rating per Month')
# Show the y axis in millions.
million_labels = [human(item,'M') for item in ax.get_yticks()]
ax.set_yticklabels(million_labels)

plt.show()

We can see that there is a massive growth of Netflix during the period 2003–2006. There are about 4.5 million ratings given by the users in 2005.

**Plot PDF (Probability Density Function) and CDF (Cumulative Distribution Function)**

In [None]:
# Number of ratings each user has given, most active users first.
ratings_grouped_by_user = train_df.groupby(by='user')['rating']
no_of_rated_movie_per_user = ratings_grouped_by_user.count().sort_values(ascending=False)

In [None]:
# Side-by-side KDE plots of the ratings-per-user counts:
# left panel = density (PDF), right panel = cumulative curve (CDF).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=plt.figaspect(.5))

sns.kdeplot(no_of_rated_movie_per_user, ax=ax1, shade=True)
ax1.set_xlabel('No of ratings by user')
ax1.set_ylabel('PDF')

sns.kdeplot(no_of_rated_movie_per_user, ax=ax2, shade=True, cumulative=True)
ax2.set_xlabel('No of ratings by user')
ax2.set_ylabel('CDF')

plt.show()

In [None]:
# Summary statistics of the ratings-per-user counts.
no_of_rated_movie_per_user.describe()

In [None]:
# Number of ratings each movie received, least-rated movies first.
no_of_ratings_per_movie = train_df.groupby(by='movie')['rating'].count().sort_values(ascending=True)

# Plot the sorted counts against their rank; the x positions are ranks,
# not movie ids, so the tick labels are blanked out.
fig, ax = plt.subplots(figsize=plt.figaspect(.5))
ax.plot(no_of_ratings_per_movie.values)
ax.set_title('Rating Per Movie')
ax.set_xlabel('Movie')
ax.set_ylabel('No. of Users who rated a Movie')
ax.set_xticklabels([])

plt.show()

Creating sparse matrix from data frame

In [None]:
# Build (or load from cache) the user x movie rating matrix in CSR form.
# Row index = user id, column index = movie id, stored value = rating.
start = datetime.now()
if os.path.isfile('train_sparse_matrix.npz'):
    train_sparse_matrix = sparse.load_npz('train_sparse_matrix.npz')
else:
    # NOTE: when the same (user, movie) pair occurs more than once, the
    # csr_matrix constructor sums the values for that cell.
    train_sparse_matrix = sparse.csr_matrix((train_df.rating.values, (train_df.user.values, train_df.movie.values)),)
    print('It is shape is:(user, movie):', train_sparse_matrix.shape)
    # Persist the matrix so the isfile() branch above is actually taken on
    # the next run instead of rebuilding it every time.
    sparse.save_npz('train_sparse_matrix.npz', train_sparse_matrix)

print(datetime.now() - start)

Sparsity of Train Sparse Matrix

In [None]:
# Matrix dimensions (rows = user index range, columns = movie index range)
# and the number of stored non-zero ratings.
matrix_shape = train_sparse_matrix.shape
us = matrix_shape[0]
mv = matrix_shape[1]
elem = train_sparse_matrix.count_nonzero()

print(elem)

In [None]:
# Sparsity = percentage of (user, movie) cells that hold no rating.
# The divisor must be the total cell count us*mv (parenthesised), and the
# *100 must scale the number, not the already-formatted string.
sparsity = (1 - elem / (us * mv)) * 100
print('sparsity of train matrix:{}%'.format(sparsity))

Find Average of all movie ratings, average rating per user, average rating per movie

In [None]:
def get_average_ratings(sparse_matrix, of_users):
    """Average the non-zero entries of a rating matrix along one axis.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Rating matrix; zero entries are treated as "not rated". Callers in
        this notebook pass a user x movie matrix.
    of_users : bool
        If truthy, average across each row (axis=1); otherwise average
        down each column (axis=0).

    Returns
    -------
    dict
        Maps row index (``of_users`` truthy) or column index to the sum of
        its entries divided by the number of non-zero entries. Indices
        with no non-zero entry are left out.
    """
    axis = 1 if of_users else 0

    # .A1 flattens the 1 x n / n x 1 matrix returned by sum() into a 1-D array.
    totals = sparse_matrix.sum(axis=axis).A1

    # Number of non-zero (i.e. rated) cells per row / column.
    counts = (sparse_matrix != 0).sum(axis=axis).A1

    n_rows, n_cols = sparse_matrix.shape
    size = n_rows if of_users else n_cols

    average_ratings = {}
    for idx in range(size):
        # Skip empty rows/columns to avoid dividing by zero.
        if counts[idx] != 0:
            average_ratings[idx] = totals[idx] / counts[idx]

    return average_ratings

Global average of all movie ratings

In [None]:
# Global average = sum of all stored ratings / number of non-zero ratings.
rating_total = train_sparse_matrix.sum()
rating_count = train_sparse_matrix.count_nonzero()
train_global_average = rating_total/rating_count

# Container for the averages computed in this section, keyed by kind.
train_averages = {'global': train_global_average}
train_averages

> Avg Rating per Movie

In [None]:
# Per-movie averages: of_users=False averages down each column of the matrix.
movie_average_by_index = get_average_ratings(train_sparse_matrix, of_users=False)
train_averages['movie'] = movie_average_by_index
print('\n Average rating of movie 15:', movie_average_by_index[15])

**PDF & CDF of Avg Ratings of Users and Movies**

**In Training Data**

In [None]:
# PDF and CDF of the average rating per user (left) and per movie (right).
start = datetime.now()

# The per-user averages are read below; compute them here if no earlier
# cell has stored them, otherwise the lookup raises KeyError.
if 'user' not in train_averages:
    train_averages['user'] = get_average_ratings(train_sparse_matrix, of_users=True)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=plt.figaspect(.5))
fig.suptitle('Avg Ratings per users and per Movie', fontsize=15)

# Flatten the {index: average} dictionaries into plain lists of averages.
user_avg = list(train_averages['user'].values())
movie_average = list(train_averages['movie'].values())

for ax, title, averages in ((ax1, 'users-avg-ratings', user_avg),
                            (ax2, 'Movies-Avg-Rating', movie_average)):
    ax.set_title(title)
    # hist=False draws only the KDE curve (the keyword is `hist`; `hint` is
    # not a distplot argument and raises TypeError).
    sns.distplot(averages, ax=ax, hist=False, kde_kws=dict(cumulative=True), label='Cdf')
    sns.distplot(averages, ax=ax, hist=False, label='Pdf')
    # Show the 'Cdf' / 'Pdf' labels so the two curves can be told apart.
    ax.legend()

plt.show()
print(datetime.now() - start)

Computing Movie-Movie similarity Matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Movie-movie cosine similarity. The rating matrix is transposed so that
# each row passed to cosine_similarity is one movie's vector of ratings.
start = datetime.now()

# Use one filename for the existence check, the load and the optional save;
# mismatched names mean the cache is either never found or fails to load.
m_m_sim_path = 'm_m_sim_sparse.npz'

if not os.path.isfile(m_m_sim_path):
    print('It seems dont have a file. computing movie_movie smimilarity...')
    m_m_sim_sparse = cosine_similarity(X=train_sparse_matrix.T, dense_output = False)

    # Saving is left disabled, as in the original cell; uncomment to cache
    # the result on disk and skip the computation on later runs.
    #print('saving it to disk without the need of re-computing it again')
    #sparse.save_npz(m_m_sim_path, m_m_sim_sparse)
else:
    print('it is there.')
    m_m_sim_sparse = sparse.load_npz(m_m_sim_path)

print("it is a ", m_m_sim_sparse.shape, "dimensional matrix")

print(datetime.now() - start)

We take only those top similar movie ratings and store them in a separate dictionary.

In [None]:
# Column indices that have at least one non-zero similarity entry.
movie_ids = np.unique(m_m_sim_sparse.nonzero()[1])

start  = datetime.now()
# movie index -> indices of its 100 most similar movies, best match first.
similar_movies = dict()
for movie in movie_ids:
    # Densify this movie's similarity row and order the indices by descending
    # similarity. [1:] drops the first entry, presumably the movie itself
    # (self-similarity) -- TODO confirm this holds when other movies tie with it.
    sim_movies = m_m_sim_sparse[movie].toarray().ravel().argsort()[::-1][1:]
    similar_movies[movie] = sim_movies[:100]
print(datetime.now() - start)

#testing similar movies for movie_15
similar_movies[15]


Finding Most Similar Movie

In [None]:
# Load the movie metadata file (no header row), indexed by movie id.
title_columns = ['movie_id', 'year_of_release', 'title']
movie_titles = pd.read_csv(
    "../input/netflix-prize-data/movie_titles.csv",
    sep=',',
    header=None,
    names=title_columns,
    verbose=True,
    index_col='movie_id',
    encoding='ISO-8859-1',
)

movie_titles.head()

