In [108]:
'''
DESCRIPTION:=====================================================================================================================

The dataset provided contains movie reviews given by Amazon customers. Reviews were given between May 1996 and July 2014.

Data Dictionary
UserID – 4848 customers who provided a rating for each movie
Movie 1 to Movie 206 – 206 movies for which ratings are provided by 4848 distinct users

Data Considerations
- All the users have not watched all the movies and therefore, all movies are not rated. These missing values are represented by NA.
- Ratings are on a scale of -1 to 10 where -1 is the least rating and 10 is the best.

Analysis Task
- Exploratory Data Analysis:

Which movies have maximum views/ratings?
What is the average rating for each movie? Define the top 5 movies with the maximum ratings.
Define the top 5 movies with the least audience.
- Recommendation Model: Some of the movies hadn’t been watched and therefore, are not rated by the users. 
Netflix would like to take this as an opportunity and build a machine learning recommendation algorithm which provides 
the ratings for each of the users.

Divide the data into training and test data
Build a recommendation model on training data
Make predictions on the test data
'''



In [109]:
#importing Amazon movie ratings dataset
import pandas as pd
amazon_df=pd.read_csv("E:/Education/PGP Simplilearn-Purdue/PGP in Data Science/Machine Learning/Machine-Learning--Projects-master/Projects/Projects for Submission/Amazon - Movies and TV Ratings.csv")
amazon_df

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,A1IMQ9WMFYKWH5,,,,,,,,,,...,,,,,,,,,,5.0
4844,A1KLIKPUF5E88I,,,,,,,,,,...,,,,,,,,,,5.0
4845,A5HG6WFZLO10D,,,,,,,,,,...,,,,,,,,,,5.0
4846,A3UU690TWXCG1X,,,,,,,,,,...,,,,,,,,,,5.0


In [110]:
#Exploring dataset
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4848 entries, 0 to 4847
Columns: 207 entries, user_id to Movie206
dtypes: float64(206), object(1)
memory usage: 7.7+ MB


In [111]:
#Checking for duplicates
amazon_df[amazon_df.duplicated()]
#Observation:
#No duplicate records are present in the dataset

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206


In [112]:
#Analysis Tasks:
#Which movies have maximum views/ratings?
views=amazon_df.notna().sum()
views_df1=pd.DataFrame(views.sort_values(ascending=False),columns=['total_views'])
views_df1.head(6)
#Observation:
#Movie127 has maximum views/ratings of 2313

Unnamed: 0,total_views
user_id,4848
Movie127,2313
Movie140,578
Movie16,320
Movie103,272
Movie29,243


In [113]:
#What is the average rating for each movie? Define the top 5 movies with the maximum ratings.
import numpy as np
ratings=np.mean(amazon_df)
ratings_df=pd.DataFrame(ratings.sort_values(ascending=False),columns=['avg_rating'])
print(ratings_df)
#Observation:
#Movie1, Movie55, Movie131, Movie132, Movie133 are the top 5 movies with maximum average ratings
#Movie127, Movie140, Movie16, Movie103, Movie29 are the top 5 movies with maximum ratings

          avg_rating
Movie1           5.0
Movie55          5.0
Movie131         5.0
Movie132         5.0
Movie133         5.0
...              ...
Movie60          1.0
Movie58          1.0
Movie45          1.0
Movie67          1.0
Movie144         1.0

[206 rows x 1 columns]


In [114]:
#Define the top 5 movies with the least audience.
views_df2=pd.DataFrame(views.sort_values(ascending=True),columns=['total_views'])
views_df2.head(5)
#Observation:
#Movie100, Movie73, Movie74, Movie75, Movie77 are the top 5 movies with least audience

Unnamed: 0,total_views
Movie100,1
Movie77,1
Movie75,1
Movie74,1
Movie73,1


In [115]:
#Replacing NA values with 0
amazon_df.fillna(0,inplace=True)
amazon_df

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AH3QC2PC1VTGP,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A3LKP6WPMP9UKX,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AVIY68KEPQ5ZD,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A1CV1WROP5KTTW,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,A1IMQ9WMFYKWH5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4844,A1KLIKPUF5E88I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4845,A5HG6WFZLO10D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4846,A3UU690TWXCG1X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [116]:
#Creating a new data frame with movie names in rows
amazon_df1=amazon_df.transpose()[1:]
amazon_df1.reset_index(inplace=True)
amazon_df1.rename(columns={'index':'movie_title'},inplace=True)
amazon_df1

Unnamed: 0,movie_title,0,1,2,3,4,5,6,7,8,...,4838,4839,4840,4841,4842,4843,4844,4845,4846,4847
0,Movie1,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Movie2,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Movie3,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Movie4,0,0,5,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Movie5,0,0,0,0,5,2,5,2,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,Movie202,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202,Movie203,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,Movie204,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
204,Movie205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
#importing required libraries
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
import numpy as np

#Dropping user_id column
amazon_ratings_df=amazon_df.drop(['user_id'],axis=1)

#Changing column names to numbers
amazon_ratings_df.columns=np.arange(1,207)

#Transposing the dataframe to get movies in row indices
amazon_ratings_df=amazon_ratings_df.transpose()

#Replacing NA with 0's
amazon_ratings_df.fillna(0,inplace=True)

#Calculating cosine similarity matrix
movie_similarity=1-pairwise_distances(amazon_ratings_df,metric='l2')
movie_similarity

array([[  1.        ,   1.        ,  -4.38516481, ..., -12.85640646,
        -27.33725463, -17.46618531],
       [  1.        ,   1.        ,  -4.38516481, ..., -12.85640646,
        -27.33725463, -17.46618531],
       [ -4.38516481,  -4.38516481,   1.        , ..., -12.07669683,
        -26.96426291, -16.88854382],
       ...,
       [-12.85640646, -12.85640646, -12.07669683, ...,   1.        ,
        -29.7408523 , -20.97726098],
       [-27.33725463, -27.33725463, -26.96426291, ..., -29.7408523 ,
          1.        , -32.07567082],
       [-17.46618531, -17.46618531, -16.88854382, ..., -20.97726098,
        -32.07567082,   1.        ]])

In [118]:
#Filling the diagonal with 0's
np.fill_diagonal(movie_similarity,0)
ratings_matrix=pd.DataFrame(movie_similarity)
ratings_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,196,197,198,199,200,201,202,203,204,205
0,0.000000,1.000000,-4.385165,-7.660254,-23.000000,-5.403124,-6.071068,-6.071068,-6.071068,-6.071068,...,-9.392305,-7.660254,-6.071068,-13.071247,-8.165151,-11.288206,-4.830952,-12.856406,-27.337255,-17.466185
1,1.000000,0.000000,-4.385165,-7.660254,-23.000000,-5.403124,-6.071068,-6.071068,-6.071068,-6.071068,...,-9.392305,-7.660254,-6.071068,-13.071247,-8.165151,-11.288206,-4.830952,-12.856406,-27.337255,-17.466185
2,-4.385165,-4.385165,0.000000,-6.348469,-22.558438,-3.472136,-4.385165,-4.385165,-4.385165,-4.385165,...,-8.327379,-6.348469,-4.385165,-12.304135,-6.937254,-10.401754,-2.605551,-12.076697,-26.964263,-16.888544
3,-7.660254,-7.660254,-6.348469,0.000000,-23.515301,-7.124038,-7.660254,-7.660254,-7.660254,-7.660254,...,-10.532563,-9.000000,-7.660254,-13.933185,-9.440307,-12.266499,-6.681146,-13.730920,-27.774989,-18.131126
4,-23.000000,-23.000000,-22.558438,-23.515301,0.000000,-22.811762,-23.000000,-23.000000,-23.000000,-23.000000,...,-24.179357,-23.515301,-23.000000,-25.907248,-23.698178,-25.019224,-22.664319,-25.795522,-35.455452,-28.444864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,-11.288206,-11.288206,-10.401754,-12.266499,-25.019224,-10.916375,-11.288206,-11.288206,-11.288206,-11.288206,...,-13.456832,-12.266499,-11.288206,-16.291616,-12.601471,0.000000,-10.618950,-16.117243,-29.066593,-20.023796
202,-4.830952,-4.830952,-2.605551,-6.681146,-22.664319,-4.000000,-4.830952,-4.830952,-4.830952,-4.830952,...,-8.591663,-6.681146,-4.830952,-12.490738,-7.246211,-10.618950,0.000000,-12.266499,-27.053520,-17.027756
203,-12.856406,-12.856406,-12.076697,-13.730920,-25.795522,-12.527749,-12.856406,-12.856406,-12.856406,-12.856406,...,-14.811388,-13.730920,-12.856406,-17.439089,-14.033296,-16.117243,-12.266499,0.000000,-29.740852,-20.977261
204,-27.337255,-27.337255,-26.964263,-27.774989,-35.455452,-27.178006,-27.337255,-27.337255,-27.337255,-27.337255,...,-28.342802,-27.774989,-27.337255,-29.838288,-27.930952,-29.066593,-27.053520,-29.740852,0.000000,-32.075671


In [119]:
#Takin user input and giving recommendations--Recommender System for Amazon
try:
    user_input=input("Enter the movie name for recommendation:")
    inp=amazon_df1[amazon_df1['movie_title']==user_input].index.tolist()
    inp=inp[0]
    
    amazon_df1['similarity']=ratings_matrix.iloc[inp]
    print("Recommended movies based on your choice of ",user_input,":\n",amazon_df1.sort_values(['similarity'],ascending=False)[['movie_title','similarity']][1:10])

except:
    print(amazon_df1.head(10))

Enter the movie name for recommendation: Movie100


Recommended movies based on your choice of  Movie100 :
     movie_title  similarity
68      Movie69   -3.123106
143    Movie144   -3.123106
57      Movie58   -3.123106
66      Movie67   -3.123106
44      Movie45   -3.123106
153    Movie154   -3.123106
59      Movie60   -3.123106
58      Movie59   -3.472136
2        Movie3   -3.472136
