In [2]:
# Import modules
import numpy as np
import pandas as pd
import quandl
from itertools import combinations
from datetime import datetime
from datetime import timedelta


from sklearn.neighbors import KNeighborsClassifier

# Plotting preferences
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import string
import re

In [3]:
# Load the MovieLens 20M CSV files.
# The default C parser is used for every file: the pure-Python parser
# (engine='python') is far slower on files of this size and is only needed
# for features the C parser lacks (e.g. regex separators), none of which are used here.

# Free-text tags applied by users to movies
tags = pd.read_csv('ml-20m/tags.csv')

# User ratings
ratings = pd.read_csv('ml-20m/ratings.csv')

# Movie metadata
movies = pd.read_csv('ml-20m/movies.csv')

# Links data (contains the imdbId / tmdbId columns)
links = pd.read_csv('ml-20m/links.csv',low_memory=False)

# Tag-genome relevance scores (movieId, tagId, relevance)
tagscore = pd.read_csv('ml-20m/genome-scores.csv',low_memory=False)

# Tag-genome tag table, merged with the scores on tagId later on
genome_tag = pd.read_csv('ml-20m/genome-tags.csv',low_memory=False)


In [367]:
# Summarise column dtypes and memory usage of the three largest frames.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2, matching the print(...) calls used later in this notebook.
print("Tags Info\n")
tags.info()

print("\nRatings Info\n")
ratings.info()

print("\n Tag Scores Info\n")
tagscore.info()


Tags Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
userId       465564 non-null int64
movieId      465564 non-null int64
tag          465548 non-null object
timestamp    465564 non-null int64
dtypes: int64(3), object(1)
memory usage: 14.2+ MB

Ratings Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 610.4 MB

 Tag Scores Info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 3 columns):
movieId      int64
tagId        int64
relevance    float64
dtypes: float64(1), int64(2)
memory usage: 268.0 MB


In [368]:
# Count the null values in each column of the ratings dataframe
ratings.isnull().sum()

KeyboardInterrupt: 

In [369]:
links.isnull().sum()  # null count per column of the links dataframe

movieId      0
imdbId       0
tmdbId     252
dtype: int64

In [None]:
movies.isnull().sum()  # null count per column of the movies dataframe

movieId    0
title      0
genres     0
dtype: int64

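No null values were found in the movies dataframe. The links dataframe has 252 missing tmdbId values, and the null check on ratings was interrupted before it produced a result, so ratings still needs to be verified.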

In [None]:
ratings.shape  # (rows, columns) of the ratings dataframe

In [None]:
movies.shape  # (rows, columns) of the movies dataframe

In [None]:
tags.shape  # (rows, columns) of the tags dataframe, before null rows are dropped

In [None]:
tags.isnull().sum()  # null count per column of the tags dataframe

In [None]:
tags[tags.tag.isnull()].head()  # preview the rows whose tag value is missing

In [None]:
# Remove every row that has a missing value in any column, then
# display the new shape to confirm how many rows were dropped.
tags = tags.dropna(axis=0, how='any')
tags.shape

Since the tags dataset contains 16 rows with a null tag, we simply drop those rows.

In [None]:
# Reduce the movies dataframe to the movie id and title columns
movie_columns = ["movieId", "title"]
movies = movies[movie_columns]
movies.head(10)

In [None]:
# The timestamp column of ratings is not needed for this analysis,
# so only userId, movieId and rating are kept.
rating_columns = ["userId", "movieId", "rating"]
ratings = ratings[rating_columns]
ratings.head(10)

In [None]:
# The timestamp column of tags is not needed for this analysis,
# so only userId, movieId and tag are kept.
tag_columns = ["userId", "movieId", "tag"]
tags = tags[tag_columns]
tags.head(10)

In [None]:
# Inner-join ratings with tags on the columns the two frames have in common
data = ratings.merge(tags, how='inner')

In [None]:
# Attach the movie title by inner-joining "data" with movies on their common columns
df = data.merge(movies, how='inner')

In [None]:
df.head()  # preview the merged ratings + tags + titles frame

In [None]:
# Pair every user tag with the title of the movie it was applied to
# (inner join on movieId, which is pandas' default join type).
movies_tags = tags.merge(movies[["movieId", "title"]], on="movieId")

# Title/tag pairs only
title_tags = movies_tags[["title", "tag"]]


In [None]:
movies_tags.head()  # preview the tags joined with movie titles

In [None]:
# Normalise tag text to lower case so differently-cased spellings are counted together
df['tag'] = df.tag.str.lower()

In [None]:
# Exploring the rating column of the merged frame: this cell shows the
# lowest rating given; the next cell shows the highest.
df['rating'].min()


In [None]:
# Highest rating given
df['rating'].max()

In [None]:
# Pairwise correlation of the numeric columns only. df also holds text
# columns (tag, title); recent pandas versions raise on them instead of
# silently skipping them, so they are excluded explicitly.
df.select_dtypes(include=[np.number]).corr()

In [None]:
# Visualizing the distribution of ratings as a histogram

df.hist(column='rating', figsize=(10,5))

In [None]:
# Box plot of the rating column
df.boxplot(column='rating', figsize=(10,5), return_type='axes')

The box plot and histogram show that most of the user ratings fall within the range of 3.5-5.0.

In [None]:
# Count how many rows of df carry each tag (sorted from most to least frequent)
tag_counts = df['tag'].value_counts()

In [None]:
tag_counts.head(10)  # the ten most frequent tags

In [None]:
# One colour per bar for the ten most frequent tags
my_colors = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
]
top_ten_tags = tag_counts[:10]
top_ten_tags.plot(kind='bar', figsize=(10, 5), color=my_colors)

The above bar graph depicts the top 10 tags occurring across all movies: sci-fi, atmospheric, action, comedy, surreal, based on a book, twist ending, funny, dystopia and quirky.

In [None]:
# Mean rating per movie within the merged (ratings + tags) frame.
# Only the rating column is aggregated: df also contains text columns
# (tag, title), and averaging the whole frame fails on recent pandas.
# Selecting the column up front also makes dropping userId unnecessary.
# NOTE(review): df has one row per (user, movie, tag), so a rating is counted
# once for every tag that user attached to the movie — confirm this is intended.
avg_ratings = df.groupby('movieId', as_index=False)['rating'].mean()
avg_ratings.head()

In [None]:
# Number of non-null rating entries in df for every movie (indexed by movieId)
movie_count = df.groupby('movieId')[['rating']].count()
movie_count.head()

In [None]:
# Number of userId entries in df per movie, exposed as 'numberOfUserRatings'
userRatings = (
    df.groupby('movieId', as_index=False)['userId']
    .count()
    .rename(columns={'userId': 'numberOfUserRatings'})
)
userRatings.head()

In [None]:
# How many distinct movies appear in the merged frame
len(df['movieId'].unique())

In [None]:
# Join the genome relevance scores with the genome tag table on tagId and keep
# the movie id, the tag and its relevance score
genome_columns = ['movieId', 'tag', 'relevance']
tags_in_movies = tagscore.merge(genome_tag, on='tagId')[genome_columns]

In [None]:
tags_in_movies.head()  # preview movie / tag / relevance rows

In [None]:
# Since the dataset consists of 20M records, we need to filter out rows that are not useful for our exploration.
# Let us look at the tag relevance scores to determine a good cut-off.

tagscore.head()

In [None]:
# Reshape the first 100000 score rows into a movie-by-tag matrix:
# rows are movie ids, columns are tag ids, cells are relevance scores.
score_sample = tagscore[:100000]
score_sample.pivot(index='movieId', columns='tagId', values='relevance').head()

In [None]:
# Movie-by-tag table of mean relevance scores, built from the first 100000 score rows
table = tagscore[:100000].pivot_table(
    values='relevance',
    index='movieId',
    columns='tagId',
    aggfunc='mean',
)

In [None]:
table.head()  # preview the movie-by-tag relevance table

In [None]:
# Histogram of the relevance scores in the first row of the table (a single movie)
table[:1].T.hist()

It can be seen from the above histogram that most of the relevance scores lie in the 0 to 0.3 range. As these values don't provide useful information, we can use 0.3 as a suitable cutoff.

In [None]:
# Keep only the (movie, tag) pairs whose relevance exceeds the 0.3 cut-off
is_relevant = tagscore['relevance'] > 0.3
movie_tags = tagscore.loc[is_relevant, ['movieId', 'tagId', 'relevance']]

In [None]:
genome_tag.head()  # preview the genome tag table

In [None]:
# Attach the tag text to each relevant (movie, tag) pair via a left join on tagId
tag_name_columns = ['movieId', 'tagId', 'tag', 'relevance']
tags_to_movies = movie_tags.merge(genome_tag, on='tagId', how='left')[tag_name_columns]

In [None]:
# Store tag ids as strings rather than integers
tags_to_movies['tagId'] = tags_to_movies.tagId.astype(str)

In [None]:
tags_to_movies.head()  # preview the relevant tags with their names

In [None]:
# Concatenating tags into movies dataset
def concat_tags_of_movie(tags):
    """Join the distinct tags of one movie into a single comma-separated string.

    Duplicates are removed and the remaining tags are sorted, so the result is
    the same on every run (iterating a bare set of strings has no guaranteed order).

    Parameters
    ----------
    tags : iterable of str
        The tags of a single movie.

    Returns
    -------
    str
        The unique tags in sorted order, joined by ','; empty string for no tags.
    """
    return ','.join(sorted(set(tags)))



In [None]:
# One row per movie with all of its relevant tags joined into a single
# 'movie_tags' string.
# The aggregation is applied directly and the result renamed afterwards:
# passing a {new_name: func} dict to a column-level .agg() raises
# SpecificationError on pandas >= 1.0.
tags_per_movie = (
    tags_to_movies.groupby('movieId')['tag']
    .agg(concat_tags_of_movie)
    .rename('movie_tags')
    .reset_index()
)

In [None]:
# Per-movie rating statistics over the full ratings frame: mean, median and
# number of ratings.
# The statistics are computed under their built-in names and renamed afterwards:
# passing a {new_name: func} dict to a column-level .agg() raises
# SpecificationError on pandas >= 1.0.
avg_ratings = (
    ratings.groupby('movieId')['rating']
    .agg(['mean', 'median', 'size'])
    .rename(columns={
        'mean': 'rating_mean',
        'median': 'rating_median',
        'size': 'num_ratings',
    })
    .reset_index()
)

In [None]:
# Every movie together with its rating statistics; the left join keeps movies
# that have no ratings (their statistics are then missing)
movies_with_ratings = movies.merge(avg_ratings, on='movieId', how='left')

In [None]:
# Add the concatenated tag string of each movie; the left join keeps movies without tags
dataset = movies_with_ratings.merge(tags_per_movie, on='movieId', how='left')

In [None]:
# The columns already carry their final names from the aggregation steps
# (rating_mean, rating_median, num_ratings, movie_tags). The rename that used to
# be here targeted 'median', 'mean' and 'tag', none of which exist in dataset,
# so it changed nothing. Verify the expected columns instead.
expected_columns = {'movieId', 'title', 'rating_mean', 'rating_median', 'num_ratings', 'movie_tags'}
missing_columns = expected_columns - set(dataset.columns)
assert not missing_columns, "dataset is missing expected columns: %s" % sorted(missing_columns)

In [None]:
dataset.head()  # preview movies with rating statistics and tag strings

In [None]:
dataset.isnull().sum()  # null count per column after the left joins

In [None]:
# Keep only the movies that have both rating statistics and tags.
# Both conditions are combined into one mask: applying the second filter to
# `dataset` again (rather than to the already-filtered frame) would silently
# discard the first filter.
has_ratings = dataset['num_ratings'].notnull()
has_tags = dataset['movie_tags'].notnull()
filtered_df = dataset[has_ratings & has_tags]

filtered_df.info()

In [None]:
# Build final_df from filtered_df with movieId (as a string) as the index.
# final_df is created here rather than assumed to exist, so the cell works on
# a fresh kernel; assign() returns a new frame and leaves filtered_df unchanged,
# which also makes the cell safe to re-run.
final_df = (
    filtered_df
    .assign(movieId=filtered_df['movieId'].astype(str))
    .set_index('movieId')
)



In [None]:
# Add a 'num_tags' column: the number of comma-separated entries in each movie's tag string
final_df['num_tags'] = final_df['movie_tags'].str.split(',').map(len)

In [None]:
# Setting up the k-nearest-neighbours model (not K-means: no clustering is done,
# the fitted model is only queried for the nearest rows of a given movie)

# Recommend 20 similar items
engine = KNeighborsClassifier(n_neighbors=20)

# Training data points
# NOTE(review): the features are used unscaled, so columns with a large numeric
# range will likely dominate the distance — consider standardising them.
feature_columns = ['num_ratings', 'rating_mean', 'rating_median', 'num_tags']
data_points = final_df[feature_columns].values

# Training labels: the movie ids, taken from the same frame as the features so
# that row i of data_points always corresponds to labels[i]
labels = final_df.index.values

print("Data points: ")
print(data_points)
print("Labels: ")
print(labels)

engine.fit(data_points, labels)

In [None]:
# Enter movie ID to get a list of 20 recommended items

# User entered value (final_df is indexed by movieId stored as a string)
product_id = '131262'

# Feature vector of the chosen movie, wrapped in a list to form a one-row query.
# The row and the columns are selected in a single .loc call with a *list* of
# column names; indexing the row with a bare tuple of names raises KeyError.
feature_columns = ['num_ratings', 'rating_mean', 'rating_median', 'num_tags']
product_data = [final_df.loc[product_id, feature_columns].values]

# Row positions (within the training data) of the 20 nearest neighbours
recommended_products = engine.kneighbors(X=product_data, n_neighbors=20, return_distance=False)

# List of movie IDs: the positions are mapped through final_df's index, which
# holds the movie ids (filtered_df's index holds row numbers, not movie ids)
products_list = [final_df.index[neighbor_rows] for neighbor_rows in recommended_products]

print("Recommended movies based on entered movieID: ")
print(products_list)

In [None]:
# Showing recommended movies in a scatter plot:
# all movies in grey, the recommended ones overlaid in orange.

# Rows of the recommended movies. They are taken from final_df, the frame the
# model was fitted on (the name `box_office` is not defined in this notebook).
recommended_rows = final_df.iloc[recommended_products[0]]

ax = final_df.plot(kind='scatter', x='rating_mean', y='num_ratings', color='grey', alpha=0.20)
recommended_rows.plot(kind='scatter', x='rating_mean', y='num_ratings',
                      color='orange', alpha=0.5, ax=ax)

ax2 = final_df.plot(kind='scatter', x='rating_median', y='num_ratings', color='grey')
recommended_rows.plot(kind='scatter', x='rating_median', y='num_ratings',
                      color='orange', alpha=0.5, ax=ax2)

# The overlay goes on ax3, the axes of this figure (not on ax2 again)
ax3 = final_df.plot(kind='scatter', x='num_tags', y='num_ratings', color='grey')
recommended_rows.plot(kind='scatter', x='num_tags', y='num_ratings',
                      color='orange', alpha=0.5, ax=ax3)


plt.show()