#                                                       MoviFlex

In [90]:
#importing required libraries
import pandas as pd 
import numpy as np
import ast
from ast import literal_eval  # evaluate strings containing Python code in the current Python environment
from nltk.stem.snowball import SnowballStemmer # Removing stem words
from sklearn.feature_extraction.text import CountVectorizer  # To convert text to numerical data
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
import seaborn as sns
import networkx as nx
import difflib

import warnings  # disable python warnings
warnings.filterwarnings("ignore")

In [91]:
#loading datasets
movies_data = pd.read_csv("movies_metadata.csv", low_memory=False)
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
links_small = pd.read_csv('links_small.csv')
ratings = pd.read_csv("ratings_small.csv")

In [92]:
#checking for null values
print(movies_data.isnull().sum(),'\n')
print(links_small.isnull().sum(),'\n')
print(ratings.isnull().sum(),'\n')
print(keywords.isnull().sum(),'\n')
print(credits.isnull().sum(),'\n')

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64 

movieId     0
imdbId      0
tmdbId     13
dtype: int64 

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64 

id          0
keywords    0
dtype: int64 

cast    0
crew    0
id      0
dtype: int64 



In [93]:
# Removing the rows with null value in the vote_average and vote_count columns in movies_data dataframe

movies_data = movies_data.dropna(subset=['vote_average', 'vote_count'])

# Removing the rows with null value in links_small

links_small = links_small.dropna()

# Tkinter

Now we will create the interface for our recommender system using python Tkinter library. In this part we have used Genre based and Collaborative filltering based recommender system created in the above sections.

In [94]:
import tkinter as tk 
import tkinter.ttk
from tkinter import *
from tkinter import messagebox
from PIL import ImageTk,Image

### Simple Recommendation System

   Here, I am using weighted rating: (v/v+m)*R + (m/m+v)*C
    
    where,
           m = Minimum vote count required to be listed in chart.
           v = Total number of votes of the movie (given in the dataset with column name 'vote_count')
           R = Average rating of the movie (given in the dataset with column name 'vote_average' )
           C = Average vote across all dataset (total vote divided by total movies)


#### Simple Recommendation system based on rating

In [95]:
# Weighted rating
def weighted_rating(v,R):
    return ((v/(v+m)) * R) + ((m/(m+v)) * C)  

C = movies_data['vote_average'].mean()         # mean vote across all data
m = movies_data['vote_count'].quantile(0.95)   # movies with more than 95% votes is taken (95 percentile)

# Taking movies whose vote count is greater than m
top_movies = movies_data.copy().loc[movies_data['vote_count'] >= m]
top_movies = top_movies.reset_index()

top_movies['score'] = ''

for i in range(top_movies.shape[0]):
    v = top_movies['vote_count'][i]          # number of vote count of the movie
    R = top_movies['vote_average'][i]        # average rating of the movie
    top_movies['score'][i] = weighted_rating(v,R)

top_movies = top_movies.sort_values('score', ascending=False)  # sorting movies in descending order according to score
top_movies = top_movies.reset_index()
t1 =top_movies[['title', 'score']].head(20)
p = [None for i in range(100)]
# top_movies[['title', 'vote_count', 'vote_average', 'score']].head(20) # top 20 movies
def tpmovies():    #function to print values in GUI
    for i in range(100):
        if p[i] is not None:
            p[i].grid_remove()
    a=clicked2.get()
    j=0
    mvp=list((top_movies[['title']].head(100))['title'])
    n=Label(top1,text="Result",font=('Helvetica',15,'bold'),bg='black',fg="#7F7FFF").grid(row=3,column=2,sticky=W,pady=5)
    for k in range(10):
        if j<a:
            for i in range(10):
                p[j]=Label(top1, text=mvp[j],bg='black',fg='orchid2').grid(row = 4+i, column = 2+k, sticky = W,pady=5,padx=10)
                j=j+1
        else:
            break

#### Simlpe Recommendation system based on Genre


In [96]:
genres = set()

# Finding the exhaustive set of genres in the dataset 
top_movies['genres'] = top_movies['genres'].apply(ast.literal_eval)
for i in range(top_movies['genres'].shape[0]):   # converting string in map
    for x in top_movies['genres'][i]:
        genres.add(x['name'])
        
top_movies.head(10)

Unnamed: 0,level_0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,score
0,46,314,False,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,...,28341470.0,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0,8.357746
1,111,834,False,"{'id': 230, 'name': 'The Godfather Collection'...",6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.thegodfather.com/,238,tt0068646,en,...,245066400.0,175.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An offer you can't refuse.,The Godfather,False,8.5,6024.0,8.306334
2,1135,12481,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",http://thedarkknight.warnerbros.com/dvdsite/,155,tt0468569,en,...,1004558000.0,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Why So Serious?,The Dark Knight,False,8.3,12269.0,8.208376
3,439,2843,False,,63000000,"[{'id': 18, 'name': 'Drama'}]",http://www.foxmovies.com/movies/fight-club,550,tt0137523,en,...,100853800.0,139.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Mischief. Mayhem. Soap.,Fight Club,False,8.3,9678.0,8.184899
4,43,292,False,,8000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",,680,tt0110912,en,...,213928800.0,154.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Just because you are a character doesn't mean ...,Pulp Fiction,False,8.3,8670.0,8.172155
5,52,351,False,,55000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,13,tt0109830,en,...,677945400.0,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The world will never be the same, once you've ...",Forrest Gump,False,8.2,8147.0,8.069421
6,75,522,False,,22000000,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",http://www.schindlerslist.com/,424,tt0108052,en,...,321365600.0,195.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,"Whoever saves one life, saves the world entire.",Schindler's List,False,8.3,4436.0,8.061007
7,1863,23673,False,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,244786,tt2582802,en,...,13092000.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,False,8.3,4376.0,8.058025
8,671,5481,False,,15000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 12, 'na...",http://movies.disney.com/spirited-away,129,tt0245429,ja,...,274925100.0,125.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,The tunnel led Chihiro to a mysterious town...,Spirited Away,False,8.3,3968.0,8.035598
9,159,1154,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",18000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,1891,tt0080684,en,...,538400000.0,124.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Adventure Continues...,The Empire Strikes Back,False,8.2,5998.0,8.025793


In [97]:
# creating map of string (genre name) and movies names(dataframe)
genres_based = dict()   
for i in range(top_movies['genres'].shape[0]):  
    for x in top_movies['genres'][i]:
        if x['name'] not in genres_based.keys():
            genres_based[x['name']] = pd.DataFrame(columns = top_movies.columns)
        genres_based[x['name']] = genres_based[x['name']].append(top_movies.iloc[i])
genres_based


{'Drama':      level_0  index  adult                              belongs_to_collection  \
 0         46    314  False                                                NaN   
 1        111    834  False  {'id': 230, 'name': 'The Godfather Collection'...   
 2       1135  12481  False  {'id': 263, 'name': 'The Dark Knight Collectio...   
 3        439   2843  False                                                NaN   
 5         52    351  False                                                NaN   
 ...      ...    ...    ...                                                ...   
 2233    1770  21848  False                                                NaN   
 2237     985  11012  False                                                NaN   
 2246    2081  32891  False                                                NaN   
 2249    1682  20598  False                                                NaN   
 2255     892  10010  False  {'id': 52785, 'name': 'xXx Collection', 'poste...   
 
     

In [98]:
# Visualizing frequency of occurence of different genres

# Creating a count vector (list) containing frequency of a perticular genre
cnt = list()
for i in genres:
    cnt.append(genres_based[i].shape[0])
    
# Making a datafram 
genre_cnt = pd.DataFrame( { 'genres' : list(genres),
                            'count'  : cnt
    
},
                         columns = ['genres','count']
)

In [99]:
def genres_based_rcmnd(name):
    if name not in genres:
        return None
    else:
        return genres_based[name][['title', 'vote_count', 'vote_average', 'score']].head(5)
m = [None for i in range(10)] 
# function for printing name of movie based on genre 
def gen():
    for i in range(10):
        if m[i] is not None:
            m[i].grid_remove()
    event1 = clicked1.get()
    llm = list((genres_based[event1][['title']].head(10))['title'])
    for i in range(min(10,len(llm))):
        m[i] = Label(top ,  text = llm[i])
        m[i].grid(row = 2+i, column = 2, sticky = W, pady = 5)

# CONTENT BASED RECOMMENDER SYSTEM 

### Metadata Based Recommender [ recommender based on movies keywords, cast, director(from crew dataset) and genres ]

In [100]:
#Preprocessing the data

movies_data['id'] = movies_data['id'].astype('int')  #The astype() function is used to cast a pandas object to a specified data type.

# merging both credits and keywords in movies_data on the basis of movie id
movies_data = movies_data.merge(credits, on='id')
movies_data = movies_data.merge(keywords, on='id')

movies_data.drop_duplicates(subset='title',keep='first',inplace=True)# to remove the duplicate values based on title

links_small = links_small['tmdbId'].astype('int')

# taking only those movies whos id is present in link_small because of limited computing power
smd = movies_data[movies_data['id'].isin(links_small)]  
smd = smd.reset_index()
smd


Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8690,40948,False,,8000000,"[{'id': 18, 'name': 'Drama'}]",,159550,tt0255313,en,The Last Brickmaker in America,...,[],Released,,The Last Brickmaker in America,False,7.0,1.0,"[{'cast_id': 1, 'character': 'Henry Cobb', 'cr...","[{'credit_id': '544475aac3a36819fb000578', 'de...","[{'id': 6054, 'name': 'friendship'}, {'id': 20..."
8691,41168,False,,1000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 10749,...",,392572,tt5165344,hi,रुस्तम,...,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,Decorated Officer. Devoted Family Man. Defendi...,Rustom,False,7.3,25.0,"[{'cast_id': 0, 'character': 'Rustom Pavri', '...","[{'credit_id': '5951baf692514129c4016600', 'de...","[{'id': 10540, 'name': 'bollywood'}]"
8692,41221,False,,15050000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,402672,tt3859980,hi,Mohenjo Daro,...,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,,Mohenjo Daro,False,6.7,26.0,"[{'cast_id': 0, 'character': 'Sarman', 'credit...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...","[{'id': 10540, 'name': 'bollywood'}]"
8693,41387,False,,15000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,315011,tt4262980,ja,シン・ゴジラ,...,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",Released,A god incarnate. A city doomed.,Shin Godzilla,False,6.6,152.0,"[{'cast_id': 4, 'character': 'Rando Yaguchi : ...","[{'credit_id': '560892fa92514177550018b2', 'de...","[{'id': 1299, 'name': 'monster'}, {'id': 7671,..."


In [101]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


# Applying literal_eval to get the right data type from the expression of string
smd['cast'] = smd['cast'].apply(ast.literal_eval)
smd['crew'] = smd['crew'].apply(ast.literal_eval)
smd['keywords'] = smd['keywords'].apply(ast.literal_eval)
smd['genres'] = smd['genres'].apply(ast.literal_eval)

smd['director'] = smd['crew'].apply(get_director) 

# Taking all the movie cast in a list and then taking only the top 3 cast
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])# Strip Spaces and Convert to Lowercase

smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

smd['genres'] = smd['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['genres'] = smd['genres'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x,x])  # giving more weight to the director relative to the entire cast

In [102]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in keywords_count.keys():
            words.append(i)
    return words


# Creating the count of every keywords
keywords_count = dict()
for i in range(len(smd['keywords'])):
    for j in range(len(smd['keywords'][i])):
        if smd['keywords'][i][j] not in keywords_count.keys():
            keywords_count[smd['keywords'][i][j]] = 0
        keywords_count[smd['keywords'][i][j]] +=1

# removing those keywords which occur only once
for i in list(keywords_count):
    if keywords_count[i] == 1:
        del keywords_count[i]

In [103]:
# preprocessing

# Stemming the words 
stemmer = SnowballStemmer('english')

smd['keywords'] = smd['keywords'].apply(filter_keywords) # removing those keywords which occur only once
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# combining keywords, cast, director and genres
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))
smd['soup'][0] 

'jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif tomhanks timallen donrickles johnlasseter johnlasseter johnlasseter animation comedy family'

In [104]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2) ,min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)

titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])  # Creating a mapping between movie and title and index

In [105]:
df_cosine=pd.DataFrame(cosine_sim)
df_cosine


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8685,8686,8687,8688,8689,8690,8691,8692,8693,8694
0,1.000000,0.024419,0.027390,0.025777,0.024419,0.000000,0.026547,0.029348,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.023816,0.020947,0.030500,0.000000,0.000000,0.000000,0.0
1,0.024419,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.061633,0.027067,0.021592,...,0.000000,0.000000,0.000000,0.050016,0.000000,0.000000,0.000000,0.029735,0.025641,0.0
2,0.027390,0.000000,1.000000,0.060718,0.028760,0.000000,0.062531,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.024671,0.000000,0.034565,0.033352,0.000000,0.0
3,0.025777,0.000000,0.060718,1.000000,0.027067,0.019263,0.058849,0.032530,0.000000,0.000000,...,0.000000,0.000000,0.023669,0.026398,0.023218,0.033806,0.032530,0.062776,0.027067,0.0
4,0.024419,0.000000,0.028760,0.027067,1.000000,0.000000,0.027875,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.021995,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8690,0.030500,0.000000,0.000000,0.033806,0.000000,0.022792,0.000000,0.038490,0.000000,0.000000,...,0.000000,0.000000,0.028006,0.031235,0.000000,1.000000,0.000000,0.037139,0.032026,0.0
8691,0.000000,0.000000,0.034565,0.032530,0.000000,0.021932,0.033501,0.000000,0.032530,0.025950,...,0.000000,0.000000,0.026948,0.000000,0.000000,0.000000,1.000000,0.071474,0.000000,0.0
8692,0.000000,0.029735,0.033352,0.062776,0.000000,0.021162,0.032325,0.107211,0.031388,0.025039,...,0.000000,0.000000,0.026003,0.087002,0.000000,0.037139,0.071474,1.000000,0.089205,0.0
8693,0.000000,0.025641,0.000000,0.027067,0.000000,0.036497,0.000000,0.154083,0.081200,0.043183,...,0.033389,0.000000,0.022422,0.125039,0.065986,0.032026,0.000000,0.089205,1.000000,0.0


In [106]:
indices


title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        8690
Rustom                                                8691
Mohenjo Daro                                          8692
Shin Godzilla                                         8693
The Beatles: Eight Days a Week - The Touring Years    8694
Length: 8695, dtype: int64

In [107]:
list_of_titles=smd['title'].tolist()
def get_recommendations(title):
    find_close_match=difflib.get_close_matches(title,list_of_titles)
    close_match=find_close_match[0]
    idx = indices[close_match] # movie id corrosponding to the given title 
    sim_scores = list(enumerate(cosine_sim[idx])) # list of cosine similarity scores value along the given index
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sorting the given scores in ascending order
    sim_scores = sim_scores[1:31] # Taking only the top 30 scores
    movie_indices = [i[0] for i in sim_scores] # Finding the indices of 30 most similar movies
    return titles.iloc[movie_indices] 

#get_recommendations("The Avengers").head(10)

    



In [108]:
l = [None for i in range(10)]
def search_movies():
    for i in range(10):
        if l[i] is not None:
            l[i].grid_remove()
    title=e.get()
    find_close_match=difflib.get_close_matches(title,list_of_titles)
    close_match=find_close_match[0]
    ll=list(get_recommendations(title).head(10))
    l[9] = Label(frame ,  text = close_match, bg="light cyan")
    l[9].grid(row = 2, column = 13, sticky = W, pady = 5)
    for i in range(9):
        l[i] = Label(frame ,  text = ll[i], bg="light cyan")
        l[i].grid(row = 3+i, column = 13, sticky = W, pady = 5)
        

In [109]:
# columns to use for training
columns = ['userId','movieId','rating']

# create reader from surprise 
# the rating should lie in the provided scale
reader = Reader(rating_scale =(0.5,5))

#create dataset from dataframe
data = Dataset.load_from_df(ratings[columns],reader)

# create trainset ie the data which is present (ratings of those movies which are rated by respective users)
trainset = data.build_full_trainset()

# create testset, here the anti_testset is testset
# data containing users movie pairs which are not rated by that particular user
testset = trainset.build_anti_testset()
 

model = SVD(n_epochs = 25, verbose = True) #n_epochs:The number of iteration of the SGD(simple gradient descent) procedure. Default is 20
                                           #verbose:If True, prints the current epoch. Default is False.
    
cross_validate(model, data, measures=['RMSE','MAE'], cv= 5, verbose= True)
print('Training Done')

#prediction
prediction = model.test(testset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 0
P

## Collaborative filtering

In [110]:
l = [None for i in range(10)]
def genre_based():
    for i in range(10):
        if l[i] is not None:
            l[i].grid_remove()
    event = clicked.get()
    a=l11.get()
    b=l12.get()
    c=l13.get()
    d=l14.get()
    e=l15.get()
    k=0
    if a=='0':
        if b=='0':
            if c=='0':
                if d=='0':
                    if e=='0':
                        k=1
    # produced recommendations for the user based on the rating provided to selected movies if no genre is selected else recommend top movies from the selected genres
    if event == "Select Genre":
        rtts = list()
        rtts.append(l11.get())
        rtts.append(l12.get())
        rtts.append(l13.get())
        rtts.append(l14.get())
        rtts.append(l15.get())
        movie_ratings = [int(i) for i in rtts]
        print(movie_ratings)
        
        example = { 'userId' : [99999,99999,99999,99999,99999],
           'movieId' : [155, 13,637,550, 11],
           'rating'  : movie_ratings,
                  }
        
        df = pd.DataFrame(example)
        frames = [ratings, df]
        result = pd.concat(frames)
        
        #create dataset from dataframe
        data= Dataset.load_from_df(result[columns],reader)

        #create trainset
        trainset= data.build_full_trainset()

        #create testset, here the anti_testset is testset
        testset = trainset.build_anti_testset()

        cross_validate(model,data, measures=['RMSE','MAE'], cv= 5, verbose= True)
        print('Training Done')

        #prediction
        prediction = model.test(testset)
        prediction[99999]

        # An RMSE value of less than 2 is considered good
        #Now Recommend Users top 10 movies based on prediction

        from collections import defaultdict
        def get_top_n(prediction, n):

            # First map the predictions to each user.
            top_n = defaultdict(list)
            for uid, iid, true_r, est, _ in prediction:
                top_n[uid].append((iid, est))

            # Then sort the predictions for each user and retrieve the n highest ones.
            for uid, user_ratings in top_n.items():
                user_ratings.sort(key=lambda x: x[1], reverse=True)
                top_n[uid] = user_ratings[:n]

            return top_n

        rcmnd = []
        rk = []
        top_n = get_top_n(prediction, n=30)
        for uid, user_ratings in top_n.items():
            if uid == 99999:
                for (iid,rating) in user_ratings:
                    for i in range(smd.shape[0]):
                        if smd['id'][i] == iid:
                            rcmnd.append([smd['id'][i],smd['title'][i]])
                            rk.append([smd['id'][i],smd['genres'][i]])
                break
    
        for i in range(min(10,len(rcmnd))):
            l[i] = Label(frame ,  text = rcmnd[i][1], bg="light cyan")
            l[i].grid(row = 2+i, column = 13, sticky = W, pady = 5)                
    else:
        if k==1:
            ll = list((genres_based[event][['title']].head(10))['title'])
            for i in range(min(10,len(ll))):
                l[i] = Label(frame ,  text = ll[i], bg="light cyan")
                l[i].grid(row = 2+i, column = 13, sticky = W, pady = 5)
        else: 
            rtts = list()
            rtts.append(l11.get())
            rtts.append(l12.get())
            rtts.append(l13.get())
            rtts.append(l14.get())
            rtts.append(l15.get())
            movie_ratings = [int(i) for i in rtts]
            print(movie_ratings)
        
        
            example = { 'userId' : [99999,99999,99999,99999,99999],
                 'movieId' : [155, 13,637,550, 11],
                  'rating'  : movie_ratings,
                      }
        
            df = pd.DataFrame(example)
            frames = [ratings, df]
            result = pd.concat(frames)
        
        #create dataset from dataframe
            data= Dataset.load_from_df(result[columns],reader)

        #create trainset
            trainset= data.build_full_trainset()

        #create testset, here the anti_testset is testset
            testset = trainset.build_anti_testset()

            cross_validate(model,data, measures=['RMSE','MAE'], cv= 5, verbose= True)
            print('Training Done')

        #prediction
            prediction = model.test(testset)
            prediction[99999]

        # An RMSE value of less than 2 is considered good
        #Now Recommend Users top 10 movies based on prediction

            from collections import defaultdict
            def get_top_n(prediction, n):
                

            # First map the predictions to each user.
                top_n = defaultdict(list)
                for uid, iid, true_r, est, _ in prediction:
                    top_n[uid].append((iid, est))

            # Then sort the predictions for each user and retrieve the n highest ones.
                for uid, user_ratings in top_n.items():
                    user_ratings.sort(key=lambda x: x[1], reverse=True)
                    top_n[uid] = user_ratings[:n]

                return top_n

            rcmnd = []
            rk = []
            top_n = get_top_n(prediction, n=1000)
            for uid, user_ratings in top_n.items():
                if uid == 99999:
                    for (iid,rating) in user_ratings:
                        for i in range(smd.shape[0]):
                            if smd['id'][i] == iid:
                                rcmnd.append([smd['id'][i],smd['title'][i]])
                                rk.append([smd['id'][i],smd['genres'][i]])
                    break
                    
            mp=[]
            
            genre1=['drama', 'thriller','family', 'animation','comedy','romance','action', 'adventure','sciencefiction','western','horror','crime',
                     'fantasy','documentary','history','tvmovie','war','mystery','music']
            genre2=difflib.get_close_matches(event,genre1)
            event=genre2[0]
            for i in range(len(rk)):
                for genre in rk[i][1]:
                    if genre == event:
                        mp.append(rcmnd[i][1])
            
            for i in range(min(10,len(mp))):
                l[i] = Label(frame ,  text = mp[i], bg="light cyan")
                l[i].grid(row = 2+i, column = 13, sticky = W, pady = 5) 

                           
                
            
    
            

In [111]:
# temp dataframe contains movies which are present in both ratings and movies_data dataframe

id_list = list(ratings['movieId'])
temp = movies_data.loc[movies_data['id'].isin(id_list)]
temp = temp.reset_index()
print(temp[['id' , 'title']].head(10))
temp['title'][1]

     id                      title
0   949                       Heat
1   710                  GoldenEye
2  1408           Cutthroat Island
3   524                     Casino
4  4584      Sense and Sensibility
5     5                 Four Rooms
6  8012                 Get Shorty
7   451          Leaving Las Vegas
8   902  The City of Lost Children
9    63             Twelve Monkeys


'GoldenEye'

In [112]:
# This snippet is used to find top ranked movies according to imdb score which are present in temp dataframe.

available_movies = []

for movie in list(t1['title']):
    movie = movie.lower()
    for i in range(temp.shape[0]):
        name = temp['title'][i].lower()
        if name == movie:
            available_movies.append((temp['id'][i] , movie))
            
print(available_movies)

[(278, 'the shawshank redemption'), (238, 'the godfather'), (155, 'the dark knight'), (550, 'fight club'), (680, 'pulp fiction'), (13, 'forrest gump'), (424, "schindler's list"), (129, 'spirited away'), (1891, 'the empire strikes back'), (637, 'life is beautiful'), (240, 'the godfather: part ii'), (122, 'the lord of the rings: the return of the king'), (101, 'leon: the professional'), (510, "one flew over the cuckoo's nest"), (497, 'the green mile'), (11, 'star wars')]


In [113]:
root=Tk()
root.title('MovieFlex')
root.iconbitmap('play.ico')
#root.geometry('1000x600')

# create a menubar
menubar = Menu(root)

menubar.config(bg = "GREEN",fg='white',activebackground='red',activeforeground='pink',font=("Verdana",20))
# if you are using Os:-windows tha color of menubar is set default and can't be alter 

# create the file_menu
file_menu = Menu(
	menubar,
	tearoff=0,
	bg="#F0F0FF"
)

# add menu items to the File menu
file_menu.add_command(label='New')
file_menu.add_command(label='Open...')
file_menu.add_command(label='Close')
file_menu.add_separator()

# add Exit menu item
file_menu.add_command(
	label='Exit',
	command=root.destroy
)

# add the File menu to the menubar
menubar.add_cascade(
	label="Home",
	menu=file_menu
)

#create the movie_menu
movie_menu = Menu(
	menubar,
	tearoff=0,
	bg="#F0F0FF"
)

def Movies():
	global top1
	top1=Toplevel()
	top1.title('Movies')
	top1.iconbitmap('play.ico')
	top1.geometry("800x550")
	top1.config(bg="black")
	options1=[
	   '10',
	   '20',
	   '30',
	   '40'
	]
	Font_tuple = ("Comic Sans MS", 20, "bold")
	l1=Label(top1, text="Movie Recommender System", fg="orange", bg="black",font=Font_tuple)
	l1.grid(row=0,column=1, sticky=S,pady=20)
	l2 = Label(top1, text = "Select list of top movies : ", font=('Helvetica',15,'bold'), bg="black", fg="#7F7FFF")
	l2.grid(row = 3,column = 0,sticky = W, pady = 5,padx=15)
	# datatype of menu text
	global clicked2
	clicked2 = IntVar()
	# initial menu text
	clicked2.set( "Top Movies" )
	# button widget with green color text
	button = Button(top1, text = "SUBMIT" , fg = "white",bg = "green",command=tpmovies)
	button.grid(row = 9,column = 1, sticky = S)
	# Create Dropdown menu
	drop = OptionMenu( top1, clicked2 , *options1)
	drop.grid(row = 5,column = 0, sticky = W, padx=25, pady=5)


movie_menu.add_command(label='Top Movies', command=Movies)

#add the movie menu to menubar
menubar.add_cascade(
	label="Movie",
	menu=movie_menu

)

#create the genre_menu

genre_menu = Menu(
	menubar,
	tearoff=0,
	bg="#F0F0FF"
)

def dropdown():
	global top
	top=Toplevel()
	top.title('Genres')
	top.iconbitmap('play.ico')
	top.geometry("800x550")
	options = [
	   'Action',
	   'Adventure',
	   'Animation',
	   'Comedy',
	   'Crime',
	   'Documentary',
	   'Drama',
	   'Family',
	   'Fantasy',
	   'History',
	   'Horror',
	   'Music',
	   'Mystery',
	   'Romance',
	   'Science Fiction',
	   'TV Movie',
	   'Thriller',
	   'War',
	   'Western'
	]
	l1=Label(top, text="Movie Recommender System", fg="blue")
	l1.grid(row=0,column=1)
	l2 = Label(top, text = "Select genre of the movie you want to watch : ")
	l2.grid(row = 1,column = 0,sticky = W, pady = 2)
	# datatype of menu text
	global clicked1
	clicked1 = StringVar()
	# initial menu text
	clicked1.set( "Genre" )
	# button widget with green color text
	button = Button(top, text = "SUBMIT" , fg = "white",bg = "green",command=gen)
	button.grid(row = 9,column = 1, sticky = S)

	# Create Dropdown menu
	drop = OptionMenu( top, clicked1 , *options)
	drop.grid(row = 3,column = 0, sticky = W)


genre_menu.add_command(label='Select Genres', command=dropdown)
#add the movie menu to menubar

menubar.add_cascade(
	label="Genre",
	menu=genre_menu

)



# create the Help menu
help_menu = Menu(
	menubar,
	tearoff=0,
	bg="#F0F0FF"
)

def welcome():
	top=Toplevel()
	top.title('Welcome')
	top.iconbitmap('play.ico')
	top.config(bg="black")
	top.geometry('600x350')
	ln=Label(top,text="Welcome!",pady=40,font=("Copper Black",30,'bold'),fg="#7F7FFF", bg="black").pack()
	ln1=Label(top,text=" Welcome to Movieflex!",padx=10,pady=5,fg="salmon",bg="black",font=(15)).pack()
	ln1=Label(top,text="We wish you have a great experience with us and the movie we suggest!",pady=10,fg="salmon",bg="black",font=(15)).pack()


def about():
	top=Toplevel()
	top.title('About')
	top.iconbitmap('play.ico')
	top.config(bg="black")
	ln=Label(top,text="MovieFlex",pady=40,font=("Copper Black",30,'bold'),fg="#7F7FFF", bg="black").pack()
	ln1=Label(top,text="Movieflex is a platform, which recommends its users the best of the movies to experience based on standard ratings used internationally.",padx=10,pady=5,fg="indian red",bg="black",font=('Helvetica',12)).pack()
	ln2=Label(top,text="Movieflex also aims personalisation of rating of movies, so that every user can have a result of recommendations which are a merger of their own choices and international standards.",padx=10,pady=5,fg="indian red",bg="black",font=('Helvetica',12)).pack()
	ln3=Label(top,text="This way, Movieflex provides an enriched set of recommendations, personalised to its every user. We, at Movieflex strive to give the best of experience and we hope that you are enjoying our service.",padx=10,pady=5,fg="indian red",bg="black",font=('Helvetica',12)).pack()
	ln4=Label(top,text="Hope! you are enjoying our service.",pady=10,fg="indian red",bg="black",font=('Helvetica',12)).pack()
	ln5=Label(top,text=" ",pady=50,bg="black").pack()


help_menu.add_command(label='Welcome',command=welcome)
help_menu.add_command(label='About...',command=about)

# add the Help menu to the menubar
menubar.add_cascade(
	label="Help",
	menu=help_menu
)

ln2=Label(root,text="MovieFlex",font=("Copper Black",30,'bold'),fg="#7F7FFF", bg="black")
ln2.pack(pady=5)
e=Entry(root, width=30, font=('Helvetica',18), borderwidth=5)
e.pack(pady=10)

btn=Button(root,text="Search Movies",font=('Helvetica',12,'bold'),fg="#40768D",command=search_movies)
btn.pack()

frame=LabelFrame(root,padx=50,pady=10,bg="light cyan")
frame.pack(padx=15,pady=50)

# Dropdown menu options
options = [
    'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'
]

# adding a label to the root window
l1 = Label(frame, text = "MOVIE  RECOMMENDER  SYSTEM", fg = "tomato",font=('Helvetica',26, 'bold'),bg='light cyan')
l1.grid(row = 0,column = 3,columnspan=12,pady=20,sticky=W)
l2 = Label(frame, text = "Select genre of the movie you want to watch : ",bg="light cyan",font=('Helvetica',12,'bold'))
l2.grid(row = 1,column = 0,columnspan=5,sticky = W, pady = 2)


# datatype of menu text
clicked = StringVar()

# initial menu text
clicked.set( "Select Genre")

# Create Dropdown menu
drop = OptionMenu( frame , clicked , *options)
drop.grid(row = 2,column = 0, sticky = W)

# Creating seperators for better UI 
x1 = tkinter.ttk.Separator(frame, orient=VERTICAL).grid(column=5, row=1, rowspan=12, sticky='ns')

l3 = Label(frame, text = "Rate the following movies",bg="light cyan",font=('Helvetica',12,'bold'))
l3.grid(row = 1,column = 6,columnspan=3,sticky = W, pady = 2)

# labels for movies name
l4 = Label(frame, text="Movies Name",bg="light cyan",font=('Helvetica',10,'bold')).grid(row=2, column=6, columnspan=2,sticky=W,pady=5)
l5 = Label(frame, text="The Dark Knight",bg="light cyan").grid(row=3,  column=6, columnspan=3,sticky=W,pady=5) # 278
l6 = Label(frame, text="Forest Gump",bg="light cyan").grid(row=4,  column=6, columnspan=2,sticky=W,pady=5) # 13
l7 = Label(frame, text="Life is Beautiful",bg="light cyan").grid(row=5, column=6, columnspan=2,sticky=W,pady=5) #637
l8 = Label(frame, text="Fight Club",bg="light cyan").grid(row=6, column=6, columnspan=3,sticky=W,pady=5) #122
l9 = Label(frame, text="Star Wars",bg="light cyan").grid(row=7, column=6, columnspan=2,sticky=W,pady=5) # 11

# label for movies rating
l10 = Label(frame, text="Rate the movie on the scale of 5",bg="light cyan",font=('Helvetica',10,'bold')).grid(row=2, column=9,columnspan=3,sticky=W,pady=5)
l11 = Spinbox(frame, from_= 0, to = 5)
l11.grid(row=3, column=9,columnspan=2,sticky=W,padx=25,pady=5)
l12 = Spinbox(frame, from_= 0, to = 5)
l12.grid(row=4, column=9,columnspan=2,sticky=W,padx=25,pady=5)
l13= Spinbox(frame, from_= 0, to = 5)
l13.grid(row=5, column=9,columnspan=2,sticky=W,padx=25,pady=5)
l14 = Spinbox(frame, from_= 0, to = 5)
l14.grid(row=6, column=9,columnspan=2,sticky=W,padx=25,pady=5)
l15 = Spinbox(frame, from_= 0, to = 5)
l15.grid(row=7, column=9,columnspan=2,sticky=W,padx=25,pady=5)


# button widget with green color text
button = Button(frame, text = "SUBMIT" , fg = "white",bg = "green",command = genre_based)
button.grid(row = 14,column = 8, sticky = S,pady=50)

# Creating seperators for better UI 
x2 = tkinter.ttk.Separator(frame, orient=VERTICAL).grid(column=12, row=1, rowspan=12, sticky='ns',padx=5)

l16 = Label(frame, text="Results",bg="light cyan",font=('Helvetica',12,'bold')).grid(row=1, column= 13,columnspan=4, sticky = W, pady=10,padx=80)

root.config(menu=menubar, bg="black")
root.mainloop()

[2, 0, 2, 0, 4]
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Proc