# *Video Recommender - IBM Summer Project*

(I024) Abhinav Kumar

(I026) Abhinav Kumar

(I027) Saket Lakhotia

(I030) Anishka Moona

(I050) Nishidh Singh Shekhawat


Dataset: Movies + User Ratings Dataset

We selected movies as our dataset in video recommendations as it contains simillar metadata and same concepts can be applied to videos.


## Imports

In [1]:
# Importing important libraries
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download(["punkt","stopwords","wordnet"])
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from flask import Flask, render_template, jsonify, request, redirect
from flask_table import Table, Col, LinkCol

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nishidh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nishidh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nishidh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data exploration

In [2]:
# Loading the dataset
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(10329, 3)

The movies dataset contains the movie id along with tile and genre of 10329 Movies

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [6]:
ratings.shape

(105339, 4)

The ratings dataset contains a userId, movieId he watched, and the rating he gave along with a timestamp

In [7]:
# Extracting genres
genre = set()

for txt in movies['genres']:
  x = txt.split("|")
  for g in x:
    genre.add(g)
genre

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

We can see the dataset contains movies with 19 genres and some with no genre listed

## Creating Class for recommendation

We create multiple functions inside the class based on recommendation type

We recommend movies based on:

1. Ratings Correlation
2. Simillar Users
3. Cosine Similarity

In [8]:
class RecommendationSystem:
    def __init__(self):
        self.movies = pd.read_csv("movies.csv")
        self.ratings = pd.read_csv("ratings.csv")
        self.dataset = self.movies.merge(self.ratings)
        
    def sort_movies_by_year(self,li):
        def merge_sort(a,l,r):
            if l==r:
                return
            mid=(l+r)//2
            merge_sort(a,l,mid)
            merge_sort(a,mid+1,r)
            merge(a,l,mid,r)

        def merge(a,l,mid,r):
            n1=mid-l+1
            n2=r-(mid+1)+1
            L=[a[i+l] for i in range(n1)]
            R=[a[i+mid+1] for i in range(n2)]
            i,j,k=0,0,l
            while(i<n1 and j<n2):
                if int(L[i][-5:-1])>int(R[j][-5:-1]) :
                    a[k]=L[i]
                    i+=1
                else:
                    a[k]=R[j]
                    j+=1
                k+=1
            while(i<n1):
                a[k]=L[i]
                i+=1
                k+=1
            while(j<n2):
                a[k]=R[j]
                j+=1
                k+=1
        merge_sort(li,0,len(li)-1)
    
    
    def get_movie_by_id(self,mv_id):
        return self.movies.loc[rs.movies['movieId']==mv_id,['title']].values[0][0]
    
    
    def clean_feature_and_return_ndarray(self,genres):
        lemmatizer = WordNetLemmatizer()
        li=[]
        for i in range(len(genres)):
            temp = genres[i].lower()
            temp = temp.split("|")
            temp = [lemmatizer.lemmatize(word) for word in temp]
            li.append(" ".join(temp))
        
        cv = CountVectorizer()
        return cv.fit_transform(li).toarray(),cv,li
        
    def content_based_filtering(self,userId,no_of_movies=15):
        #Finding based on similar movies
        
        X,cv,li = self.clean_feature_and_return_ndarray(self.movies["genres"])
        movies_dataset = pd.DataFrame(li,columns=["genres"],index=self.movies["title"])
        
        def get_movie_by_index(movies_dataset,idx):   
            return movies_dataset.index[idx]
        
        similarities = cosine_similarity(X)
        
        def latest_movieId_watched(uid):
            time = self.ratings.loc[self.ratings["userId"]==uid,["movieId","timestamp"]]
            return time.sort_values(by="timestamp",ascending=False)["movieId"].values[0]
        
        latest_movieId_watched_by_user = latest_movieId_watched(userId)
        movie_index = self.movies.loc[self.movies['movieId']==latest_movieId_watched_by_user,["title"]].index[0]
        similarity_values = pd.Series(similarities[movie_index])
        
        
        similar_movie_indexes = list(similarity_values.sort_values(ascending=False).index)
        similar_movie_indexes.remove(movie_index) #Remove the already watched movie from index list
        
        li = [get_movie_by_index(movies_dataset,idx) for idx in similar_movie_indexes]
        li = li[:no_of_movies]
        
        self.sort_movies_by_year(li)
        
        print("Since u have watched: ",self.get_movie_by_id(latest_movieId_watched_by_user),"<--- \n Recommended:",end="\n\n")
        
        
        for i in range(no_of_movies):
            print(li[i])
        
        return li
    
    def collaborative_filtering(self,uid,no_of_movies=15):
        #Finding based on similar users
        
        X,cv,li = self.clean_feature_and_return_ndarray(self.dataset['genres'])
        genres = pd.DataFrame(X,columns=cv.get_feature_names())
        
        users = pd.DataFrame(self.dataset['userId'],columns=['userId'])
        users = users.join(genres)
        
        users_moviemat = users.groupby("userId").sum()
        X = users_moviemat.iloc[:,:].values

        classifier = NearestNeighbors()
        classifier.fit(X)
        
        li = classifier.kneighbors([X[uid-1]],n_neighbors=5,return_distance=False)
        current_user = self.dataset.loc[self.dataset["userId"]==li[0][0],:]["title"].values
        similar_user = self.dataset.loc[self.dataset["userId"]==li[0][1],:]["title"].values
 
        movies_list = [movie for movie in similar_user if movie not in current_user]
        
        self.sort_movies_by_year(movies_list)
        
        
        print("U May Like These Movies \n\n")
        
        for i in range(no_of_movies):
            print(movies_list[i])
            
        return movies_list[:no_of_movies]
            
    def based_on_ratings(self,movieId):
        avg_ratings = self.dataset.groupby("title")['rating'].mean()
        count = self.dataset.groupby("title")['rating'].count()
        dataset_based_on_ratings = pd.DataFrame({"rating":avg_ratings,"number of ratings":count})
        
        
        df = self.dataset.loc[:,["userId","rating","title"]]
        users_movie_matrix = pd.pivot_table(df,columns='title',index='userId',values='rating') 
        
        
        movie_watched = users_movie_matrix[self.get_movie_by_id(movieId)]
        li = []
        for i in range(len(users_movie_matrix.columns)):
            li.append(movie_watched.corr(users_movie_matrix.iloc[:,i]))
        li = pd.Series(li)
        
        
        df = pd.DataFrame({"title": users_movie_matrix.columns,"Correlation": li,"number of ratings" : dataset_based_on_ratings["number of ratings"].values})
        recommendation_set = df[df["number of ratings"] >= 50].sort_values(by=["Correlation","number of ratings"],ascending=False)
        
        recommended_movies = recommendation_set["title"].values
        print("Movies which have similar ratings like given movie --->",self.get_movie_by_id(movieId),"<--- are",end="\n\n")
        
        for i in range(1,16):
            print(recommended_movies[i])
        return recommended_movies
            
        
    def recommend(self,user_id=None,movie_id=None):
        if movie_id is None and user_id is None:
            print("Error, No user id or movie id found")
        elif user_id is not None and movie_id is not None:
            rmcb = self.content_based_filtering(user_id)
            print("\n\n\n")
            rmc = self.collaborative_filtering(user_id)
            print("\n\n\n")
            rm = self.based_on_ratings(movie_id)
            return rmcb,rmc,rm
        elif user_id is None and movie_id is not None:
            rm = self.based_on_ratings(movie_id)
            return rm
        else:
            rm = self.content_based_filtering(user_id)
            
            print("\n\n\n")
            rmc = self.collaborative_filtering(user_id)
            return rm,rmc
            
        

Now that all the methods have been implemented in the class we can test them below by creating an object and calling the respective functions

In [9]:
# Creating a object of the class
rs = RecommendationSystem()

rs.recommend(movie_id= 1000) # Based on Cosine Similarity
rs.recommend(user_id = 145 ,movie_id = 1000)  # Ratings Correlation
rs.recommend(user_id = 145) #  Based on SimilarUsers

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Movies which have similar ratings like given movie ---> Curdled (1996) <--- are

Star Trek: First Contact (1996)
Leaving Las Vegas (1995)
Liar Liar (1997)
Lost World: Jurassic Park, The (1997)
Chasing Amy (1997)
Wedding Singer, The (1998)
Devil's Advocate, The (1997)
Mars Attacks! (1996)
Independence Day (a.k.a. ID4) (1996)
Mission: Impossible (1996)
Jerry Maguire (1996)
Grosse Pointe Blank (1997)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Pulp Fiction (1994)
Forrest Gump (1994)
Since u have watched:  Usual Suspects, The (1995) <--- 
 Recommended:

Man from Reno (2014)
Love Crime (Crime d'amour) (2010)
Righteous Kill (2008)
Mercy (2000)
Switchback (1997)
Thunderheart (1992)
Frantic (1988)
St. Ives (1976)
Farewell, My Lovely (1975)
Murder on the Orient Express (1974)
Last of Sheila, The (1973)
Sherlock Holmes: Terror by Night (1946)
Sherlock Holmes and the Secret Weapon (1942)
Hound of the Baskervilles, The (1939)
Adventures of Sherlock Holmes, The (1939)




U May Like These Movies 


W

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Movies which have similar ratings like given movie ---> Curdled (1996) <--- are

Star Trek: First Contact (1996)
Leaving Las Vegas (1995)
Liar Liar (1997)
Lost World: Jurassic Park, The (1997)
Chasing Amy (1997)
Wedding Singer, The (1998)
Devil's Advocate, The (1997)
Mars Attacks! (1996)
Independence Day (a.k.a. ID4) (1996)
Mission: Impossible (1996)
Jerry Maguire (1996)
Grosse Pointe Blank (1997)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Pulp Fiction (1994)
Forrest Gump (1994)
Since u have watched:  Usual Suspects, The (1995) <--- 
 Recommended:

Man from Reno (2014)
Love Crime (Crime d'amour) (2010)
Righteous Kill (2008)
Mercy (2000)
Switchback (1997)
Thunderheart (1992)
Frantic (1988)
St. Ives (1976)
Farewell, My Lovely (1975)
Murder on the Orient Express (1974)
Last of Sheila, The (1973)
Sherlock Holmes: Terror by Night (1946)
Sherlock Holmes and the Secret Weapon (1942)
Hound of the Baskervilles, The (1939)
Adventures of Sherlock Holmes, The (1939)




U May Like These Movies 


W

(['Man from Reno (2014)',
  "Love Crime (Crime d'amour) (2010)",
  'Righteous Kill (2008)',
  'Mercy (2000)',
  'Switchback (1997)',
  'Thunderheart (1992)',
  'Frantic (1988)',
  'St. Ives (1976)',
  'Farewell, My Lovely (1975)',
  'Murder on the Orient Express (1974)',
  'Last of Sheila, The (1973)',
  'Sherlock Holmes: Terror by Night (1946)',
  'Sherlock Holmes and the Secret Weapon (1942)',
  'Hound of the Baskervilles, The (1939)',
  'Adventures of Sherlock Holmes, The (1939)'],
 ['Waterworld (1995)',
  'Die Hard: With a Vengeance (1995)',
  'Seven (a.k.a. Se7en) (1995)',
  'Babe (1995)',
  'GoldenEye (1995)',
  'True Lies (1994)',
  'Lion King, The (1994)',
  'Forrest Gump (1994)',
  'Shawshank Redemption, The (1994)',
  'Specialist, The (1994)',
  'Nell (1994)',
  'Interview with the Vampire: The Vampire Chronicles (1994)',
  'Jurassic Park (1993)',
  'Fugitive, The (1993)',
  'Cliffhanger (1993)'])

## Creating a sql server to save the dataset movies in a database 

In [10]:
from sqlalchemy import create_engine
engine = create_engine('sqlite://', echo=False)

In [11]:
# adding movies dataset to the database
movies = pd.read_csv('movies.csv')
movies.to_sql('movieId', 
              con=engine,
              if_exists='replace',
              index=False)

abc = engine.execute("SELECT * FROM movieId").fetchall()
lis = []
lis2 = []
#Printing 1st 5 movies in database //Testing
for x in abc[:5]:
    name = x["title"]
    lis.append(name)
    id = x["movieId"]
    lis2.append(id)
print(lis)
lis2
#[str(x) for x in abc[:5]]

['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)']


[1, 2, 3, 4, 5]

## Deployement on flask

Making a basic webpage to display the recomendation system

In [12]:
app = Flask(__name__, template_folder='tmplate')
result = []
@app.route('/')
def main():
    movies = pd.read_csv('movies.csv') # Load again
    movies.to_sql('movieId', con=engine ,if_exists='replace',
           index=False)
    result = engine.execute("SELECT * FROM movieId").fetchall()
    return render_template('index.html', movies=result)
    
@app.route("/movies",methods=["POST","GET"])
def movies():  
    if request.method == 'POST':
        category_id = request.form['category_id'] 
        return
        
@app.route("/predict",methods=["POST","GET"])
def predict():  
    if request.method == 'POST':
        category_id = request.form['submitted'] 

        # Declare your table
        class ItemTable(Table):
            name = Col('Predictions')

        # Get some objects
        class Item(object):
            def __init__(self, name):
                self.name = name

        items = []
        for x in rs.recommend(movie_id= int(category_id))[1:16]:
            items.append(Item(str(x)))

        # Populate the table
        table = ItemTable(items)

        # Print the html    
        return render_template('predict.html', pred= table)


if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [02/Jun/2021 01:43:25] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Jun/2021 01:43:25] "[33mGET /style.css HTTP/1.1[0m" 404 -


## Conclusion

Video Recommendation system was implemented successfully by using movies dataset. Same technique can be applied to any video dataset which has genres or tags. We used three methods to recommend the videos and combined them together in a class. The model was then deployed on flask successfully. 

We learned a lot about recommendation methods and flask deployment whilde doing this project. 
