## Recomendation System For Movies

In [1]:
#libraries 
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv')
users = pd.read_csv('ratings.csv')

In [3]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
users.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
#modify the titles to make it easier to perform any NLP operation
#movies = movies.drop_duplicates(subset=['title'])
movies['mod_title']= movies['title'].apply(lambda x: x[:x.find('(')])
movies.head()

Unnamed: 0,movieId,title,genres,mod_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II


In [6]:
movies['mod_title'] = movies['mod_title'].str.lower()

In [7]:
#zero null titles, so we don't need to remove any rows from the dataset
movies['title'].isna().sum() 

0

In [8]:
movies[movies['title'].str.contains('Toy')]

Unnamed: 0,movieId,title,genres,mod_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story
1928,2017,Babes in Toyland (1961),Children|Fantasy|Musical,babes in toyland
2162,2253,Toys (1992),Comedy|Fantasy,toys
2993,3086,Babes in Toyland (1934),Children|Comedy|Fantasy|Musical,babes in toyland
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,toy story 2
4823,4929,"Toy, The (1982)",Comedy,"toy, the"
5731,5843,Toy Soldiers (1991),Action|Drama,toy soldiers
8533,26033,Giants and Toys (Kyojin to gangu) (1958),Comedy|Drama,giants and toys
14720,78062,Puppet Master vs. Demonic Toys (Puppet Master ...,Comedy|Fantasy|Horror|Sci-Fi|Thriller,puppet master vs. demonic toys
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,toy story 3


In [9]:
#ml packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import re

In [10]:
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(movies['mod_title'])

In [11]:
query = 'toy story'
processed = re.sub("[^a-zA-Z0-9 ]","", query.lower())
query_vec = vectorizer.transform([processed])
query_vec.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [12]:
similarity = cosine_similarity(query_vec, tfidf).flatten()
similarity

array([1., 0., 0., ..., 0., 0., 0.])

In [13]:
indices = np.argpartition(similarity,-10)[-5:]
indices

array([ 4823, 14813,  3021, 59767,     0])

In [14]:
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,mod_title
4823,4929,"Toy, The (1982)",Comedy,"toy, the"
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,toy story 3
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,toy story 2
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,toy story 4
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story


In [15]:
#function that acts as a search engine 
def search(query):
    vectorizer = TfidfVectorizer()

    tfidf = vectorizer.fit_transform(movies['mod_title'])
    
    processed = re.sub("[^a-zA-Z0-9 ]","", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    
    indices = np.argpartition(similarity,-10)[-5:]
    
    results = movies.iloc[indices]
    
    return results
   
query = 'Toy Story'

search_df = search(query)
search_df

Unnamed: 0,movieId,title,genres,mod_title
4823,4929,"Toy, The (1982)",Comedy,"toy, the"
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,toy story 3
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,toy story 2
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,toy story 4
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story


In [16]:
liked_mov_id = search_df['movieId'].iloc[-1]
liked_mov_id

1

In [17]:
liked_mov_genre = search_df['genres'].iloc[-1]
liked_mov_genre

'Adventure|Animation|Children|Comedy|Fantasy'

In [18]:
users.shape

(25000095, 4)

In [19]:
#get the set of users who also like the same movie as you
overlap_users = set()

for i in range(users.shape[0]):
    
    if users['movieId'].iloc[i] == liked_mov_id and users['rating'].iloc[i] >= 4.5:
        user_id = users['userId'].iloc[i]
        overlap_users.add(user_id)
        
    

In [20]:
similar_users = users[users['userId'].isin(overlap_users)]
similar_users

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5102,36,7,3.0,857131397
5103,36,10,3.0,857131163
5104,36,11,4.0,840790432
5105,36,34,5.0,834413787
...,...,...,...,...
24998887,162533,85774,4.0,1329514173
24998888,162533,88129,4.5,1329514139
24998889,162533,88744,1.5,1329514370
24998890,162533,89470,2.0,1329514353


In [21]:
merged_df = similar_users.merge(movies,on='movieId',how='inner')
merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,mod_title
0,36,1,5.0,857131378,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story
1,75,1,5.0,1537207651,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story
2,86,1,5.0,945462775,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story
3,90,1,5.0,863538043,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story
4,93,1,5.0,1496543050,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story


In [22]:
#look at the movies that have been seen by a fair amount of people 
freq_lst = ((merged_df['mod_title'].value_counts() > 300).apply(lambda x: x if x else np.NaN)).dropna()
freq_names = list(freq_lst.index)
freq_names

['toy story ',
 'forrest gump ',
 'star wars: episode iv - a new hope ',
 'shawshank redemption, the ',
 'pulp fiction ',
 'jurassic park ',
 'matrix, the ',
 'silence of the lambs, the ',
 'star wars: episode vi - return of the jedi ',
 'star wars: episode v - the empire strikes back ',
 'aladdin ',
 'back to the future ',
 'lion king, the ',
 'terminator 2: judgment day ',
 'independence day ',
 'raiders of the lost ark ',
 'braveheart ',
 'toy story 2 ',
 'apollo 13 ',
 'beauty and the beast ',
 "schindler's list ",
 'lord of the rings: the fellowship of the ring, the ',
 'shrek ',
 'fugitive, the ',
 'usual suspects, the ',
 'fargo ',
 'twelve monkeys ',
 'sixth sense, the ',
 'monsters, inc. ',
 'fight club ',
 'seven ',
 'men in black ',
 'lord of the rings: the two towers, the ',
 'batman ',
 'gladiator ',
 'saving private ryan ',
 'groundhog day ',
 'lord of the rings: the return of the king, the ',
 'american beauty ',
 'godfather, the ',
 'finding nemo ',
 'mission: impossibl

In [23]:
common_liked = merged_df[merged_df['mod_title'].isin(freq_names)]
common_liked['mod_genre'] = common_liked['genres'].str.replace('[|]'," ", regex=True)
common_liked

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_liked['mod_genre'] = common_liked['genres'].str.replace('[|]'," ", regex=True)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,mod_title,mod_genre
0,36,1,5.0,857131378,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story,Adventure Animation Children Comedy Fantasy
1,75,1,5.0,1537207651,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story,Adventure Animation Children Comedy Fantasy
2,86,1,5.0,945462775,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story,Adventure Animation Children Comedy Fantasy
3,90,1,5.0,863538043,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story,Adventure Animation Children Comedy Fantasy
4,93,1,5.0,1496543050,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story,Adventure Animation Children Comedy Fantasy
...,...,...,...,...,...,...,...,...
4810970,152653,135745,1.5,1440608023,Labyrinth (2012),Adventure|Fantasy,labyrinth,Adventure Fantasy
4811128,154484,187265,3.5,1524881814,The Accountant (2001),Comedy|Drama,the accountant,Comedy Drama
4811159,154484,187431,2.0,1524930699,Bad Company (1999),Drama|Romance,bad company,Drama Romance
4811266,155163,175433,4.0,1500204900,Get Out (2010),Animation|Comedy,get out,Animation Comedy


In [24]:
most_liked = common_liked.groupby('mod_title').mean()[['rating']].sort_values('rating',ascending=False).\
reset_index()
most_liked

Unnamed: 0,mod_title,rating
0,toy story,4.858535
1,planet earth,4.618056
2,"shawshank redemption, the",4.589576
3,schindler's list,4.474211
4,"godfather, the",4.459941
...,...,...
3010,police academy 6: city under siege,1.863260
3011,catwoman,1.804772
3012,baby geniuses,1.735905
3013,dumb and dumberer: when harry met lloyd,1.705634


In [25]:
#the ratings seem to be more popularity-based, we need to make it more niche -> use genres to filter 

In [26]:
genres_movies = common_liked[['mod_title','mod_genre']].drop_duplicates(subset=['mod_title'])
genres_movies

Unnamed: 0,mod_title,mod_genre
0,toy story,Adventure Animation Children Comedy Fantasy
18835,sabrina,Comedy Romance
21375,goldeneye,Action Adventure Thriller
26224,"american president, the",Comedy Drama Romance
29514,babe,Children Drama
...,...,...
4597180,crazy heart,Drama Romance
4599529,little children,Drama Romance
4609196,rocketman,Drama
4609541,swing kids,Drama War


In [27]:
general_rec_df = most_liked.merge(genres_movies, on='mod_title')
general_rec_df

Unnamed: 0,mod_title,rating,mod_genre
0,toy story,4.858535,Adventure Animation Children Comedy Fantasy
1,planet earth,4.618056,Documentary
2,"shawshank redemption, the",4.589576,Crime Drama
3,schindler's list,4.474211,Drama War
4,"godfather, the",4.459941,Crime Drama
...,...,...,...
3010,police academy 6: city under siege,1.863260,Comedy Crime
3011,catwoman,1.804772,Action Crime Fantasy
3012,baby geniuses,1.735905,Comedy
3013,dumb and dumberer: when harry met lloyd,1.705634,Comedy


In [28]:
vectorizer2 = TfidfVectorizer()

tfidf2 = vectorizer2.fit_transform(general_rec_df['mod_genre'])

In [29]:
query_genre = 'Adventure|Animation|Children|Comedy|Fantasy'
processed2 = re.sub("[|]"," ", query_genre)
processed2

'Adventure Animation Children Comedy Fantasy'

In [30]:
query_vec2 = vectorizer2.transform([processed2])
similarity2 = cosine_similarity(query_vec2, tfidf2).flatten()
similarity2

array([1.        , 0.        , 0.        , ..., 0.28763292, 0.28763292,
       0.        ])

In [31]:
indices2 = np.argpartition(similarity2,-10)[-50:]
indices2

array([1537, 2467, 2228, 2581,   79, 1788, 2200, 1850, 1737,  779, 2329,
        542, 1008,   68,   47,   26,  655, 1795, 1920, 2379, 1449,  518,
         41, 1744, 1955, 2289, 1592, 2620,  174,  433,  987,  412, 1664,
        557,  226,  158, 1657, 2238, 2208, 1488,   70, 2042, 2973,   12,
        320, 2444,   43, 1684,  749,    0])

In [32]:
recs_df = general_rec_df.iloc[indices2].sort_values('rating',ascending=False)
final_df = recs_df.iloc[1:11]
rec_lst = list(final_df['mod_title'])
rec_lst

['toy story 2 ',
 'spirited away ',
 'finding nemo ',
 'monsters, inc. ',
 'my neighbor totoro ',
 'incredibles, the ',
 'inside out ',
 'coco ',
 "kiki's delivery service ",
 'partly cloudy ']

In [33]:
#put the all above steps into a function with only 2 parameters (generated by previous function)
def suggested_lst(mov_ID,mov_genre): 
    overlap_users = set()

    for i in range(users.shape[0]):
    
        if users['movieId'].iloc[i] == mov_ID and users['rating'].iloc[i] >= 4.5:
            user_id = users['userId'].iloc[i]
            overlap_users.add(user_id)
            
    similar_users = users[users['userId'].isin(overlap_users)]
    merged_df = similar_users.merge(movies,on='movieId',how='inner')
    
    
    freq_lst = ((merged_df['mod_title'].value_counts() > 30).apply(lambda x: x if x else np.NaN)).dropna()
    freq_names = list(freq_lst.index)
    
    if (len(freq_names)==0):
        return []
    
    common_liked = merged_df[merged_df['mod_title'].isin(freq_names)]
    common_liked['mod_genre'] = common_liked['genres'].str.replace('[|]'," ", regex=True)
    
    most_liked = common_liked.groupby('mod_title').mean()[['rating']].sort_values('rating',ascending=False).\
    reset_index()
    
    genres_movies = common_liked[['mod_title','mod_genre']].drop_duplicates(subset=['mod_title'])
    
    general_rec_df = most_liked.merge(genres_movies, on='mod_title')
    
    vectorizer2 = TfidfVectorizer()
    tfidf2 = vectorizer2.fit_transform(general_rec_df['mod_genre'])
    
    query_genre = mov_genre
    processed2 = re.sub("[|]"," ", query_genre)
    
    query_vec2 = vectorizer2.transform([processed2])
    similarity2 = cosine_similarity(query_vec2, tfidf2).flatten()
    indices2 = np.argpartition(similarity2,-10)[-50:]
    
    recs_df = general_rec_df.iloc[indices2].sort_values('rating',ascending=False)
    final_df = recs_df.iloc[1:11]
    rec_lst = list(final_df['mod_title'])
    
    
    return rec_lst
    
    

In [34]:
#suggested_lst(liked_mov_id,liked_mov_genre)

In [37]:
def model(query):
    
    search_df = search(query)
    
    liked_mov_id = search_df['movieId'].iloc[-1]
    
    liked_mov_genre = search_df['genres'].iloc[-1]
    
    ans = suggested_lst(liked_mov_id, liked_mov_genre)
    
    return ans

In [38]:
model("Toy Story")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common_liked['mod_genre'] = common_liked['genres'].str.replace('[|]'," ", regex=True)


['toy story 2 ',
 'spirited away ',
 'finding nemo ',
 'monsters, inc. ',
 'my neighbor totoro ',
 'incredibles, the ',
 'inside out ',
 'coco ',
 "kiki's delivery service ",
 'partly cloudy ']

In [57]:
functions = {'func1':search, 'func2':suggested_lst, 'func3':model}

In [58]:
import os 
import pickle

In [59]:
current = os.getcwd()
current

'/Users/u71072/Desktop/Rec_System'

In [60]:
model_fp = os.path.join(current,"model")
os.makedirs(model_fp, exist_ok=True)
model_file = os.path.join(model_fp, "model.pkl")
with open(model_file, "wb") as f:
    pickle.dump(functions, f)

- Pickle does not seem to deserialize without error in the .py file that will hold the code for the website
- For the time being, the functions had to be transferred over to a newly .py file that contains the definitions of the functions we have been using, and we imported that file into the .py file of the website. 