<a href="https://colab.research.google.com/github/VardanDavtyan/ML-DL/blob/main/movie_recommending_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-surprise



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

##Collaborative filtering

In [3]:
from surprise import Reader, Dataset, SVD


reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9000  0.8932  0.8912  0.8961  0.8982  0.8957  0.0032  
MAE (testset)     0.6912  0.6892  0.6875  0.6897  0.6916  0.6898  0.0015  
Fit time          1.57    1.64    1.59    2.10    1.59    1.70    0.20    
Test time         0.13    0.46    0.21    0.12    0.13    0.21    0.13    


{'test_rmse': array([0.89997876, 0.89324215, 0.89116635, 0.89612083, 0.89820742]),
 'test_mae': array([0.69116171, 0.68921715, 0.6874752 , 0.6897442 , 0.6915729 ]),
 'fit_time': (1.5667238235473633,
  1.6366300582885742,
  1.5861949920654297,
  2.10353946685791,
  1.5941798686981201),
 'test_time': (0.13211894035339355,
  0.46011996269226074,
  0.21269607543945312,
  0.12234663963317871,
  0.13269591331481934)}

In [6]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7efc0812a770>

In [12]:
ratings[ratings['userId'] == 333]

Unnamed: 0,userId,movieId,rating,timestamp
46068,333,1,4.0,1441197471
46069,333,318,5.0,1441197184
46070,333,356,4.5,1441197368
46071,333,527,5.0,1441197187
46072,333,588,3.5,1441198986
...,...,...,...,...
46144,333,105844,3.5,1441198673
46145,333,109487,4.5,1441197391
46146,333,116797,5.0,1441197436
46147,333,117176,4.0,1441197950


In [9]:
ratings[ratings['movieId'] == 3977]

Unnamed: 0,userId,movieId,rating,timestamp
1728,15,3977,3.0,997937848
2890,17,3977,1.0,1127468701
3935,22,3977,2.5,1131753363
6648,37,3977,3.0,981307838
9898,69,3977,3.5,1366831839
...,...,...,...,...
95483,626,3977,4.0,974770048
95604,627,3977,1.5,1201382056
97894,654,3977,3.5,1145392227
98285,656,3977,5.0,986243275


In [14]:
ratings[ratings['movieId'] == 3977]

Unnamed: 0,userId,movieId,rating,timestamp
1728,15,3977,3.0,997937848
2890,17,3977,1.0,1127468701
3935,22,3977,2.5,1131753363
6648,37,3977,3.0,981307838
9898,69,3977,3.5,1366831839
...,...,...,...,...
95483,626,3977,4.0,974770048
95604,627,3977,1.5,1201382056
97894,654,3977,3.5,1145392227
98285,656,3977,5.0,986243275


In [11]:
svd.predict(333, 3977, 5)

Prediction(uid=333, iid=3977, r_ui=5, est=3.4087971741417795, details={'was_impossible': False})

##other way

In [70]:
movie_metadata = pd.read_csv("movies_metadata.csv")
movie_metadata = movie_metadata[['title', 'genres']]
movie_metadata.head()

  movie_metadata = pd.read_csv("movies_metadata.csv")


Unnamed: 0,title,genres
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [77]:
ratings_mat = np.ndarray(
    shape=(np.max(ratings.movieId.values), np.max(ratings.userId.values)),
    dtype=np.uint8)
ratings_mat[ratings.movieId.values-1, ratings.userId.values-1] = ratings.rating.values

In [78]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

In [79]:
ratings_mat

array([[0, 0, 0, ..., 0, 4, 5],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [None]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
#need high RAM
U, S, V = np.linalg.svd(A)

In [None]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

In [None]:
k = 50
movie_id = 1 # Grab an id from movies.dat
top_n = 10

sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(moviemeta_data, movie_id, indexes)

##Content Based Filtering


In [15]:
df1=pd.read_csv('tmdb_5000_credits.csv')
df2=pd.read_csv('tmdb_5000_movies.csv')

In [16]:
df1.columns = ['id','tittle','cast','crew']
df2= df2.merge(df1, on='id')

In [17]:
C= df2['vote_average'].mean()
C

6.092171559442016

In [18]:
m= df2['vote_count'].quantile(0.9)
m

1838.4000000000015

In [19]:
q_movies = df2.copy().loc[df2['vote_count'] >= m]
q_movies.shape

(481, 23)

In [20]:
df2['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [21]:
df2.loc[999]

budget                                                           60000000
genres                  [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...
homepage                                                              NaN
id                                                                  10416
keywords                [{"id": 3767, "name": "noises"}, {"id": 4862, ...
original_language                                                      en
original_title                                  What Planet Are You From?
overview                A highly-evolved planet, whose denizens feel n...
popularity                                                       2.275331
production_companies             [{"name": "Columbia Pictures", "id": 5}]
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2000-03-03
revenue                                                                 0
runtime                               

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
df2['overview'] = df2['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df2['overview'])
tfidf_matrix.shape

(4803, 20978)

In [23]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()

In [25]:
def get_recommendations(title, cosine_sim=cosine_sim):

    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df2['title'].iloc[movie_indices]

In [26]:
get_recommendations('What Planet Are You From?')

1603                        Stranger Than Fiction
2144              A Very Harold & Kumar Christmas
2823    Harold & Kumar Escape from Guantanamo Bay
3158                                        Alien
581                       Star Trek: Insurrection
1425                                    Abduction
1383                                        Radio
2819                                 Act of Valor
3155                                  Melancholia
2512                         Three Men and a Baby
Name: title, dtype: object

In [27]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)

In [28]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [29]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes return only first three. If no, return entire list
        if len(names) > 3:
            names = names[:3]
        return names
    return []

In [30]:
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

In [31]:
df2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"


In [32]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [33]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [34]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df2['soup'] = df2.apply(create_soup, axis=1)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [37]:
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['title'])

In [38]:
get_recommendations('What Planet Are You From?', cosine_sim2)

4247        Me You and Five Bucks
2364                  Being Julia
2459                   The Artist
2472                    The Women
681        The American President
3989         Lage Raho Munna Bhai
323            Sex and the City 2
1258    Life or Something Like It
1692                      Mumford
2011                        Cheri
Name: title, dtype: object