<a href="https://colab.research.google.com/github/Theophilus12/Intelligent-Movie-Recommendation-System/blob/main/group_5_movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-25m.zip

In [None]:
import zipfile

# Unpack the downloaded archive into the working directory; later cells read
# the CSV files from ./ml-25m/. The context manager closes the archive handle
# even if extraction fails part-way.
with zipfile.ZipFile('./ml-25m.zip') as z:
    z.extractall()

In [None]:
!head ./ml-25m/ratings.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

In [None]:
# Load the ratings, movies and tags tables from the extracted archive.
ratings, movies, tags = (
    pd.read_csv(csv_path)
    for csv_path in ('./ml-25m/ratings.csv', './ml-25m/movies.csv', './ml-25m/tags.csv')
)

In [None]:
# First five rows of the tags table.
tags.head(n=5)

In [None]:
# First five rows of the ratings table.
ratings.head(n=5)

In [None]:
# Last five rows of the movies table.
movies.tail(n=5)

In [None]:
# How many movies share each distinct value of the 'genres' column.
movies.loc[:, 'genres'].value_counts()

# EDA

In [None]:
# Colour palette handed to the seaborn count plot below.
colors = [
    'steelblue',
    'seagreen',
    'black',
    'darkorange',
    'purple',
    'firebrick',
    'slategrey',
]

In [None]:
# Number of ratings recorded for each rating value.
ratings.loc[:, 'rating'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(6, 8))

sns.countplot(x='rating', data=ratings, palette=colors, ax=ax)

# Annotate every bar with its share of all ratings. The count is read from the
# bar itself, so each label always belongs to the bar it is drawn on.
total_ratings = ratings['rating'].shape[0]
for bar in ax.patches:
    count = bar.get_height()
    label = '{}%'.format(round((count / total_ratings) * 100, 2))
    ax.annotate(label,
                (bar.get_x() + bar.get_width() / 2, count),
                ha='center', va='bottom', fontsize=8)

ax.set_title('Distribution of ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count');


In [None]:
# Map each rating value to the number of times it was given.
dict_rating = ratings.loc[:, 'rating'].value_counts().to_dict()

In [None]:
# Share (in percent, two decimals) of every rating value, largest count first.
pcts = [
    round((count / ratings['rating'].shape[0]) * 100, 2)
    for count in sorted(ratings['rating'].value_counts(), reverse=True)
]


In [None]:
# Two-column table: rating value and its percentage of all ratings.
rating_pct = pd.DataFrame({
    'rating': list(dict_rating.keys()),
    'rating_percentage': pcts,
})

In [None]:
# Display the rating / percentage table built in the previous cell.
rating_pct

In [None]:
# Derive an integer 'year' column from titles of the form "Name (YYYY)".
# The pattern only accepts a four-digit number inside the final pair of
# parentheses and tolerates whitespace after the closing parenthesis; titles
# without such a suffix get year 0.
year_text = movies['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)
movies['year'] = pd.to_numeric(year_text).fillna(0).astype('int')

In [None]:
# First five movies, now including the derived 'year' column.
movies.head(n=5)

In [None]:
# Keep only movies whose extracted year is later than 2013.
released_after_2013 = movies['year'].gt(2013)
relevant_movies = movies.loc[released_after_2013]
relevant_movies.head(n=5)

In [None]:
# Restrict ratings and tags to the movies selected above.
relevant_movie_ids = relevant_movies['movieId'].tolist()
relevant_ratings = ratings.loc[ratings['movieId'].isin(relevant_movie_ids)]
relevant_tags = tags.loc[tags['movieId'].isin(relevant_movie_ids)]

In [None]:
# Give both filtered frames a fresh 0..n-1 index, discarding the old one.
relevant_ratings, relevant_tags = (
    frame.reset_index(drop=True)
    for frame in (relevant_ratings, relevant_tags)
)

In [None]:
# First three rows of the filtered tags.
relevant_tags.head(n=3)

In [None]:
# Remove the timestamp column. errors='ignore' plus reassignment makes this
# cell safe to run more than once: a second run is a no-op instead of raising
# KeyError because the column is already gone.
relevant_ratings = relevant_ratings.drop(columns='timestamp', errors='ignore')

In [None]:
# First five rows of the filtered ratings after dropping the timestamp.
relevant_ratings.head(n=5)

In [None]:
# Tags belonging to movies that received at least one rating of 4.0 or more.
is_highly_rated = relevant_ratings['rating'].ge(4.0)
tp_mvs = relevant_ratings.loc[is_highly_rated, 'movieId'].to_list()
top_movies = relevant_tags.loc[relevant_tags['movieId'].isin(tp_mvs)]
top_movies.head(n=5)

In [None]:
# Movie rows (title, genres, year) for the highly rated movie ids.
in_top_movies = relevant_movies['movieId'].isin(tp_mvs)
top_genres = relevant_movies.loc[in_top_movies]
top_genres.head(n=5)

In [None]:
# Bar chart of how many highly rated movies fall in each year.
year_ax = top_genres['year'].value_counts().plot.bar(figsize=(10, 5))
year_ax.set_title("The count plot of movies in each year")
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Movie Count')


In [None]:
# Index labels of the rows whose genres value is the "(no genres listed)" placeholder.
drop_tg_idx = top_genres.loc[top_genres['genres'].eq('(no genres listed)')].index

In [None]:
# Remove the rows without a genre. Reassigning (rather than dropping in place)
# avoids mutating a frame that was produced by filtering relevant_movies, which
# can emit SettingWithCopyWarning; errors='ignore' keeps a re-run of this cell
# from raising KeyError once the labels are already gone.
top_genres = top_genres.drop(index=drop_tg_idx, errors='ignore')

In [None]:
# Bar chart of the ten most frequent genre values among the highly rated movies.
tg = top_genres['genres'].value_counts()
genre_ax = tg.iloc[:10].plot.bar(figsize=(10, 5))
genre_ax.set_title("The count plot of top genre")
genre_ax.set_xlabel('Genre')
genre_ax.set_ylabel('Genre Count')


In [None]:
# Frequency of every tag attached to the highly rated movies.
tag_counts = top_movies.loc[:, 'tag'].value_counts()

In [None]:
# Bar chart of the ten most frequent tags.
tag_ax = tag_counts.iloc[:10].plot.bar(figsize=(10, 5))
tag_ax.set_title("The count plot of tags")
tag_ax.set_xlabel('Tag')
tag_ax.set_ylabel('Tag Count')


In [None]:
# Number of rating rows recorded for userId 3.
relevant_ratings.loc[relevant_ratings['userId'].eq(3), 'movieId'].shape

In [None]:
# First five rating rows of userId 3.
relevant_ratings.loc[relevant_ratings['userId'].eq(3)].head(n=5)

In [None]:
# Convert the raw ids to categoricals and store their integer codes in
# 'user_id' / 'movie_id'; those codes are used as matrix indices below.
for id_col, code_col in (('userId', 'user_id'), ('movieId', 'movie_id')):
    relevant_ratings[id_col] = relevant_ratings[id_col].astype('category')
    relevant_ratings[code_col] = relevant_ratings[id_col].cat.codes

In [None]:
# First five rows, now with the 'user_id' and 'movie_id' code columns.
relevant_ratings.head(n=5)

In [None]:
# Build two sparse rating matrices from the same triplets: an item-user
# matrix (rows = movie codes) used to fit the model, and a user-item
# matrix (rows = user codes) used when producing recommendations.
rating_values = relevant_ratings['rating'].astype(float)
movie_codes = relevant_ratings['movie_id']
user_codes = relevant_ratings['user_id']

sparse_item_user = sparse.csr_matrix((rating_values, (movie_codes, user_codes)))
sparse_user_item = sparse.csr_matrix((rating_values, (user_codes, movie_codes)))

In [None]:
# Create the (still unfitted) ALS model; fitting happens in a later cell.
als_params = dict(factors=128, regularization=0.1, iterations=20)
model = implicit.als.AlternatingLeastSquares(**als_params)

In [None]:
# Scale every stored rating by alpha to obtain the confidence matrix,
# stored as double-precision floats.
alpha_val = 15
data_conf = (alpha_val * sparse_item_user).astype(np.float64)

In [None]:
# Fit the model on the confidence-weighted matrix.
# NOTE(review): data_conf is derived from sparse_item_user, i.e. rows are
# movies and columns are users. implicit releases before 0.5 take an item-user
# matrix in fit(), while 0.5+ takes user-item - confirm the installed version,
# otherwise model.user_factors and model.item_factors come out swapped.
model.fit(data_conf)

In [None]:
# Show the movies row for movieId 108945.
movies.loc[movies['movieId'].eq(108945)]

In [None]:
# Number of rating rows recorded for movieId 108945.
relevant_ratings.loc[relevant_ratings['movieId'] == 108945, 'userId'].shape

In [None]:
# FIND SIMILAR MOVIES

# Find the 10 most similar movies to Robocop (movieId 108945, the title
# inspected in the cells above - NOTE(review): confirm this is the intended
# movie). The item factors are indexed by the 'movie_id' category code, so the
# code is looked up from the movieId instead of being hard-coded.
query_movie_id = 108945
item_id = relevant_ratings['movieId'].cat.categories.get_loc(query_movie_id)
n_similar = 10

In [None]:
# Pull the learned factor matrices out of the fitted model.
user_vecs, item_vecs = model.user_factors, model.item_factors

# Euclidean length of every item vector (one value per row).
item_norms = np.sqrt(np.sum(item_vecs * item_vecs, axis=1))

In [None]:
# Cosine similarity of every item to the query item: dot product divided by
# both vector norms. Keep the n_similar best and sort them best-first as
# (item index, similarity) tuples.
query_vec = item_vecs[item_id]
scores = item_vecs.dot(query_vec) / item_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
top_similarity = scores[top_idx] / item_norms[item_id]
similar = sorted(zip(top_idx, top_similarity), key=lambda pair: pair[1], reverse=True)

In [None]:
# Print the title, genres and similarity score of each similar movie.
# The categories of 'movieId' map a 'movie_id' code straight back to the
# original movieId, so no scan over the rating rows is needed per result, and
# the movies table is indexed once instead of being filtered twice per result.
movie_id_by_code = relevant_ratings['movieId'].cat.categories
movie_info = movies.drop_duplicates('movieId').set_index('movieId')
for idx, score in similar:
    mov = movie_id_by_code[idx]
    print(movie_info.at[mov, 'title'], movie_info.at[mov, 'genres'], score)
    

In [None]:
def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10):
    """Return the best-scoring movies for one user as a DataFrame.

    Args:
        user_id: Row index of the user in ``sparse_user_item`` and
            ``user_vecs`` (the ``user_id`` code, not the raw ``userId``).
        sparse_user_item: Sparse user-by-item rating matrix.
        user_vecs: User factor matrix; dense array or scipy sparse matrix.
        item_vecs: Item factor matrix; dense array or scipy sparse matrix.
        num_items: Number of rows to return.

    Returns:
        DataFrame with columns ``movies`` (title), ``score`` and ``genres``,
        ordered by descending score. Scores are min-max scaled to [0, 1];
        items the user already rated are scored 0 rather than removed.

    Note:
        Titles and genres are resolved through the notebook-level
        ``relevant_ratings`` and ``movies`` frames.
    """
    # Mask of the user's row: 1 for items without a rating, 0 for rated ones
    # (assumes stored ratings are > 0, so rating + 1 exceeds 1 only when rated).
    user_interactions = sparse_user_item[user_id, :].toarray().reshape(-1) + 1
    user_interactions[user_interactions > 1] = 0

    # Predicted preference per item = item factors . user factors. Both the
    # sparse-converted factors and the model's dense arrays are accepted.
    user_vec = user_vecs[user_id, :]
    if sparse.issparse(user_vec):
        user_vec = user_vec.toarray()
    user_vec = np.asarray(user_vec).reshape(-1)
    if not sparse.issparse(item_vecs):
        item_vecs = np.asarray(item_vecs)
    rec_vector = np.asarray(item_vecs.dot(user_vec)).reshape(-1)

    # Scale to [0, 1] and zero out everything the user has already rated.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    recommend_vector = user_interactions * rec_vector_scaled

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    # Resolve every selected code to its movieId in a single pass over the
    # ratings (first matching row wins) instead of one scan per item.
    code_rows = relevant_ratings.loc[
        relevant_ratings['movie_id'].isin(item_idx), ['movie_id', 'movieId']
    ]
    movie_id_by_code = (
        code_rows.drop_duplicates('movie_id').set_index('movie_id')['movieId']
    )
    movie_ids = movie_id_by_code.loc[item_idx].to_numpy()

    movie_info = movies.drop_duplicates('movieId').set_index('movieId')
    details = movie_info.loc[movie_ids, ['title', 'genres']]

    recommendations = pd.DataFrame({
        'movies': details['title'].to_list(),
        'score': recommend_vector[item_idx],
        'genres': details['genres'].to_list(),
    })

    return recommendations

In [None]:
# Wrap the trained user and item factors in CSR matrices before passing
# them to recommend().
user_vecs, item_vecs = (
    sparse.csr_matrix(factors)
    for factors in (model.user_factors, model.item_factors)
)

In [None]:
# User to recommend for. This value is used as a row index into the
# user-item matrix and is compared against the 'user_id' column below.
user_id = 44_000

In [None]:
# Score all items for the chosen user and keep the default top ten.
recommendations = recommend(
    user_id=user_id,
    sparse_user_item=sparse_user_item,
    user_vecs=user_vecs,
    item_vecs=item_vecs,
)

In [None]:
# Display the recommended titles with their scores and genres.
recommendations

In [None]:
# Get 10 of user's already watched movies 
user_movie_ids = relevant_ratings[relevant_ratings['user_id'] == user_id][:10]['movieId'].to_list()
relevant_movies[relevant_movies['movieId'].isin(user_movie_ids)]

In [None]:
# Number of distinct user codes in the filtered ratings.
relevant_ratings.loc[:, 'user_id'].nunique()

In [None]:
# extract year from title in movie dataset
# get years between 2014 - 2019
# extract ids with valid years from ratings dataset
# remove timestamp
# check user's watched movies
# reset user and movie ids

# preprocess data for recommender system