<a href="https://colab.research.google.com/github/Rachana-Baditha/movie-recommender-system/blob/main/%5BMP%5D_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# -> All Functions <-

In [None]:
def getCombinedFeatures(movie_df, features):
  """Concatenate the selected columns into one text document per row.

  Args:
    movie_df: DataFrame holding one movie per row.
    features: Iterable of column names to join, in order.

  Returns:
    A Series aligned with ``movie_df.index`` whose values are the selected
    columns joined left to right, each followed by a single space. With an
    empty ``features`` the result is a Series of empty strings.

  Raises:
    KeyError: If a name in ``features`` is not a column of ``movie_df``.
  """
  # Start from a Series (not a bare str) so the return type is the same
  # regardless of how many features are requested.
  combined_features = pd.Series("", index=movie_df.index)

  for feature in features:
    # Missing values become "" and non-text columns are stringified, so a
    # numeric column or a stray NaN cannot break or blank out the row's text.
    column_text = movie_df[feature].fillna("").astype(str)
    combined_features = combined_features + column_text + " "

  return combined_features

In [None]:
def getSimilarity(combined_features):
  """Return the pairwise cosine similarity of the given text documents.

  Args:
    combined_features: Iterable of text documents, one entry per item.

  Returns:
    The result of ``cosine_similarity`` applied to the TF-IDF matrix of the
    documents (every document compared against every other one).
  """
  # TF-IDF representation, fitted on the very documents being compared.
  tfidf_matrix = TfidfVectorizer().fit_transform(combined_features)

  # With a single argument the rows are compared with one another.
  return cosine_similarity(tfidf_matrix)

In [None]:
def getRecommendations(movie_id, similarity, recList):
  """Fold one movie's similarity row into the running score table.

  Args:
    movie_id: Id of the reference movie; ``movie_id - 1`` is used as its
      row position in ``similarity``.
    similarity: Indexable of similarity rows.
    recList: Dict mapping row position -> best score so far. It is updated
      in place.

  Returns:
    The same ``recList`` object, after the update.
  """
  own_index = movie_id - 1

  for index, score in enumerate(similarity[own_index]):
    # The reference movie never scores against itself.
    if index == own_index:
      continue
    # Keep whichever score is higher: the new one or the one already stored.
    if index in recList:
      score = max(score, recList[index])
    recList[index] = score

  return recList

In [None]:
# -> Initialise all DataFrames <-

In [None]:
# Converts a Unix epoch value into a 'YYYY-MM-DD HH:MM:SS' string (UTC).
# Kept defined so any cell that still passes it as `date_parser` keeps working.
dateparse = lambda x: datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S')

# Base ratings split. The timestamp column used to be parsed through the
# deprecated `date_parser` keyword and then dropped straight away; it is now
# skipped at read time with `usecols`, which yields the same final columns.
base1_df = pd.read_csv('/content/drive/MyDrive/ML100K/u1.base', sep='\t',
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        usecols=['user_id', 'movie_id', 'rating'])

# A rating of 4 or more flags the movie as enjoyed by that user.
base1_df['enjoyed'] = base1_df['rating'] >= 4

In [None]:
# Test ratings split. The timestamp column used to be parsed through the
# deprecated `date_parser` keyword and then dropped straight away; it is now
# skipped at read time with `usecols`, which yields the same final columns.
test1_df = pd.read_csv('/content/drive/MyDrive/ML100K/u1.test', sep='\t',
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        usecols=['user_id', 'movie_id', 'rating'])

# A rating of 4 or more flags the movie as enjoyed by that user.
test1_df['enjoyed'] = test1_df['rating'] >= 4

In [None]:
# Movie metadata table. Because `names` is supplied, pandas treats every line
# of the file as data. NOTE(review): confirm movies_clean.csv has no header row.
clean_movie_df = pd.read_csv(
    '/content/drive/MyDrive/ML100K/movies_clean.csv',
    sep=',',
    names=[
        "id", "title", "year", "genres", "actors", "director",
        "language", "rating", "runtime", "plot-synopsis", "poster",
    ],
)

# Remove the row cap so displayed DataFrames are shown in full.
pd.set_option('display.max_rows', None)

In [None]:
# Pipe-separated movie table: id, title, dates, URL, then one column per genre.
movie_df = pd.read_csv(
    '/content/drive/MyDrive/ML100K/u.item',
    sep='|',
    encoding='latin-1',
    names=[
        'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
        'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy', 'crime',
        'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery',
        'romance', 'sci_fi', 'thriller', 'war', 'western',
    ],
)

In [None]:
# Replace every missing value with an empty string.
clean_movie_df = clean_movie_df.fillna("")

In [None]:
# clean_movie_df

In [None]:
# -> Get Similarity Matrix <-

In [None]:
# One text document per movie, built from the columns listed below.
# Ratings and runtime are left out of the feature text.
combined_features = getCombinedFeatures(
    clean_movie_df,
    ["actors", "genres", "director", "language"],
)

In [None]:
# Pairwise similarity between all entries of combined_features.
similarity = getSimilarity(combined_features)

In [None]:
# -> Aggregate Similarity Scores <-

In [None]:
# Id of the single user whose recommendations are built in the cells below.
userId = 1

In [None]:
# Number of distinct user ids present in the base ratings.
no_of_users = len(base1_df["user_id"].unique())

# Starts empty; a later cell assigns the selected user's enjoyed movie ids.
enjoyed_movies = []

In [None]:
# for user in range(1,no_of_users + 1):
#   m_enjoy = base1_df.loc[(base1_df["user_id"] == user) & (base1_df["enjoyed"] == True)]["movie_id"].tolist()

#   enjoyed_movies.append(m_enjoy)

# enjoyed_movies

In [None]:
# Movie ids that the selected user rated as enjoyed in the base split.
enjoyed_movies = base1_df.loc[
    (base1_df["user_id"] == userId) & (base1_df["enjoyed"] == True),
    "movie_id",
].tolist()


In [None]:
# Merge the similarity rows of every enjoyed movie into one score table
# (row position -> highest similarity seen for that position).
recList = {}
for movie in enjoyed_movies:
  recList = getRecommendations(movie, similarity, recList)

# Rank the candidates from highest to lowest score.
sorted_similarity_score = list(recList.items())
sorted_similarity_score.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Keep the 100 highest-scoring (row position, score) pairs.
top_rec = sorted_similarity_score[:100]

In [None]:
# -> Display Top Recommendations <-

In [None]:
# print(f"Recommendations for User 1\n-----------------------------------------------------------")

# for i in range(0,100):
#   id = movie_df.iloc[sorted_similarity_score[i][0]]["movie_id"]
#   title = movie_df.iloc[sorted_similarity_score[i][0]]["movie_title"]
#   score = sorted_similarity_score[i][1]

#   print(f"{i} \t|\t {id} \t|\t{title} \t|\t {score}")

In [None]:
# -> Verifying Accuracy <-
# test1_df

In [None]:
# enjoyed_test = test1_df.loc[(test1_df["user_id"] == userId) & (test1_df["enjoyed"] == True)]["movie_id"].tolist()

In [None]:
# enjoyed_test

In [None]:
# count = 0

# for i in range(len(top_rec)):
#   if movie_df.iloc[top_rec[i][0]]["movie_id"] in enjoyed_test:
#     count+=1
#     id = movie_df.iloc[top_rec[i][0]]["movie_id"]
#     title = movie_df.iloc[top_rec[i][0]]["movie_title"]
#     score = top_rec[i][1]

#     print(f"{i} \t|\t {id} \t|\t{title} \t|\t {score}")

# print(f"Total -> {count}")

In [None]:
# Precision / recall of the top-N recommendations, checked against the test split.
user_count = len(base1_df["user_id"].unique())
# Evaluate up to 400 users, but never more than the base split contains.
no_of_users = min(400, user_count)

no_rec = 15  # length of each user's recommendation list

precision = 0
recall = 0

# Running sums of the unrounded per-user metrics; turned into averages below.
avg_p = 0
avg_r = 0

# recList keys are 0-based row positions; resolve them to movie ids through a
# plain list instead of one movie_df.iloc lookup per recommendation.
movie_ids = movie_df["movie_id"].tolist()

# Inclusive upper bound: exactly `no_of_users` users are evaluated, matching
# the divisor used for the averages (previously 399 users were summed and
# then divided by 400).
for userID in range(1, no_of_users + 1):
  enjoyed_movies = base1_df.loc[(base1_df["user_id"] == userID) & (base1_df["enjoyed"] == True)]["movie_id"].tolist()

  recList = {}

  for movie in enjoyed_movies:
    recList = getRecommendations(movie, similarity, recList)

  # NOTE(review): recList may contain movies this user already rated in the
  # base split. If base and test are disjoint per user (confirm), such entries
  # can never be hits and only take up slots; consider filtering them out.
  sorted_similarity_score = sorted(recList.items(), key=lambda x: x[1], reverse=True)

  top_rec = sorted_similarity_score[:no_rec]

  enjoyed_test = test1_df.loc[(test1_df["user_id"] == userID) & (test1_df["enjoyed"] == True)]["movie_id"].tolist()
  relevant = set(enjoyed_test)

  # Hits: recommended movies the user enjoyed in the test split.
  count = sum(1 for position, _ in top_rec if movie_ids[position] in relevant)

  # Precision = hits / recommendations made. Recall = hits / relevant test
  # items (not the training likes). Both fall back to 0 when the divisor
  # is empty, so the cell no longer raises ZeroDivisionError.
  user_p = (count / len(top_rec)) * 100 if top_rec else 0.0
  user_r = (count / len(relevant)) * 100 if relevant else 0.0

  # Accumulate unrounded values so rounding error does not pile up.
  avg_p += user_p
  avg_r += user_r

  precision = round(user_p, 2)
  recall = round(user_r, 2)

  print(f"User {userID} -> \tC = {count}\tP = {precision}%\tR = {recall}%")


avg_p = round(avg_p / no_of_users, 2) if no_of_users else 0.0
avg_r = round(avg_r / no_of_users, 2) if no_of_users else 0.0

print(f"Average P = {avg_p}\tAverage R = {avg_r}")







User 1 -> 	C = 1	P = 6.67%	R = 1.19%
User 2 -> 	C = 0	P = 0.0%	R = 0.0%
User 3 -> 	C = 1	P = 6.67%	R = 10.0%
User 4 -> 	C = 0	P = 0.0%	R = 0.0%
User 5 -> 	C = 2	P = 13.33%	R = 6.06%
User 6 -> 	C = 2	P = 13.33%	R = 2.86%
User 7 -> 	C = 4	P = 26.67%	R = 2.88%
User 8 -> 	C = 3	P = 20.0%	R = 16.67%
User 9 -> 	C = 0	P = 0.0%	R = 0.0%
User 10 -> 	C = 3	P = 20.0%	R = 3.49%
User 11 -> 	C = 1	P = 6.67%	R = 2.04%
User 12 -> 	C = 1	P = 6.67%	R = 4.55%
User 13 -> 	C = 4	P = 26.67%	R = 2.47%
User 14 -> 	C = 1	P = 6.67%	R = 2.94%
User 15 -> 	C = 0	P = 0.0%	R = 0.0%
User 16 -> 	C = 2	P = 13.33%	R = 3.51%
User 17 -> 	C = 0	P = 0.0%	R = 0.0%
User 18 -> 	C = 4	P = 26.67%	R = 3.92%
User 19 -> 	C = 0	P = 0.0%	R = 0.0%
User 20 -> 	C = 1	P = 6.67%	R = 7.14%
User 21 -> 	C = 1	P = 6.67%	R = 3.7%
User 22 -> 	C = 4	P = 26.67%	R = 10.53%
User 23 -> 	C = 2	P = 13.33%	R = 3.7%
User 24 -> 	C = 0	P = 0.0%	R = 0.0%
User 25 -> 	C = 2	P = 13.33%	R = 5.41%
User 26 -> 	C = 0	P = 0.0%	R = 0.0%
User 27 -> 	C = 1	P = 6.67%	