<a href="https://colab.research.google.com/github/RohanSaxena1224/RecommendationSystems/blob/main/Recommendation_Systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset folders used below are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Movies - Learning Cosine Similarity

In [None]:
%cd drive/My Drive/Machine Learning/Recommendation Systems/ml-25m

In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import ipywidgets as widgets              # for inputs etc, need an output widget with it
from IPython.display import display

### Data

In [None]:
# Load the two tables from the current working directory (the ml-25m folder selected by %cd).
# Columns of `movies` used later in this notebook: movieId, title, genres.
movies = pd.read_csv("movies.csv")
# Columns of `ratings` used later in this notebook: userId, movieId, rating.
ratings = pd.read_csv("ratings.csv")

### Search Engine

In [None]:
# title normalisation used for both the indexed titles and the user's query
def clean_title(title):
  """Return `title` with every character except ASCII letters, digits and spaces removed."""
  disallowed = re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("", title)

# Add a cleaned copy of every title; this is the column the TF-IDF search index is fitted on.
movies["clean_title"] = movies["title"].apply(clean_title)

In [None]:
# Build the title search index.


# Index single words and adjacent word pairs (1- and 2-grams) so multi-word queries match more precisely.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# TF-IDF matrix fitted on the cleaned titles; search() compares each query against it.
tfidf = vectorizer.fit_transform(movies["clean_title"])

# Most recent search hits: search() overwrites this and on_button_clicked() reads it.
search_results = None

In [None]:
# build one clickable button per genre of the matched movie
def create_buttons(genre_array):
  """Return an HBox with one "success"-styled button per entry of `genre_array`.

  Each button's description is the genre name, and its click handler is
  `on_button_clicked`, which reads the genre back from that description.
  """
  def make_button(genre):
    genre_button = widgets.Button(description=genre, disabled=False, button_style="success")
    genre_button.on_click(on_button_clicked)   # handler filters the recommendations by this genre
    return genre_button

  return widgets.HBox([make_button(genre) for genre in genre_array])   # laid out horizontally

In [None]:
# when a genre button is clicked, show only the recommendations that belong to that genre
def on_button_clicked(b):
  """Click handler for the genre buttons.

  Takes the top row of the module-level `search_results` (written by `search`) as the
  reference movie, asks `movie_recommendations` for its recommendations, keeps the rows
  whose `genres` string contains the clicked button's description, and renders them into
  the `movie_list` output widget (replacing whatever was shown before).

  Parameters:
    b: the clicked button; only `b.description` (the genre label) is read.
  """
  genre = b.description

  movie_id = search_results.iloc[0]["movieId"]
  recommendations = movie_recommendations(movie_id)
  # Genre labels are plain text, not patterns: match literally so a label containing regex
  # metacharacters (parentheses, "+", ...) is neither misread nor triggers a pandas warning.
  in_genre = recommendations["genres"].str.contains(genre, regex=False)
  filtered_recommendations = recommendations[in_genre]

  with movie_list:
    movie_list.clear_output()
    display(filtered_recommendations[["score", "title", "genres"]])

In [None]:
def search(title):
  """Find the 5 movies whose cleaned titles are most similar to `title`.

  The query is cleaned with `clean_title`, vectorised with the fitted `vectorizer`, and
  compared against every row of `tfidf` by cosine similarity.

  Side effect: stores the hits in the module-level `search_results`, which
  `on_button_clicked` reads.

  Returns:
    (search_results, buttons): the 5 matching rows of `movies`, best match first, and an
    HBox of genre buttons built from the genres of the best match.
  """
  global search_results
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()      # one similarity score per movie

  # argpartition only guarantees that these are the 5 highest-scoring positions; it does not
  # order them. Sort them explicitly so that row 0 really is the best match, because row 0
  # decides the genre buttons here and the reference movie in on_button_clicked.
  candidates = np.argpartition(similarity, -5)[-5:]
  indices = candidates[np.argsort(similarity[candidates])[::-1]]
  search_results = movies.iloc[indices]

  genre_options = search_results.iloc[0]["genres"].split("|")
  buttons = create_buttons(genre_options)

  return search_results, buttons

### Recommendation System

In [None]:
def movie_recommendations(movie_id, min_rating=4):
  """Recommend movies that fans of `movie_id` like unusually often.

  Reads the module-level `ratings` (userId, movieId, rating) and `movies` frames.
  A rating counts as a "like" when it is >= `min_rating`; the same threshold is used for
  the fan group and for the baseline so the two fractions in the score are comparable.

  Steps:
    1. fans = users who liked `movie_id`.
    2. "similar" = for each movie, the fraction of fans who liked it (kept only if > 10%).
    3. "all" = for each of those movies, the fraction of all users (who liked at least one
       of the candidate movies) who liked it.
    4. score = similar / all; higher means more specific to the fans.

  Parameters:
    movie_id: movieId of the reference movie.
    min_rating: minimum rating treated as a like (default 4).

  Returns:
    DataFrame with columns score, title, genres for the 10 best-scoring movies.
  """
  liked = ratings[ratings["rating"] >= min_rating]            # filter once, reuse for both populations

  similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
  similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
  rec_percentages.columns = ["similar", "all"]

  rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
  rec_percentages = rec_percentages.sort_values("score", ascending=False)

  return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

### Combination

In [None]:
# Output area the callbacks render into, and the text box the user types a title in.
movie_list = widgets.Output()
movie_input_name = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

def on_type(data):
  """Observer for the text box: refresh the search results on every value change.

  `data` is the change notification; `data["new"]` is the new text. The output area is
  always cleared; a search is only run when the text is longer than 5 characters, in which
  case the genre buttons and the matching movies are displayed.
  """
  with movie_list:
    movie_list.clear_output()
    title = data["new"]
    if len(title) <= 5:
      return                      # too short to search: leave the output empty
    results, buttons = search(title)
    display(buttons)
    display(results)

movie_input_name.observe(on_type, names="value")
display(movie_input_name, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

# Books - Singular Value Decomposition


In [None]:
%cd drive/My Drive/Machine Learning/Recommendation Systems/books_dataset

/content/drive/My Drive/Machine Learning/Recommendation Systems/books_dataset


In [None]:
# Load the book dataset tables from the current working directory.
users = pd.read_csv('Users.csv')
# NOTE(review): the saved cell output echoes this line, which looks like a pandas warning
# raised while reading Books.csv — confirm, and set explicit dtypes if so.
books = pd.read_csv('Books.csv')
# NOTE(review): this rebinds `ratings`, which movie_recommendations() reads as a module-level
# name, so the movie widgets stop working once this cell has run — consider a distinct name.
ratings = pd.read_csv('Ratings.csv')

  books = pd.read_csv('Books.csv')


In [None]:
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Count ratings per user and per book (most active first), then keep only the heavy hitters.
pop_users = (
    ratings.groupby('User-ID')["Book-Rating"]
    .count()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 200]         # users with more than 200 reviews
)
pop_books = (
    ratings.groupby('ISBN')["Book-Rating"]
    .count()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 100]         # books with more than 100 reviews
)

In [None]:
# Final reviews frame (later pivoted into a user x book matrix): only ratings where both
# the user and the book passed the popularity filters.
by_popular_user = ratings["User-ID"].isin(pop_users.index)
of_popular_book = ratings["ISBN"].isin(pop_books.index)
reviews = ratings[by_popular_user & of_popular_book].reset_index(drop=True)
reviews.head()

In [None]:
# One row per user, one column per ISBN; user/book pairs without a rating are filled with 0.
review_matrix = (
    reviews
    .pivot_table(index="User-ID", columns="ISBN", values="Book-Rating")
    .fillna(0)
)
review_matrix

In [None]:
# Singular value decomposition of the user x book rating matrix.
from numpy.linalg import svd
matrix = review_matrix.values
# full_matrices=False requests the reduced factorization; vt has one column per ISBN column
# of review_matrix.
u, sigma, vt = svd(matrix, full_matrices=False)

In [None]:
def cosine_similarity(vector1, vector2):
  """Cosine of the angle between two 1-D vectors.

  Returns dot(vector1, vector2) / (|vector1| * |vector2|), or 0.0 when either vector has
  zero length (instead of dividing by zero and returning nan).

  This definition shadows scikit-learn's pairwise `cosine_similarity` imported at the top
  of the notebook, which `search` calls with 2-D TF-IDF matrices. To keep the movie search
  working after this cell has run, 2-D inputs are handed to scikit-learn's implementation
  and its pairwise similarity matrix is returned unchanged.
  """
  if np.ndim(vector1) == 2 or np.ndim(vector2) == 2:
    from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine_similarity
    return pairwise_cosine_similarity(vector1, vector2)

  norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
  if norm_product == 0:
    return 0.0                    # direction of a zero vector is undefined; treat as "not similar"
  return np.dot(vector1, vector2) / norm_product

In [None]:
# Find the book most similar to the first book (column 0 of vt).
# Books are the COLUMNS of vt (one per ISBN column of review_matrix), so compare column
# vectors; indexing vt[col] would select rows and can run past the end of vt, because the
# loop bound is the number of columns.
highest_sim = -np.inf
highest_sim_col = -1
for col in range(1, vt.shape[1]):
  similarity = cosine_similarity(vt[:, 0], vt[:, col])
  if similarity > highest_sim:
    highest_sim = similarity
    highest_sim_col = col