<a href="https://colab.research.google.com/github/PrateekCoder/Recommendation-Systems/blob/main/Content_Based_Movie_Recommendation_System_Using_Bags_Of_Words_Vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## YouTube 
### https://youtu.be/fd8lYpnGHy0

## Connect the Colab File with Google Drive

In [16]:
# Mount Google Drive into the Colab filesystem so the dataset stored on Drive
# can be read like a local file. The mount point is the absolute path
# '/content/gdrive'.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [17]:
#Import all the required packages
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Load the movies.csv file into a Pandas dataframe.
# The absolute path matches the '/content/gdrive' mount point used by
# drive.mount(), so the load does not depend on the notebook's current
# working directory happening to be '/content'.
movies = pd.read_csv('/content/gdrive/My Drive/datasets/movielens-10m/movies.csv')

In [19]:
# Preview the loaded movies dataframe (rendered as this cell's output)
movies

Unnamed: 0.1,Unnamed: 0,movieId,title,genres
0,0,2,Jumanji (1995),Adventure|Children|Fantasy
1,1,3,Grumpier Old Men (1995),Comedy|Romance
2,2,4,Waiting to Exhale (1995),Comedy|Drama|Romance
3,3,5,Father of the Bride Part II (1995),Comedy
4,4,6,Heat (1995),Action|Crime|Thriller
...,...,...,...,...
10675,10675,65088,Bedtime Stories (2008),Adventure|Children|Comedy
10676,10676,65091,Manhattan Melodrama (1934),Crime|Drama|Romance
10677,10677,65126,Choke (2008),Comedy|Drama
10678,10678,65130,Revolutionary Road (2008),Drama|Romance


~~~
Bag of Words (BoW) is a way of representing text data in numerical form, where 
each word in a text document is treated as a separate feature. In the context 
of movie recommendation, BoW is typically used to represent movie descriptions, 
summaries, or genre information.

When creating a BoW representation for movies in the MovieLens dataset, you 
would process the text data for each movie (e.g., movie genre, title, or 
summary) and create a vocabulary of all the unique words. You would then use 
this vocabulary to represent each movie as a numerical vector, with the length 
of the vector being equal to the size of the vocabulary.

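Each element in the vector would correspond to a word in the vocabulary, and 
the value of each element would indicate the frequency of that word in the 
movie description, summary, or genre information. When only presence matters 
(as with genre labels), the value is simply 1 if the word occurs and 0 otherwise.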

A simple example of a BoW representation for two movies might look like this:

Movie 1: "Action", "Adventure", "Thriller"
Movie 2: "Comedy", "Romance", "Drama"

Vocabulary: "Action", "Adventure", "Thriller", "Comedy", "Romance", "Drama"

Movie 1 vector: [1, 1, 1, 0, 0, 0]
Movie 2 vector: [0, 0, 0, 1, 1, 1]

In this example, the vocabulary includes all the unique words from the movie 
genres, and the vectors for each movie represent the presence or absence of 
each word in the vocabulary for that movie.
~~~

In [20]:
# Pull the two columns of interest out of the dataframe as plain Python lists:
#   titles -> one title string per movie
#   genres -> one list of genre labels per movie (the raw field is '|'-separated)
titles = movies.loc[:, 'title'].to_list()
genres = (
    movies.loc[:, 'genres']
    .str.split("|")
    .to_list()
)

In [21]:
# Build the bag-of-words representation for a single movie's genres
def create_bow(genre_list):
    """Return a presence-only bag of words for one movie.

    Args:
        genre_list: iterable of genre labels belonging to a single movie.

    Returns:
        dict mapping every label in ``genre_list`` to 1. A label that appears
        more than once still maps to 1, and an empty input gives an empty dict.
    """
    return dict.fromkeys(genre_list, 1)

In [22]:
# Apply create_bow to every movie's genre list, giving one bag of words
# (a dict of genre -> 1) per movie, in the same order as `genres`
bags_of_words = list(map(create_bow, genres))

In [24]:
# Assemble the per-movie bags of words into one table: a row per movie
# (labelled by title) and a column per genre. Genres missing from a movie's
# bag come out as NaN, which fillna turns into 0.
genre_df = (
    pd.DataFrame(data=bags_of_words, index=titles)
    .fillna(value=0)
)

In [25]:
# Show the movie-by-genre indicator table built above
genre_df

Unnamed: 0,Adventure,Children,Fantasy,Comedy,Romance,Drama,Action,Crime,Thriller,Horror,Animation,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Film-Noir,Western,(no genres listed)
Jumanji (1995),1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men (1995),0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale (1995),0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II (1995),0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Heat (1995),0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bedtime Stories (2008),1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Manhattan Melodrama (1934),0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Choke (2008),0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Revolutionary Road (2008),0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Calculate the cosine similarity matrix between the movies.
# The result is stored under its own name so that the imported
# `cosine_similarity` function is not overwritten; rebinding that name to the
# resulting array makes this cell fail with
# "TypeError: 'numpy.ndarray' object is not callable" when it is run again.
similarity_matrix = cosine_similarity(genre_df)

# Create a dataframe with the cosine similarity scores, labelled by movie
# title on both axes so that scores can be looked up by title
similarity_df = pd.DataFrame(similarity_matrix, index=genre_df.index, columns=genre_df.index)

In [28]:
# Show the movie-by-movie cosine similarity table
similarity_df

Unnamed: 0,Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),"American President, The (1995)",...,Impulse (2008),Zona Zamfirova (2002),Double Dynamite (1951),"Death Kiss, The (1933)",Ben X (2007),Bedtime Stories (2008),Manhattan Melodrama (1934),Choke (2008),Revolutionary Road (2008),Blackadder Back & Forth (1999)
Jumanji (1995),1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.816497,0.00000,0.333333,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,0.000000,0.000000
Grumpier Old Men (1995),0.000000,1.000000,0.816497,0.707107,0.000000,1.000000,0.000000,0.00000,0.000000,0.816497,...,0.000000,0.500000,0.500000,0.500000,0.000000,0.408248,0.408248,0.500000,0.500000,0.707107
Waiting to Exhale (1995),0.000000,0.816497,1.000000,0.577350,0.000000,0.816497,0.000000,0.00000,0.000000,1.000000,...,0.000000,0.816497,0.408248,0.408248,0.577350,0.333333,0.666667,0.816497,0.816497,0.577350
Father of the Bride Part II (1995),0.000000,0.707107,0.577350,1.000000,0.000000,0.707107,0.000000,0.00000,0.000000,0.577350,...,0.000000,0.707107,0.707107,0.707107,0.000000,0.577350,0.000000,0.707107,0.000000,1.000000
Heat (1995),0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.57735,0.666667,0.000000,...,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bedtime Stories (2008),0.666667,0.408248,0.333333,0.577350,0.000000,0.408248,0.816497,0.00000,0.333333,0.333333,...,0.000000,0.408248,0.408248,0.408248,0.000000,1.000000,0.000000,0.408248,0.000000,0.577350
Manhattan Melodrama (1934),0.000000,0.408248,0.666667,0.000000,0.333333,0.408248,0.000000,0.00000,0.000000,0.666667,...,0.000000,0.408248,0.000000,0.000000,0.577350,0.000000,1.000000,0.408248,0.816497,0.000000
Choke (2008),0.000000,0.500000,0.816497,0.707107,0.000000,0.500000,0.000000,0.00000,0.000000,0.816497,...,0.000000,1.000000,0.500000,0.500000,0.707107,0.408248,0.408248,1.000000,0.500000,0.707107
Revolutionary Road (2008),0.000000,0.500000,0.816497,0.000000,0.000000,0.500000,0.000000,0.00000,0.000000,0.816497,...,0.000000,0.500000,0.000000,0.000000,0.707107,0.000000,0.816497,0.500000,1.000000,0.000000


In [29]:
# Ask the user for a movie they like.
# The text is used as-is as a lookup key in the similarity dataframe's index,
# so it has to match a title exactly, e.g. "Prestige, The (2006)".
movie = input('Enter a movie you like: ')

Enter a movie you like: Prestige, The (2006)


In [30]:
# Find the index of the movie in the similarity dataframe
# (raises KeyError when the entered title is not an index label)
movie_index = similarity_df.index.get_loc(movie)

# Get the top 10 most similar movies to the movie.
# The movie itself is removed by label instead of by skipping the first sorted
# row: many movies share an identical genre vector and therefore tie at 1.0,
# so the chosen movie is not guaranteed to sort into first place, and slicing
# by position could discard a genuine match while keeping the movie itself.
top_10 = (
    similarity_df.iloc[movie_index]
    .drop(labels=movie)
    .sort_values(ascending=False)
    .head(10)
)

# Print the top 10 most similar movies to the movie
print(f'Top 10 similar movies to {movie}:')
print(top_10)

Top 10 similar movies to Prestige, The (2006):
Fabled (2002)                              1.0
Prize, The (1963)                          1.0
No Way Out (1987)                          1.0
Unknown (2006)                             1.0
I'm Not Scared (Io non ho paura) (2003)    1.0
Client, The (1994)                         1.0
Number 23, The (2007)                      1.0
Pelican Brief, The (1993)                  1.0
Trauma (2004)                              1.0
Fallen Idol, The (1948)                    1.0
Name: Prestige, The (2006), dtype: float64
