# **Importing the necessary libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# **Getting the relevant data**

### Load the Dataset

In [None]:
# Load the movie catalogue; the path is relative to the notebook's working directory.
movies = pd.read_csv("movies.csv")

In [None]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
movies.shape

(9742, 3)

### Convert genres into a usable format

Each movie's genres are separated by '|'. Replace that separator with spaces, convert the text to lowercase, and strip any surrounding whitespace.

In [None]:
# Turn the pipe-delimited genre string (e.g. "Comedy|Drama|Romance") into a
# lowercase, space-separated string ("comedy drama romance") that a
# bag-of-words vectorizer can tokenize.
# Vectorized .str methods replace the split + per-row lambda: they do the same
# work without a Python-level call per row, and a missing value stays NaN
# instead of raising TypeError inside ' '.join, so a later null check can
# still surface it.
movies['genres'] = (
    movies['genres']
    .str.replace('|', ' ', regex=False)  # literal '|', not a regex alternation
    .str.lower()
    .str.strip()
)

In [None]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy
2,3,Grumpier Old Men (1995),comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy


In [None]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

# **Calculations**

### Vectorize the text data using Bag of Words

In [None]:
# Bag-of-words encoding of the genre strings: one row per movie, one column per token.
# NOTE(review): CountVectorizer's default token pattern splits on punctuation, so a
# hyphenated genre label (e.g. "sci-fi") would become two separate tokens — confirm
# this is intended.
# NOTE(review): max_features=5000 and stop_words='english' look like free-text
# defaults; a genre vocabulary is presumably tiny — verify they are needed here.
cv = CountVectorizer(max_features=5000, stop_words='english')
# .toarray() converts the sparse count matrix returned by fit_transform to a dense ndarray.
vector = cv.fit_transform(movies['genres']).toarray()

In [None]:
vector

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Calculate similarity matrix based on Bag of Words vectors

In [None]:
# Pairwise cosine similarity between every pair of rows (movies) in `vector`.
# NOTE(review): the result is a dense n_movies x n_movies matrix; with the 9,742
# rows reported by movies.shape that is roughly 0.76 GB if stored as float64 —
# confirm this fits the memory budget.
similarity = cosine_similarity(vector)

In [None]:
similarity

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

### Define the recommendation function based on cosine similarity:

In [None]:
def recommend_tfidf(movie_title, cosine_sim=similarity, top_n=5):
    """Print the titles of the movies most similar to ``movie_title``.

    Despite the name, the default ``similarity`` matrix is computed from
    CountVectorizer (bag-of-words) genre counts, not TF-IDF weights; the name
    is kept so existing calls keep working.

    Parameters
    ----------
    movie_title : str
        Exact value from ``movies['title']``, e.g. ``"Jaws (1975)"``. If
        several rows share the title, the first one is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column order matches the row order
        of ``movies``.
    top_n : int, optional
        Number of recommendations to print (default 5).

    Returns
    -------
    None
        Titles are printed one per line. If ``movie_title`` is not in the
        dataset, a "not found" message is printed instead of raising
        ``IndexError``.
    """
    # Work with row *positions*: cosine_sim is position-indexed, and index
    # labels only coincide with positions for a default RangeIndex.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        print(f"No movie titled {movie_title!r} found in the dataset.")
        return
    query_pos = int(matches[0])

    # Exclude the query movie explicitly. Slicing [1:6] after sorting only
    # works if the query sorts first, which is not guaranteed when other
    # movies tie with it at similarity 1.0 (identical genre sets) — the query
    # could then show up in its own recommendations.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos != query_pos
    ]
    # list.sort is stable, so equally similar movies keep their dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    for pos, _ in candidates[:top_n]:
        print(movies.iloc[pos]['title'])

# **Giving Output**

### Get user input for their favourite movie and recommend 5 other movies

In [None]:
# The lookup is an exact match on the 'title' column, so the input must be
# typed exactly as it appears in the data, including the year suffix,
# e.g. "Jaws (1975)".
title = input("Enter the title of your favorite movie: ")

# Print the recommended movies (the function prints; it returns nothing)
print("Top 5 similar movies:")
recommend_tfidf(title)

Enter the title of your favorite movie: Jaws (1975)
Top 5 similar movies:
Jaws 3-D (1983)
Crow: Salvation, The (2000)
Dog Soldiers (2002)
House of the Dead, The (2003)
Outpost (2008)
Alone in the Dark II (2008)
Underworld: Blood Wars (2016)
Mayhem (2017)
Tales from the Hood (1995)
Blade (1998)


# **DONE**

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the catalogue and keep only the columns the recommender needs.
# .copy() makes the column selection an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
movies = pd.read_csv("movies.csv")
movies = movies[['movieId', 'title', 'genres']].copy()

# "Adventure|Comedy" -> "adventure comedy": replace the pipe separator with
# spaces, lowercase, and strip. Vectorized .str methods replace the
# split + per-row lambda.
movies['genres'] = (
    movies['genres']
    .str.replace('|', ' ', regex=False)  # literal '|', not a regex alternation
    .str.lower()
    .str.strip()
)

# Bag-of-words encoding of the genre strings (one row per movie).
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(movies['genres']).toarray()

# Pairwise cosine similarity between all rows of `vector`.
similarity = cosine_similarity(vector)

def recommend(movie_title, cosine_sim=similarity, top_n=5):
    """Print the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value from ``movies['title']``, e.g.
        ``"Saving Private Ryan (1998)"``. If several rows share the title,
        the first one is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column order matches the row order
        of ``movies``.
    top_n : int, optional
        Number of recommendations to print (default 5).

    Returns
    -------
    None
        Titles are printed one per line. If ``movie_title`` is not in the
        dataset, a "not found" message is printed instead of raising
        ``IndexError``.
    """
    # Work with row *positions*: cosine_sim is position-indexed, and index
    # labels only coincide with positions for a default RangeIndex.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        print(f"No movie titled {movie_title!r} found in the dataset.")
        return
    query_pos = int(matches[0])

    # Exclude the query movie explicitly. Slicing [1:6] after sorting only
    # works if the query sorts first; when other movies tie with it at
    # similarity 1.0 (identical genre sets) the query itself can land inside
    # the slice and be recommended back to the user.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos != query_pos
    ]
    # list.sort is stable, so equally similar movies keep their dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    for pos, _ in candidates[:top_n]:
        print(movies.iloc[pos]['title'])

# The lookup is an exact match on the 'title' column, so the input must be
# typed exactly as it appears in the data, including the year suffix,
# e.g. "Saving Private Ryan (1998)".
title = input("Enter the title of your favorite movie: ")

# recommend() prints its results and returns nothing.
print("Top 5 similar movies:")
recommend(title)


Enter the title of your favorite movie: Saving Private Ryan (1998)
Top 5 similar movies:
Apocalypse Now (1979)
Boot, Das (Boat, The) (1981)
All Quiet on the Western Front (1930)
Saving Private Ryan (1998)
Thin Red Line, The (1998)
