In [1]:
text=["London Paris London","Paris Paris London"]
#Now we need to find a way to represent these texts as vectors. The CountVectorizer() class from 
#sklearn.feature_extraction.text librarycan do this for us. We need to import this library before we can create a new
#CountVectorizer() object

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer()
count_matrix = cv.fit_transform(text)
#count_matrix gives us a sparse matrix . To make it in human readable form , we need to apply  toarray() method over it.
#And before printing out this count_matrix, let us first print out the feature list(or,word list), which have been fed to our 
#CountVectorizer() object

In [3]:
print(cv.get_feature_names())
print(count_matrix.toarray())
#This indicates that the word 'london' occurs 2 times in A and 1 time in B.Similarly,the word 'paris' occurs 1 time in A and 2 times in B .Makes sense. right?
#now,we need to find cosine(or "cos")similarly between these vectors to find out how similar they are from each other . We can calculate this using cosine_similarity() function from sklearn.metrics.pairwise library

['london', 'paris']
[[2 1]
 [1 2]]


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(count_matrix)
print(similarity_scores)

[[1.  0.8]
 [0.8 1. ]]


In [5]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df=pd.read_csv("C:\\Users\\happy\\Winter Training\\movie_dataset.csv")

#If you visualize the dataset, you will see that it has many extra info about a movie .we don't need all of them. So, we choose keywords,cast,genres and director column to use as our feature set(the so called "content" of the movie).

In [6]:
features=['keywords','cast','genres','director']
#our next task is to create a function for combining the values of these columns into a single string


In [7]:
def combine_features(row):
    return row['keywords']+" "+row['cast']+" "+row['genres']+" "+row['director']
#Now, we need to call this function over each row of our dataframe. But, before doing that , we need to clean and preprocess the 
#data for our use . We will fill all the NaN values with blank string in the dataframe.

In [8]:
for feature in features:
    df[feature]=df[feature].fillna('')# filling all NaNs with blank string.
df["combined_features"] = df.apply(combine_features,axis=1)#applying combined features() method over each rows of dataframe and storing the combined string in"combined_features" columns

In [9]:
df.iloc[0].combined_features

'culture clash future space war space colony society Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez Action Adventure Fantasy Science Fiction James Cameron'

In [10]:
#Now that we obtained the combined strings, we can now feed these strings to a CountVectorizer() 
#object for getting the count matrix.

In [11]:
cv= CountVectorizer() #Creating new CountVectorizer() object
count_matrix= cv.fit_transform(df["combined_features"]) #feeding combined strings(movie contents) to Count_Vectorizer() object

In [12]:
#At this point , 60% work is done . Now, we need to obtain the cosine similarity matrix from the count matrix.

In [13]:
cosine_sim = cosine_similarity(count_matrix)

In [14]:
#Now we will define two helper functions to get movie title from  movie index and vice-versa

In [15]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [16]:
#our next step is to get the title of the movie that user currently likes .Then we wwill find the index of that  movie after that we will
#access the row corresponding to this similarity matrix.Thus we will get the similarity scores of all other movies from the current movies
#then we will enumerate through all the similarity scores of that movie to make a tuple of movie index and similarity score.This will convert a row similarity
#scores like this - [1 0.5 0.2 0.9] to this - [(0,1)(1,0.5)(2,0.2)(3,0.9)]. Here each item is in this form-(movie index,similarity score)

In [17]:
movie_user_likes="X-Men: First Class"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))# accessing the row corresponding to given 

In [18]:
#Now comes the most vital point . we will sport the list similar_movies according to similarity scores in descending order.
#since the most similar movie is to a given movie will be itselfm we will discard the first element after sorting the movies.

In [19]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

In [20]:
#Now, we will run a loop to print first 5 entries from sorted_similar_movies list

In [21]:
i=0
print("top 5 similar movies to "+movie_user_likes+"are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>20:
        break

top 5 similar movies to X-Men: First Classare:

X-Men: Days of Future Past
X-Men: Apocalypse
Ant-Man
X-Men
X2
The Wolverine
Man of Steel
X-Men: The Last Stand
Iron Man 2
The Helix... Loaded
Captain America: The Winter Soldier
Fantastic Four
Avengers: Age of Ultron
X-Men Origins: Wolverine
The Avengers
Superman Returns
Iron Man
Guardians of the Galaxy
The Hunger Games: Catching Fire
The Incredible Hulk
The Hunger Games: Mockingjay - Part 2
