In [2]:
import pandas as pd

In [5]:
movies=pd.read_csv("dataset.csv")

In [14]:
#DATA DESCRIPTION COMMANDS
# movies.head(10)   - gives details of first 10 movies
# movies.descrive() - give dataset details
# movies.info()     - gives concise summary of dataframe

### FEATURE SELECTION PART

In [16]:
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [27]:
# Needed details mostly - id, title, genre, overview
# (id for getting the right posters)
movies = movies[["id","title","overview","genre"]]
# Since content based recomendation system, we merge the "overview" and "genre" part
movies["tags"]=movies["overview"]+movies["genre"]

In [41]:
new_data = movies.drop(columns=["overview","genre"])
#new_data

### Actual Data Work

In [55]:
# Now to convert the tags into essentially vectorls for ml to work
# stop_words are those word in english that have no contribbution to the meaning of the sentence, ex: is, are, the, of etc..
# max_features basicaly joins all the tags and then takes the top 5000 most common workds tht arnt stop_words

''' The logic is to take those 5000 words and compare them on (10000,10000) vector chart, and to reccomend, we take the top most
    closest vectors to the input. Essential we cant create a "similarity" score for each tags since they are not numbers 
    so we do this approach.'''

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000,stop_words="english")

In [56]:
vector=cv.fit_transform(new_data["tags"].values.astype("U")).toarray()
#vector.shape - gives the shape of the vecotr

In [97]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vector)

Converts a collection of text documents into a matrix of token (word) counts.

🧠 In Simple Words:
It transforms text into numbers — so ML models can understand and use it.

📥 Input:
A list of strings (documents):
texts = ["I love movies", "I love Python", "Movies are fun"]
📤 Output:
A matrix like:

       are fun i love movies python
Doc1	0	0	1	1	1	0
Doc2	0	0	1	1	0	1
Doc3	1	1	0	0	1	0

This is a sparse matrix of word counts.

In [98]:
 #similarity

In [99]:
# to get title we need index (not id), example
# new_data[new_data["title"]=="The Godfather"].index[0] - gives output 2
# this is an example if we the entered movie is "the godfather"
distance = sorted(list(enumerate(similarity[2])), reverse=True,key =lambda vector: vector[1])
#this will give the index for the closest ones to the entered (ex, heres its the godfather)
for i in distance[0:10]:
    print(new_data.iloc[i[0]].title)

The Godfather
The Godfather: Part II
Blood Ties
Joker
Bomb City
Gotti
Felon
Rope
Batman: The Killing Joke
The Big Heat


In [100]:
# Now making it into an actual function
def recommend(movies):
    index = new_data[new_data["title"]==movies].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True,key =lambda vector: vector[1])   
    for i in distance[0:10]:
        print(new_data.iloc[i[0]].title)

In [101]:
# Example
recommend("Iron Man")

Iron Man
Iron Man 3
Guardians of the Galaxy Vol. 2
Avengers: Age of Ultron
Star Wars: Episode III - Revenge of the Sith
G.O.R.A.
Iron Man 2
Charlie's Angels
Star Wars: Episode I - The Phantom Menace
The Rocketeer


In [103]:
# Here the function is done mainly
# Now we can export the data in a pickle file
import pickle
f1=open("movies_list.pkl","wb")
f2=open("similarity.pkl","wb")
pickle.dump(new_data,f1)
pickle.dump(similarity,f2)
f1.close()
f2.close()