In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns

In [3]:
df=pd.read_csv("moviesdatset.csv")

In [4]:
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [6]:
df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [7]:
df.dtypes

id                     int64
title                 object
genre                 object
original_language     object
overview              object
popularity           float64
release_date          object
vote_average         float64
vote_count             int64
dtype: object

# Feature Selection


In [8]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [9]:
df=df[['id','title','genre','overview']]

In [10]:
df

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...
...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...


In [11]:
df['tags']=df['genre']+df['overview']

In [13]:
df=df.drop(["genre","overview"],axis=1)

In [16]:
df.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."


In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Our sentences
sentences = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "The cat chased the dog."
]

# Bag of Words
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(sentences)
print("Bag of Words Matrix:")
print(bow_matrix.toarray())
print("Vocabulary:", vectorizer.get_feature_names_out())

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())


Bag of Words Matrix:
[[1 0 0 0 1 1 1 2]
 [0 0 1 1 0 1 1 2]
 [1 1 1 0 0 0 0 2]]
Vocabulary: ['cat' 'chased' 'dog' 'log' 'mat' 'on' 'sat' 'the']

TF-IDF Matrix:
[[0.37420726 0.         0.         0.         0.49203758 0.37420726
  0.37420726 0.58121064]
 [0.         0.         0.37420726 0.49203758 0.         0.37420726
  0.37420726 0.58121064]
 [0.40352536 0.53058735 0.40352536 0.         0.         0.
  0.         0.62674687]]
Vocabulary: ['cat' 'chased' 'dog' 'log' 'mat' 'on' 'sat' 'the']


In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [19]:
cv=CountVectorizer(max_features=1000,stop_words="english")
cv

In [21]:
vector=cv.fit_transform(df['tags'].values.astype('U')).toarray()

In [23]:
vector.shape

(10000, 1000)

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
similarity=cosine_similarity(vector)

In [28]:
df[df['title']=="The Godfather"].index[0]

2

In [30]:
distance = sorted(list(enumerate(similarity[2])), reverse=True, key=lambda vector:vector[1])
for i in distance[0:5]:
    print(df.iloc[i[0]].title)

The Godfather
House of Gucci
The Godfather: Part II
Felon
Gotti


In [38]:
def recommend(movies):
    index=df[df['title']==movies].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[0:5]:
        print(df.iloc[i[0]].title)

In [39]:
recommend("The Godfather")

The Godfather
House of Gucci
The Godfather: Part II
Felon
Gotti


In [40]:
import pickle

In [55]:
pickle.dump(df,open('df.pkl','wb'))

In [53]:
pickle.dump(similarity,open("similarity.pkl",'wb'))

In [57]:
pickle.load(open('df.pkl','rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."


In [48]:
pickle.load(open('movies_list.pkl','rb'))

EOFError: Ran out of input

In [54]:
pickle.load(open("similarity.pkl",'rb'))

array([[1.        , 0.11009638, 0.09534626, ..., 0.1254363 , 0.11396058,
        0.05025189],
       [0.11009638, 1.        , 0.17320508, ..., 0.        , 0.        ,
        0.        ],
       [0.09534626, 0.17320508, 1.        , ..., 0.0438529 , 0.05976143,
        0.        ],
       ...,
       [0.1254363 , 0.        , 0.0438529 , ..., 1.        , 0.05241424,
        0.04622502],
       [0.11396058, 0.        , 0.05976143, ..., 0.05241424, 1.        ,
        0.06299408],
       [0.05025189, 0.        , 0.        , ..., 0.04622502, 0.06299408,
        1.        ]])

In [50]:
pickle.load(open("movies_list.pkl",'rb'))

EOFError: Ran out of input