# Develop a movie recommendation model using the scikit-learn library in Python. Refer to the dataset:
https://github.com/rashida048/Some-NLP-Projects/blob/master/movie_dataset.csv

In [47]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [48]:
# Load the movie metadata from the CSV file next to this notebook.
# NOTE(review): the notebook title links to movie_dataset.csv, but this cell
# reads BollywoodMovieDetail.csv -- confirm which dataset is intended.
data = pd.read_csv(filepath_or_buffer="BollywoodMovieDetail.csv")

In [49]:
# Preview the loaded DataFrame (the rich display truncates long frames).
data

Unnamed: 0,imdbId,title,releaseYear,releaseDate,genre,writers,actors,directors,sequel,hitFlop
0,tt0118578,Albela,2001,20 Apr 2001,Romance,Honey Irani (screenplay) | Honey Irani (story)...,Govinda | Aishwarya Rai Bachchan | Jackie Shro...,Deepak Sareen,0.0,2
1,tt0169102,Lagaan: Once Upon a Time in India,2001,08 May 2002,Adventure | Drama | Musical,Ashutosh Gowariker (story) | Ashutosh Gowarike...,Aamir Khan | Gracy Singh | Rachel Shelley | Pa...,Ashutosh Gowariker,0.0,6
2,tt0187279,Meri Biwi Ka Jawab Nahin,2004,02 Jul 2004,Action | Comedy,,Akshay Kumar | Sridevi | Gulshan Grover | Laxm...,Pankaj Parashar | S.M. Iqbal,0.0,1
3,tt0222024,Hum Tumhare Hain Sanam,2002,24 May 2002,Drama | Romance,K.S. Adiyaman | Arun Kumar (assistant dialogue...,Shah Rukh Khan | Madhuri Dixit | Salman Khan |...,K.S. Adiyaman,0.0,4
4,tt0227194,One 2 Ka 4,2001,30 Mar 2001,Action | Comedy | Drama,Sanjay Chhel | Raaj Kumar Dahima (screenplay) ...,Shah Rukh Khan | Juhi Chawla | Jackie Shroff |...,Shashilal K. Nair,0.0,1
...,...,...,...,...,...,...,...,...,...,...
1279,tt4121522,Shuruaat Ka Interval,2014,15 Aug 2014,,Aarti S. Bagdi | Amrit Raj Gupta | Krishan Hoo...,Kaivalya Chheda | Rajsekhar Aningi | Avidant B...,Aarti S. Bagdi | Amrit Raj Gupta | Krishan Hoo...,0.0,1
1280,tt4187650,Trip to Bhangarh,2014,28 Aug 2014,Horror,,Manish Choudhary | Suzanna Mukherjee | Poonam ...,Jitendra Pawar,0.0,1
1281,tt4190220,Zed Plus,2014,28 Nov 2014,,Chandra Prakash Dwivedi (screen play and dialo...,Adil Hussain | Ekavali Khanna | Kulbhushan Kha...,Chandra Prakash Dwivedi,0.0,1
1282,tt4219300,Gollu aur Pappu,2014,21 Nov 2014,Comedy,,Vir Das | Sandeepa Dhar | Rahul Handa | Dimple...,Kabir Sadanand,0.0,1


In [50]:
# Column dtypes and non-null counts -- shows which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1284 entries, 0 to 1283
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdbId       1284 non-null   object 
 1   title        1284 non-null   object 
 2   releaseYear  1284 non-null   int64  
 3   releaseDate  1231 non-null   object 
 4   genre        1282 non-null   object 
 5   writers      1165 non-null   object 
 6   actors       1281 non-null   object 
 7   directors    1280 non-null   object 
 8   sequel       1281 non-null   float64
 9   hitFlop      1284 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 100.4+ KB


In [51]:
# (number of rows, number of columns)
data.shape

(1284, 10)

In [52]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,releaseYear,sequel,hitFlop
count,1284.0,1281.0,1284.0
mean,2007.989097,0.035129,2.144081
std,4.005243,0.188371,1.81282
min,2001.0,0.0,1.0
25%,2004.0,0.0,1.0
50%,2008.0,0.0,1.0
75%,2011.0,0.0,2.0
max,2014.0,2.0,9.0


In [53]:
# Count the missing values in each column.
data.isna().sum()

imdbId           0
title            0
releaseYear      0
releaseDate     53
genre            2
writers        119
actors           3
directors        4
sequel           3
hitFlop          0
dtype: int64

In [54]:
# Column names available for feature engineering.
data.columns

Index(['imdbId', 'title', 'releaseYear', 'releaseDate', 'genre', 'writers',
       'actors', 'directors', 'sequel', 'hitFlop'],
      dtype='object')

# Feature Engineering

In [55]:
# Cell 1: Feature Engineering
#
# Build one lowercase text "document" per movie from its genre, writers,
# actors and directors, then turn those documents into a TF-IDF matrix.
# TfidfVectorizer is imported in the notebook's import cell at the top.

# Text columns that feed the similarity model, in concatenation order.
FEATURE_COLUMNS = ['genre', 'writers', 'actors', 'directors']

# Fill missing values with empty strings so the concatenation below never
# produces NaN for a movie that lacks one of the fields.
data[FEATURE_COLUMNS] = data[FEATURE_COLUMNS].fillna('')

# Combine relevant features into a single space-separated, lowercased string
combined_features = data[FEATURE_COLUMNS].agg(' '.join, axis=1).str.lower()

# Initialize the TF-IDF Vectorizer
# NOTE(review): the default tokenizer splits every name into single-word
# tokens (a shared surname alone counts as overlap) and credit annotations
# such as "(screenplay)" become tokens too -- consider tokenizing on '|' if
# the recommendations look noisy.
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(combined_features)

# Get the shape of the TF-IDF matrix: (number of movies, vocabulary size)
tfidf_matrix.shape


(1284, 3509)

# Similarity Calculation

In [56]:
# Pairwise movie-to-movie similarity matrix.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)


# Recommendation

In [57]:
def recommend_movies(movie_title, data=None, cosine_sim=None, top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``data['title']``. If several rows share
        the title, the first one is used.
    data : pandas.DataFrame, optional
        Frame with a ``'title'`` column whose row order matches
        ``cosine_sim``. Defaults to the notebook-level ``data``, looked up
        at call time so a re-run upstream cell is picked up.
    cosine_sim : array-like of shape (n_movies, n_movies), optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``cosine_sim``, looked up at call time.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar (ties keep dataset order),
        never including the queried row itself.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``data['title']``.
    ValueError
        If ``top_n`` is negative.
    """
    # Resolve defaults lazily instead of freezing them at definition time.
    if data is None:
        data = globals()['data']
    if cosine_sim is None:
        cosine_sim = globals()['cosine_sim']
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Use the row *position* (not the index label): cosine_sim is positional,
    # so labels would be wrong for any frame without a default RangeIndex.
    positions = np.flatnonzero((data['title'] == movie_title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    query_pos = int(positions[0])

    scores = np.asarray(cosine_sim[query_pos], dtype=float)
    # Stable descending sort: equal scores stay in dataset order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly; it is not guaranteed to sort first
    # when another movie has an identical (or tied) similarity score.
    ranked = ranked[ranked != query_pos][:top_n]
    return data['title'].iloc[ranked]
# Example query: recommendations for one title from the dataset.
recommended_movies = recommend_movies(movie_title='Hum Tumhare Hain Sanam')


In [58]:
# Show the recommended titles (index = row position in the dataset).
recommended_movies

302     Shaadi Karke Phas Gaya Yaar
532                     Aaja Nachle
1089           Life Ki Toh Lag Gayi
54                Moksha: Salvation
5                            Devdas
222             Yeh Lamhe Judaai Ke
571                           Hello
814                   Prem Kaa Game
622                            Veer
885                         Talaash
Name: title, dtype: object