### Dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Data Preprocessing

In [2]:
# The dataset path is relative, so it is resolved against the notebook's working directory.
DATA_PATH = 'IMDB_movies.csv'
movies = pd.read_csv(DATA_PATH)

In [3]:
movies.head(2)

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
0,Kantara,2022,Rishab Shetty,"Rishab Shetty, Sapthami Gowda, Kishore Kumar G...",9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
1,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",9.0,152,UA,$534.86M,Action,"Crime, Drama"


In [4]:
movies.shape

(5562, 10)

In [5]:
movies.isnull().sum()

Movie_Title      0
Year             0
Director         0
Actors           0
Rating           0
Runtime(Mins)    0
Censor           0
Total_Gross      0
main_genre       0
side_genre       0
dtype: int64

### Feature Extraction

In [6]:
# Text columns that are later concatenated into one document per movie.
selected_features = ['Movie_Title','main_genre','side_genre','Actors','Director']
for feature in selected_features:
    # NOTE: this overwrites the columns of `movies` in place, so from here on
    # every selected column (including 'Movie_Title') has its commas replaced
    # by spaces.
    # regex=False states the intent (a literal comma) explicitly instead of
    # relying on the pandas version's default for `regex`.
    movies[feature] = (
        movies[feature]
        .fillna('')
        .str.replace(',', ' ', regex=False)
    )

In [7]:
selected_features

['Movie_Title', 'main_genre', 'side_genre', 'Actors', 'Director']

In [8]:
#movies[selected_features]

In [9]:
# Build one text document per movie: title, genres, cast and director,
# joined left to right with a single space between consecutive columns.
combined_features = movies['Movie_Title']
for column in ('main_genre', 'side_genre', 'Actors', 'Director'):
    combined_features = combined_features + ' ' + movies[column]

In [10]:
combined_features

0       Kantara Action  Adventure   Drama Rishab Shett...
1       The Dark Knight Action  Crime   Drama Christia...
2       The Lord of the Rings: The Return of the King ...
3       Inception Action  Adventure   Sci-Fi Leonardo ...
4       The Lord of the Rings: The Two Towers Action  ...
                              ...                        
5557    Disaster Movie Comedy  Sci-Fi Carmen Electra  ...
5558    The Hottie & the Nottie Comedy  Romance Paris ...
5559    From Justin to Kelly Comedy  Musical   Romance...
5560    Superbabies: Baby Geniuses 2 Comedy  Family   ...
5561    Cumali Ceber: Allah Seni Alsin Comedy Comedy H...
Length: 5562, dtype: object

In [11]:
stemmer = SnowballStemmer('english')


def stemming_tokenizer(str_input):
    """Return `str_input` lower-cased with every whitespace-separated word stemmed.

    Each run of two or more consecutive characters outside a-z/A-Z is first
    collapsed into a single space; an isolated non-letter character (for
    example a lone hyphen or digit) is left in place. The text is then
    lower-cased, split on whitespace, stemmed word by word with the module
    level `stemmer`, and re-joined with single spaces.

    Despite the name, the result is a single string, not a list of tokens.
    """
    cleaned = re.sub(r'[^a-zA-Z]{2,}', ' ', str_input)
    return ' '.join(stemmer.stem(token) for token in cleaned.lower().split())

In [13]:
# Stem every combined document element-wise; the result keeps the same index.
stemmed_features = combined_features.map(stemming_tokenizer)

In [14]:
stemmed_features

0       kantara action adventur drama rishab shetti sa...
1       the dark knight action crime drama christian b...
2       the lord of the ring the return of the king ac...
3       incept action adventur sci-fi leonardo dicapri...
4       the lord of the ring the two tower action adve...
                              ...                        
5557    disast movi comedi sci-fi carmen electra vanes...
5558    the hotti the notti comedi romanc pari hilton ...
5559    from justin to kelli comedi music romanc kelli...
5560    superbabi babi genius comedi famili sci-fi jon...
5561    cumali ceber allah seni alsin comedi comedi ha...
Length: 5562, dtype: object

In [15]:
# Learn the TF-IDF vocabulary from the stemmed documents and encode them in
# one pass. `vectorizer` is kept around so its vocabulary can be inspected.
vectorizer = TfidfVectorizer(stop_words='english')
feature_vectors = vectorizer.fit_transform(raw_documents=stemmed_features)

In [16]:
feature_vectors.shape

(5562, 14601)

In [17]:
vectorizer.get_feature_names_out()

array(['8mm', 'aakeel', 'aaliyah', ..., 'ôdishon', 'ôkami', 'ölüml'],
      dtype=object)

In [18]:
# Pairwise cosine similarity of every movie's feature vector against all
# others; row i holds the scores of movie i versus each movie.
similarity = cosine_similarity(X=feature_vectors)
similarity.shape

(5562, 5562)

### Testing

In [None]:
movie_name = input('Enter movie name: ')

In [None]:
# Fuzzy-match the typed name against every known title.
# n=3 and cutoff=0.6 are difflib's defaults, spelled out for the reader.
movie_titles = list(movies['Movie_Title'])
movie_matches = difflib.get_close_matches(
    movie_name,
    movie_titles,
    n=3,
    cutoff=0.6,
)
movie_matches

In [None]:
# Take the best fuzzy match and look up the index of its first occurrence.
best_match = movie_matches[0]
matching_rows = movies.loc[movies['Movie_Title'] == best_match]
movie_idx = matching_rows.index.values[0]

In [None]:
# Pair every movie's row position with its similarity to the chosen movie.
similarity_scores = [
    (position, score)
    for position, score in enumerate(similarity[movie_idx])
]

In [None]:
# Rank the (position, score) pairs from most to least similar.
# The sort is stable, so equal scores keep their original order.
sorted_movies = list(similarity_scores)
sorted_movies.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Titles of the 20 highest-scoring movies.
# The list is deliberately NOT called `recommendation`: that name belongs to
# the function defined in the Compilation section, and re-running this cell
# afterwards would otherwise replace the function with a list.
# The positions come from enumerate() over a similarity row, so a positional
# lookup (.iloc) is used instead of rebuilding a boolean mask per movie.
recommended_titles = [
    movies['Movie_Title'].iloc[idx]
    for idx, _score in sorted_movies[:20]
]

recommended_titles

### Compilation

In [19]:
def recommendation(movie_title, top_n=20):
    """Return the titles of the movies most similar to `movie_title`.

    The given name is fuzzy-matched against `movies['Movie_Title']` with
    difflib, and the best match selects a row of the precomputed `similarity`
    matrix. The titles are returned ordered from the highest to the lowest
    similarity score. The matched movie itself is part of the ranking, so it
    normally appears first.

    Parameters
    ----------
    movie_title : str
        Possibly inexact movie name to look up.
    top_n : int, default 20
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to `top_n` movie titles, most similar first.

    Raises
    ------
    ValueError
        If no known title is close enough to `movie_title`.
    """
    all_titles = movies['Movie_Title'].tolist()

    # Use the function argument; the previous version read the notebook-level
    # `movie_name` variable and silently ignored what the caller passed in.
    matches = difflib.get_close_matches(movie_title, all_titles, n=1)
    if not matches:
        raise ValueError(f"No movie title resembling {movie_title!r} was found.")

    # Rows of `similarity` are positional, so locate the match by position
    # (first occurrence) rather than by index label.
    row = all_titles.index(matches[0])

    # sorted() is stable: movies with equal scores keep their original order.
    ranked = sorted(enumerate(similarity[row]), key=lambda pair: pair[1], reverse=True)

    return [all_titles[idx] for idx, _score in ranked[:top_n]]
    

In [24]:
movie_name = input('Enter movie name: ')

Enter movie name: Batman


In [25]:
recommendation(movie_name)

['Batman',
 'Batman Returns',
 'Mars Attacks!',
 'The Natural',
 'Beetle Juice',
 'Planet of the Apes',
 'The Sentinel',
 "Something's Gotta Give",
 'Cellular',
 'Grudge Match',
 'L.A. Confidential',
 'Never Say Never Again',
 'Sherlock Jr.',
 '8 Mile',
 'Nine 1/2 Weeks',
 'My Stepmother Is an Alien',
 'Edward Scissorhands',
 'The Batman',
 'The Lego Batman Movie',
 'Jack Frost']