# Content Based Movie Recomendation Engine in Python.

### Import the Necessary Libraries 

In [1]:
#IMPORT THE NECESSARY LIBRARIES

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

### Read the Data 

In [2]:
#Read The CSV file SCraped in the Recommendation DataFrame Scrape DataSet:

data = pd.read_csv('movie_recommend.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Movie Rating,Director,Release Year,Genres,Cast,TagLine,Top Review,Meta Score
0,0,Shingeki no kyojin,9.0,Yûki Kaji,2013,",Animation,Action,Adventure,Drama,Fantasy,Horror","Josh Grelle,Yûki Kaji,Marina Inoue",The face of humanity's extinction. (season 1),Legendary show.,0.0
1,1,American Horror Stories,6.1,Brad Falchuk,2021,",Drama,Horror,Thriller","Sierra McCormick,Paris Jackson,Merrin Dungey",A,What even is this,0.0
2,2,Black Widow,6.8,Cate Shortland,2021,",Action,Adventure,Sci-Fi","Jac Schaeffer,Ned Benson,Scarlett Johansson",She's Done Running From Her Past.,MCU Version of a Filler Episode,67.0
3,3,What If...?,7.6,Jeffrey Wright,2021,",Animation,Action,Adventure,Fantasy","Matthew Wood,Jeffrey Wright,Terri Douglas",Enter the Multiverse of infinite possibilities,"Marvel, please take your cue from the original...",0.0
4,4,Afterlife of the Party,5.8,Stephen Herek,2021,",Comedy,Drama,Fantasy,Romance","Victoria Justice,Midori Francis,Robyn Scott",She's got one more chance to fix the life she ...,Hope,0.0


### Get the Measure of Central Tendencies:

In [3]:
#LOOK AT THE DESCRIPTION OF THE DATASET:

data.describe()

Unnamed: 0.1,Unnamed: 0,Movie Rating,Release Year,Meta Score
count,750.0,750.0,750.0,750.0
mean,374.5,7.464,2015.214667,21.086667
std,216.65064,1.059428,8.156385,30.29643
min,0.0,3.3,1967.0,0.0
25%,187.25,6.9,2012.0,0.0
50%,374.5,7.55,2019.0,0.0
75%,561.75,8.3,2021.0,50.0
max,749.0,9.4,2022.0,94.0


### Get the Data Type of the Columns in the Dataset:

In [4]:
#GET THE DATATYPE OF THE ELEMENTS IN THE DATASET:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    750 non-null    int64  
 1   Title         750 non-null    object 
 2   Movie Rating  750 non-null    float64
 3   Director      750 non-null    object 
 4   Release Year  750 non-null    int64  
 5   Genres        750 non-null    object 
 6   Cast          750 non-null    object 
 7   TagLine       750 non-null    object 
 8   Top Review    750 non-null    object 
 9   Meta Score    750 non-null    float64
dtypes: float64(2), int64(2), object(6)
memory usage: 58.7+ KB


### Identify Number of Duplicate Values in the DataSet 

In [5]:
#IDENTIFY THE DUPLICATE ELEMENTS IN THE DATA SET BY TITLES:

data['Title'].duplicated().value_counts()

True     387
False    363
Name: Title, dtype: int64

## Data Transformation:
- Drop the Duplicate elements in the Data Set.
- Combine all the Features in the Data Set into a Single Column.
- Shuffle the elements of the DataSet and reset the Index of the dataframe.
- Fill Unavailable information in the Top Review pandas Series.

In [6]:
#DEFINE A FUNCTION TO TRANSFORM THE DATA AND REFINE THE DATASET BY DROPPING THE DUPLICATE COLUMNS:

def DATA_TRANSFORM(data):
    data = data.drop_duplicates('Title', keep='last')
    data['Combined Features'] =  data['Title'] + ',' + data['Director'] + ',' + data['Genres'] + ',' + data['Cast'] + ',' + data['TagLine'] + ',' + data['Top Review']
    data = data.sample(frac=1)
    data.reset_index(drop=True,inplace=True)
    data.drop(data.columns[0],inplace=True,axis=1)
    data['Top Review'].fillna('Information Unavailable',inplace=True)
    return data

### Check for NULL Values:

In [7]:
#GET THE TRANSFORMED DATA:
data = DATA_TRANSFORM(data)
#CHECK FOR NULLL VALUES AFTER PROCESSING:
data.isnull().sum()

Title                0
Movie Rating         0
Director             0
Release Year         0
Genres               0
Cast                 0
TagLine              0
Top Review           0
Meta Score           0
Combined Features    0
dtype: int64

### Similarity Scores
- Pass the Combined Features Column into the CountVectorizer function.
- Get the Similarity Scores from the Cosine Similarity Function.
- Sort the Movies in the Descending order of the Similarity Scores.
- Return the sorted List of similar movies.

In [8]:
#INITIALIZE THE OBJECTS AND COMPUTE SIMILARITY SCORES:

cv = CountVectorizer(stop_words='english')
count_matrix = cv.fit_transform(data['Combined Features'])
similarity_scores = cosine_similarity(count_matrix)

#DEFINE GET SIMILAR MOVIES FUNCTION TO GET THE INDEXES OF THE SIMILAR MOVIES TO THE MOVIE LIKED BY THE USER:
def GET_SIMILAR_MOVIES(movie_liked_by_user):
    movie_index = data[data['Title'] == movie_liked_by_user].index.values[0]
    similar_movies = list(enumerate(similarity_scores[movie_index]))
    sorted_similar = sorted(similar_movies,key=lambda x:x[1],reverse=True)
    return sorted_similar

## Function to get the Top 10 Recommendations:
- Get the following Information from the Data Set for the Recommended Movies.
    - Movie Index
    - Movie Title
    - Genres
    - Director
- Create a Dictionary of the Information Available.
- Convert the obtained information into a Data Set.

In [9]:
#Initialize Empty Lists to Store the Values:
def Final_Recommendation(title):
    recommend_movies = []
    serial_number = []
    movie_genre = []
    movie_director = []
    
    #Get the Similar Movies from the Indexes of the Similar Sorted Movie By calling the Get Similar Movies Function:
    i = 0
    similar_movies = GET_SIMILAR_MOVIES(title)
    for index in similar_movies:
        i = i + 1
        recommend_movies.append(data[data.index == index[0]]['Title'].values[0])
        serial_number.append(i)
        movie_genre.append(data[data.index == index[0]]['Genres'].values[0])
        movie_director.append(data[data.index == index[0]]['Director'].values[0])
        if(i>9):
            break


    #Create A Dictionary of the Available Columns:

    Recommend_Dict ={'S. No.':serial_number,
                     'Title':recommend_movies,
                     'Genres':movie_genre,
                     'Director':movie_director}


    #Create a DataFrame out of the returned Columns and the Dictionary

    Recommend_Df = pd.DataFrame(Recommend_Dict)
    Recommend_Df.set_index('S. No.',inplace=True)
    
    return Recommend_Df

## Get the Movie Which the User Likes from the User:
- If the Movie Does not exist in the dataset, we print information unavailable.
- If the Movies is present in the Dataset, we return the top 10 recommended Movies.

In [10]:
Movie_Liked_input = str(input('Please Type the movie which you liked:'))
if(Movie_Liked_input in data['Title'].values):
    print('Movie found in the Scraped Dataset. Fetching you the Recommended Movies!')
else:
    print('Movie not found in the Scraped Dataset. Please try once again')


Please Type the movie which you liked:Cinderella
Movie found in the Scraped Dataset. Fetching you the Recommended Movies!


In [11]:
Final_Recommendation(Movie_Liked_input)

Unnamed: 0_level_0,Title,Genres,Director
S. No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Cinderella,",Adventure,Comedy,Family,Fantasy,Musical,Romance",Kay Cannon
2,Raya and the Last Dragon,",Animation,Action,Adventure,Comedy,Family,Fantasy",Don Hall
3,Harley Quinn,",Animation,Action,Adventure,Comedy,Crime,Fanta...",Justin Halpern
4,The Addams Family 2,",Animation,Adventure,Comedy,Family,Fantasy,Hor...",Greg Tiernan
5,The Boss Baby: Family Business,",Animation,Adventure,Comedy,Family,Fantasy",Tom McGrath
6,Hotel Transylvania: Transformania,",Animation,Adventure,Comedy,Family,Fantasy,Hor...",Derek Drymon
7,Nightbooks,",Family,Fantasy,Horror,Mystery",David Yarovesky
8,Adventure Time,",Animation,Action,Adventure,Comedy,Family,Fant...",Pendleton Ward
9,Vivo,",Animation,Adventure,Comedy,Family,Musical,Add...",Kirk DeMicco
10,Luca,",Animation,Adventure,Comedy,Family,Fantasy",Enrico Casarosa


# Thanks!!