# CA05 – kNN based Movie Recommender Engine

In [1]:
# Importing the packages we use the most
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Part 1: Data Source and Contents

In [2]:
# Load the movie data set directly from the course's GitHub repository (CSV over HTTPS)
url = "https://github.com/ArinB/MSBA-CA-Data/raw/main/CA05/movies_recommendation_data.csv"
df = pd.read_csv(url)
# Preview the first rows to confirm the load worked
df.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [3]:
# Checking its shape as (rows, columns)
df.shape

(30, 11)

In [4]:
# Checking each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB


In [5]:
# Counting missing values per column
df.isnull().sum()

Movie ID       0
Movie Name     0
IMDB Rating    0
Biography      0
Drama          0
Thriller       0
Comedy         0
Crime          0
Mystery        0
History        0
Label          0
dtype: int64

In [6]:
# Descriptive statistics for the numeric columns
df.describe()

Unnamed: 0,Movie ID,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,48.133333,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1,0.0
std,29.288969,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129,0.0
min,1.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.75,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.5,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.25,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98.0,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [7]:
# Drop the Label column: it is all zeroes, and we are not using this data set
# for classification or regression.
# errors='ignore' keeps this cell safe to re-run: without it, executing the cell
# a second time raises KeyError because 'Label' has already been removed from df.
df = df.drop(columns=['Label'], errors='ignore')
df.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0


## Part 2: Building your own Recommender System

### Splitting the data, since our model only takes numerical variables


In [8]:
# Separate the numeric feature columns (what the model is fitted on) from the
# non-numeric movie titles (used only to report the recommendations).
feature_columns = [
    'IMDB Rating',
    'Biography',
    'Drama',
    'Thriller',
    'Comedy',
    'Crime',
    'Mystery',
    'History',
]
numeric_data = df.loc[:, feature_columns]
nn_data = df.loc[:, ['Movie Name']]  # nn stands for non numeric

# Show the first rows of each part to confirm the split
print(numeric_data.head())
print(nn_data.head())

   IMDB Rating  Biography  Drama  Thriller  Comedy  Crime  Mystery  History
0          8.0          1      1         1       0      0        0        0
1          7.7          0      1         0       0      0        1        0
2          8.2          1      1         0       0      0        0        0
3          8.3          0      1         0       0      0        0        0
4          8.8          0      1         0       0      0        0        0
           Movie Name
0  The Imitation Game
1          Ex Machina
2    A Beautiful Mind
3   Good Will Hunting
4        Forrest Gump


### Using kNN model to build our system

In [9]:
from sklearn.neighbors import NearestNeighbors

# We want to recommend 5 similar movies, so the model looks up 5 neighbors
N_RECOMMENDATIONS = 5
model = NearestNeighbors(n_neighbors=N_RECOMMENDATIONS)

# Fit on the numeric feature columns only
model.fit(numeric_data)


## Part 3: Making a Recommendation

### Following is the genre information about the movie “The Post”
- IMDB Rating = 7.2, Biography = Yes, Drama = Yes, Thriller = No, Comedy = No, Crime = No, Mystery = No, History = Yes

In [10]:
# Manually encode "The Post" (Yes -> 1, No -> 0), keyed by feature name so each
# value is visibly tied to its column. Insertion order matches the column order
# of the numeric features the model was fitted on.
the_post_features = {
    'IMDB Rating': 7.2,
    'Biography': 1,
    'Drama': 1,
    'Thriller': 0,
    'Comedy': 0,
    'Crime': 0,
    'Mystery': 0,
    'History': 1,
}
the_post = list(the_post_features.values())

In [11]:
# The model was fitted on a DataFrame, so it remembers the feature names.
# Querying it with a bare list makes scikit-learn warn that "X does not have
# valid feature names"; wrapping the query in a one-row DataFrame with the same
# columns avoids the warning without changing the result.
the_post_query = pd.DataFrame([the_post], columns=numeric_data.columns)

# Using our model to find the rows/movies nearest to "The Post".
# Both arrays have one row per query point; we sent a single query.
distances, indices = model.kneighbors(the_post_query)

# Finding the names of the 5 most similar movies (positional lookup in one go,
# instead of materialising a full row per neighbor)
for movie_name in df['Movie Name'].iloc[indices[0]]:
    print(movie_name)


12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind




### Please ignore the warning; it does not affect the results of our model
