In [14]:
from pathlib import Path
import pathlib


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import metrics
from keras import optimizers
from tensorflow.keras.utils import plot_model


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel

In [6]:
# Folder holding the IMDB metadata CSV files, relative to the notebook's
# working directory.
imdb_dir = './Data/imdb metadata/'
imdb_path = Path(imdb_dir)

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column with mixed content ends up with one
# consistent dtype rather than a per-chunk mix of types.
# NOTE(review): the source line echoed in this cell's saved output looks like
# the tail of pandas' mixed-types DtypeWarning -- confirm it is gone on re-run.
df = pd.read_csv(imdb_path / 'movies_metadata.csv', low_memory=False)
rating_df = pd.read_csv(imdb_path / 'ratings_small.csv')

# Preview the first rows of the ratings table.
rating_df.head()

  df = pd.read_csv(imdb_path / 'movies_metadata.csv')


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
# Show the column labels of the movies metadata frame.
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [7]:
# List the distinct raw values stored in the 'spoken_languages' column.
df['spoken_languages'].unique()

array(["[{'iso_639_1': 'en', 'name': 'English'}]",
       "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]",
       "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]",
       ...,
       "[{'iso_639_1': 'sv', 'name': 'svenska'}, {'iso_639_1': 'de', 'name': 'Deutsch'}]",
       "[{'iso_639_1': 'ar', 'name': 'العربية'}, {'iso_639_1': 'pl', 'name': 'Polski'}]",
       "[{'iso_639_1': 'ff', 'name': 'Fulfulde'}, {'iso_639_1': 'en', 'name': 'English'}]"],
      dtype=object)

## Exploratory data analysis (EDA)

- We have 24 features: 4 of them are float64 and the rest are objects ("text").
- A lot of the features contain null values, which we need to deal with.
- 

In [8]:
# Print the per-column non-null counts and dtypes of the movies frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

# We are trying to build a simple recommendation system
## Exercise

Build a simple recommender system based on the metric below:
- weightedRating (WR) = $\frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$
 * v >> the number of votes for the movie (`vote_count`).
 * m >> the minimum number of votes required to be listed in the chart (based on negative vote; the code below uses the 90th percentile of `vote_count`).
 * R >> the average rating of the movie (`vote_average`).
 * C >> the mean vote across the whole report (calculated from the data).

In [33]:
# C: average of 'vote_average' taken over every movie in the metadata frame
C = df['vote_average'].mean()
print(C)

# M: 90th-percentile 'vote_count', used as the minimum-votes cutoff
M = df['vote_count'].quantile(0.9)
print(M)

# Work on a copy of the frame, restricted to movies that reach the cutoff
df_new = df.copy().loc[lambda frame: frame['vote_count'] >= M]
print(df_new.shape)

# Rating and vote-count columns of the filtered frame
R, V = df_new['vote_average'], df_new['vote_count']

5.618207215134185
160.0
(4555, 24)


In [38]:
def calculate_recommendation(x, C = C, M = M):
    """Return the weighted-rating score for ``x``.

    The score blends the item's own average rating with the prior ``C``:
    the item's rating is weighted by ``votes / (votes + M)`` and ``C`` is
    weighted by ``M / (votes + M)``.

    Parameters
    ----------
    x : object indexable by ``'vote_count'`` and ``'vote_average'``, for
        example a row handed over by ``DataFrame.apply(..., axis=1)``.
    C : rating that the score is pulled towards; defaults to the
        module-level ``C`` as it was when this function was defined.
    M : vote-count weight given to ``C``; defaults to the module-level
        ``M`` as it was when this function was defined.

    Returns
    -------
    ``votes / (votes + M) * rating + M / (votes + M) * C``
    """
    votes = x['vote_count']
    rating = x['vote_average']

    # Shared denominator of both weights.
    total_weight = votes + M
    own_share = votes / total_weight
    prior_share = M / total_weight

    return own_share * rating + prior_share * C

In [40]:
# Score every movie with a single vectorized call. calculate_recommendation
# only indexes 'vote_count' / 'vote_average' and does arithmetic on them, so
# passing the whole frame yields the same per-movie values (and the same
# index) as a row-wise apply(axis=1), without one Python-level call per row.
df_new['score'] = calculate_recommendation(df_new)

# Preview the first computed scores.
df_new['score'].head()

10309    8.421453
39085    7.284861
40251    8.112532
834      8.425439
314      8.445869
Name: score, dtype: float64

In [41]:
# Rank the movies from the highest to the lowest weighted-rating score
df_new = df_new.sort_values(by='score', ascending=False)

# Show the 15 best-scoring movies with the columns that explain the ranking
df_new.head(15)[['title', 'vote_count', 'vote_average', 'score']]

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171
