# <center> IMDB-Movies Dataset Cleaning </center>

* Load dataset
    - dataset 1 
    - dataset 2

* Merge dataset 
    - 'id' 

* Missing Values checking
    - Replace Null value with 0.

* Drop Duplicates

* Feature Engineering
    - Dictunary to list 

* Addition Columns   
    - cast_size
    - crew_size
    - director
    - weight_rating
    - revenue_div_budget

## Load Datasets

In [1]:
import pandas as pd
import ast
from ast import literal_eval


# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('the-movies-dataset/movies_metadata.csv')
data_credit = pd.read_csv("the-movies-dataset/credits.csv")

In [3]:
data["id"] =pd.to_numeric(data['id'], errors='coerce',downcast="integer")

In [4]:
data = data.merge(data_credit,on=["id"],how="left")

In [5]:
data.isnull().sum()

adult                        0
belongs_to_collection    41039
budget                       0
genres                       0
homepage                 37747
id                           3
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25103
title                        6
video                        6
vote_average                 6
vote_count                   6
cast                         4
crew                         4
dtype: int64

2. Drop Duplications

In [6]:
data = data.drop_duplicates()

3. Replace non and eliminate outliers(objects)

In [7]:
import numpy as np
data['budget'] = data['budget'].replace(0, np.nan)
data["budget"] =pd.to_numeric(data['budget'], errors='coerce',downcast="integer")
data.dropna(subset=["budget"],inplace=True)

data['revenue'] = data['revenue'].replace(0, np.nan)

data.dropna(subset=["runtime"],inplace=True)

4. Dict to list

In [8]:
data['cast'] = data['cast'].replace(0, {})
data['crew'] = data['crew'].replace(0, {})

In [9]:
data.dropna(subset=["cast"],inplace=True)

In [10]:
data.dropna(subset=['crew'], inplace=True)

In [11]:
data['cast'] = data['cast'].apply(literal_eval)
data['crew'] = data['crew'].apply(literal_eval)
data['cast_size'] = data['cast'].apply(lambda x: len(x))
data['crew_size'] = data['crew'].apply(lambda x: len(x))

In [12]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
data['director'] = data['crew'].apply(get_director)

In [13]:
vote_counts = data[data['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = data[data['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.75)

def weighted_rating(x):
    v = x['vote_count']+1 # added +1 - Dan
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

data['weight_rating'] = data.apply(weighted_rating, axis=1)


In [14]:
data['release_date'] =  data.release_date.fillna('0000-00-00')

release_year = []
for n,i in enumerate(data.release_date):
    j = i .split(' ')[0].split('-')
    release_year.append(j[0])
data['release_year'] = release_year

In [15]:
#data['keywords'] = data['keywords'].apply(literal_eval)
data['revenue_di_budget'] = data['revenue'] / data['budget']

In [16]:
data['genres'] = data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [17]:
data.genres

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45537                 [Drama, Family]
45538                         [Drama]
45539       [Action, Drama, Thriller]
45540                              []
45541                              []
Name: genres, Length: 45202, dtype: object

In [18]:
data['belongs_to_collection'] = data['belongs_to_collection'].fillna("[]").apply(ast.literal_eval).apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)

In [19]:
data['spoken_languages'] = data['spoken_languages'].fillna("[]").apply(ast.literal_eval).apply(lambda x: x['name'] if isinstance(x, dict) else np.nan)

In [20]:
print('Dataset shape',data.shape)
data.head(2)

Dataset shape (45202, 32)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_average,vote_count,cast,crew,cast_size,crew_size,director,weight_rating,release_year,revenue_di_budget
0,False,Toy Story Collection,30000000.0,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",13,106,John Lasseter,7.684743,1995,12.451801
1,False,,65000000.0,"[Adventure, Fantasy, Family]",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",26,16,Joe Johnston,6.877145,1995,4.043035


In [21]:
data = data.drop(['cast','crew'], axis = 1) 

In [22]:
s = data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
data = data.drop('genres', axis=1).join(s)

In [23]:
data

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,video,vote_average,vote_count,cast_size,crew_size,director,weight_rating,release_year,revenue_di_budget,genre
0,False,Toy Story Collection,30000000.0,http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,False,7.7,5415.0,13,106,John Lasseter,7.684743,1995,12.451801,Animation
0,False,Toy Story Collection,30000000.0,http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,False,7.7,5415.0,13,106,John Lasseter,7.684743,1995,12.451801,Comedy
0,False,Toy Story Collection,30000000.0,http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,False,7.7,5415.0,13,106,John Lasseter,7.684743,1995,12.451801,Family
1,False,,65000000.0,,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,False,6.9,2413.0,26,16,Joe Johnston,6.877145,1995,4.043035,Adventure
1,False,,65000000.0,,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,False,6.9,2413.0,26,16,Joe Johnston,6.877145,1995,4.043035,Fantasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,False,,0.0,,67758.0,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,False,3.8,6.0,15,5,Mark L. Lester,5.006136,2003,,Action
45539,False,,0.0,,67758.0,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,False,3.8,6.0,15,5,Mark L. Lester,5.006136,2003,,Drama
45539,False,,0.0,,67758.0,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,False,3.8,6.0,15,5,Mark L. Lester,5.006136,2003,,Thriller
45540,False,,0.0,,227506.0,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,...,False,0.0,0.0,5,2,Yakov Protazanov,5.104330,1917,,


In [24]:
data.to_csv("movies_dataset.csv")