# Movie Dataset Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_style('whitegrid')

In [2]:
# Read the movie metadata CSV into the working frame, then preview the
# first five rows.
dataset = pd.read_csv(filepath_or_buffer='movie_metadata.csv')
dataset.head(n=5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [3]:
# List every column label in the loaded frame.
dataset.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [4]:
# Print the per-column dtype and non-null count summary.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

In [5]:
# Count missing values in each column (isna is the same check as isnull).
dataset.isna().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [6]:
# (rows, columns) of the frame at this point in the notebook.
dataset.shape

(5043, 28)

In [7]:
# Remove fully duplicated rows. The result is reassigned instead of using
# `inplace=True`, which gives no performance benefit and prevents chaining.
dataset = dataset.drop_duplicates()
dataset.shape

(4998, 28)

In [8]:
# Partition the column labels by dtype: object-typed columns are treated as
# categorical, everything else as numerical. Column order is preserved.
numerical_cols, categorical_cols = [], []
for column_label in dataset.columns:
    if dataset[column_label].dtype == 'object':
        categorical_cols.append(column_label)
    else:
        numerical_cols.append(column_label)

In [12]:
# Summary statistics, transposed so each described column is one row.
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4949.0,139.8901,121.4776,1.0,50.0,110.0,195.0,813.0
duration,4983.0,107.2133,25.24878,7.0,93.0,103.0,118.0,511.0
director_facebook_likes,4895.0,688.6791,2821.65,0.0,7.0,49.0,192.0,23000.0
actor_3_facebook_likes,4975.0,639.9009,1643.298,0.0,133.0,369.0,635.0,23000.0
actor_1_facebook_likes,4991.0,6556.94,15061.59,0.0,611.5,984.0,11000.0,640000.0
gross,4124.0,48325650.0,67964830.0,162.0,5304835.25,25445749.0,62319416.0,760505800.0
num_voted_users,4998.0,83470.2,138086.6,5.0,8560.0,34260.5,96120.75,1689764.0
cast_total_facebook_likes,4998.0,9676.941,18165.4,0.0,1405.5,3085.5,13740.5,656730.0
facenumber_in_poster,4985.0,1.368907,2.014623,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,4977.0,272.0147,377.7762,1.0,64.0,156.0,324.0,5060.0


In [13]:
# Impute missing `color` values with the most frequent label.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on `dataset.color` is chained assignment, which
# warns in pandas 2.x and leaves `dataset` unchanged under Copy-on-Write.
color_mode = dataset['color'].mode().iloc[0]
dataset['color'] = dataset['color'].fillna(color_mode)
# Sanity check: number of nulls left in the column.
dataset['color'].isnull().sum()

0

In [14]:
# Keep only the rows that have a director name (row-wise drop is the default).
dataset = dataset.dropna(subset=['director_name'])

In [15]:
# Show (min, max, median) of the critic review counts before imputing.
critic_reviews = dataset['num_critic_for_reviews']
critic_reviews.min(), critic_reviews.max(), critic_reviews.median()

(1.0, 813.0, 112.0)

In [16]:
# Impute missing critic review counts with the column median.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
num_critic_for_reviews_median = dataset['num_critic_for_reviews'].median()
dataset['num_critic_for_reviews'] = dataset['num_critic_for_reviews'].fillna(
    num_critic_for_reviews_median
)
# Sanity check: number of nulls left in the column.
dataset['num_critic_for_reviews'].isnull().sum()

0

In [17]:
# Impute missing durations with the column median.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
duration_median = dataset['duration'].median()
dataset['duration'] = dataset['duration'].fillna(duration_median)
# Sanity check: number of nulls left in the column.
dataset['duration'].isnull().sum()

0

In [18]:
# Impute missing director like-counts with the column mean.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
# NOTE(review): describe() reported mean ~689 vs median 49 for this column, so
# the mean is pulled up by a long right tail -- confirm mean is the intended fill.
director_facebook_likes_mean = dataset['director_facebook_likes'].mean()
dataset['director_facebook_likes'] = dataset['director_facebook_likes'].fillna(
    director_facebook_likes_mean
)
# Sanity check: number of nulls left in the column.
dataset['director_facebook_likes'].isnull().sum()

0

In [20]:
# Smallest like-count recorded for the third-billed actor.
dataset['actor_3_facebook_likes'].min()

0.0

In [21]:
# Largest like-count recorded for the third-billed actor.
dataset['actor_3_facebook_likes'].max()

23000.0

In [22]:
# Median like-count for the third-billed actor.
dataset['actor_3_facebook_likes'].median()

372.0

In [23]:
# Mean like-count for the third-billed actor (the value used to impute below).
dataset['actor_3_facebook_likes'].mean()

646.1009230769231

In [24]:
# Impute missing third-actor like-counts with the column mean.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
actor_3_facebook_likes_mean = dataset['actor_3_facebook_likes'].mean()
dataset['actor_3_facebook_likes'] = dataset['actor_3_facebook_likes'].fillna(
    actor_3_facebook_likes_mean
)
# Sanity check: number of nulls left in the column.
dataset['actor_3_facebook_likes'].isnull().sum()

0

In [25]:
# Discard rows with no second-actor name, then confirm none remain.
dataset = dataset.dropna(subset=['actor_2_name'])
dataset['actor_2_name'].isna().sum()

0

In [26]:
# Impute missing lead-actor like-counts with the column mean.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
actor_1_facebook_likes_mean = dataset['actor_1_facebook_likes'].mean()
dataset['actor_1_facebook_likes'] = dataset['actor_1_facebook_likes'].fillna(
    actor_1_facebook_likes_mean
)
# Sanity check: number of nulls left in the column.
dataset['actor_1_facebook_likes'].isnull().sum()

0

In [27]:
# Discard rows with no gross figure, then confirm none remain.
dataset = dataset.dropna(subset=['gross'])
dataset['gross'].isna().sum()

0

In [28]:
# Report the remaining (rows, columns). A bare `dataset.shape` that is not the
# cell's last expression is evaluated and thrown away, so print it explicitly.
print(dataset.shape)
# Per-column count of values that are still missing.
dataset.isnull().sum()

color                          0
director_name                  0
num_critic_for_reviews         0
duration                       0
director_facebook_likes        0
actor_3_facebook_likes         0
actor_2_name                   0
actor_1_facebook_likes         0
gross                          0
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   7
facenumber_in_poster           7
plot_keywords                 39
movie_imdb_link                0
num_user_for_reviews           1
language                       3
country                        0
content_rating                60
budget                       263
title_year                     0
actor_2_facebook_likes         0
imdb_score                     0
aspect_ratio                 102
movie_facebook_likes           0
dtype: int64

In [29]:
# Discard rows with no budget figure, then confirm none remain.
dataset = dataset.dropna(subset=['budget'])
dataset['budget'].isna().sum()

0

In [30]:
# Discard rows with no third-actor name, then confirm none remain.
dataset = dataset.dropna(subset=['actor_3_name'])
dataset['actor_3_name'].isna().sum()

0

In [31]:
# Impute missing poster face counts with the column median.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
facenumber_in_poster_median = dataset['facenumber_in_poster'].median()
dataset['facenumber_in_poster'] = dataset['facenumber_in_poster'].fillna(
    facenumber_in_poster_median
)
# Sanity check: number of nulls left in the column.
dataset['facenumber_in_poster'].isnull().sum()

0

In [32]:
# Impute missing languages with the most frequent language.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
language_mode = dataset['language'].mode().iloc[0]
dataset['language'] = dataset['language'].fillna(language_mode)
# Sanity check: number of nulls left in the column.
dataset['language'].isnull().sum()

0

In [33]:
# Discard rows with no plot keywords, then confirm none remain.
dataset = dataset.dropna(subset=['plot_keywords'])
dataset['plot_keywords'].isna().sum()

0

In [34]:
# Distinct content-rating labels, including NaN for unrated rows.
dataset['content_rating'].unique()

array(['PG-13', 'PG', 'G', 'R', 'Approved', 'NC-17', nan, 'X',
       'Not Rated', 'Unrated', 'M', 'GP', 'Passed'], dtype=object)

In [35]:
# Label films with no rating as 'Not Rated' (a category already in the data).
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
dataset['content_rating'] = dataset['content_rating'].fillna('Not Rated')

In [36]:
# Distinct aspect-ratio values, including NaN for missing entries.
dataset['aspect_ratio'].unique()

array([ 1.78,  2.35,  1.85,  2.  ,  2.2 ,  2.39,  2.24,  1.66,  1.5 ,
        1.77,  2.4 ,  1.37,   nan,  2.76,  1.33,  1.18,  2.55,  1.75,
       16.  ])

In [37]:
# Impute missing aspect ratios with the most frequent ratio.
# Assign the result back rather than using `fillna(..., inplace=True)` on the
# attribute-accessed column, which is chained assignment and does not update
# `dataset` under pandas Copy-on-Write.
# NOTE(review): the unique values include 16.0, far outside the 1.18-2.76 range
# of the rest -- looks like a data-entry outlier; confirm before modelling.
aspect_ratio_mode = dataset['aspect_ratio'].mode().iloc[0]
dataset['aspect_ratio'] = dataset['aspect_ratio'].fillna(aspect_ratio_mode)

In [38]:
# Final check: per-column count of missing values after cleaning.
dataset.isna().sum()

color                        0
director_name                0
num_critic_for_reviews       0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
gross                        0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
movie_imdb_link              0
num_user_for_reviews         0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
imdb_score                   0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [39]:
# Distinct colour labels and how many there are.
color_column = dataset['color']
color_column.unique(), color_column.nunique()

(array(['Color', ' Black and White'], dtype=object), 2)

In [40]:
# Binary-encode the colour flag: 1 for colour films, 0 for black and white.
# The keys must match the raw labels exactly, including the leading space in
# ' Black and White'; any value not listed here maps to NaN.
color_codes = {'Color': 1, ' Black and White': 0}
dataset['color'] = dataset['color'].map(color_codes)

In [41]:
# Distinct director names and how many there are.
director_column = dataset['director_name']
director_column.unique(), director_column.nunique()

(array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
        'Kiyoshi Kurosawa', 'Shane Carruth', 'Neill Dela Llana'],
       dtype=object),
 1723)

In [42]:
# Count how many films each director has, as a two-column frame
# (`director_name`, `director_name_value_counts`) ready to be merged back.
# `rename_axis` + `reset_index(name=...)` name both columns explicitly instead
# of renaming whatever `reset_index()` happens to emit: pandas >= 2.0 names the
# value_counts result 'count' and its index after the column, so the previous
# {'index': ..., 'director_name': ...} rename left no `director_name` key
# column and the subsequent merge failed.
director_name_value_counts = (
    dataset['director_name']
    .value_counts()
    .rename_axis('director_name')
    .reset_index(name='director_name_value_counts')
)

In [43]:
# Left-join the per-director film counts onto the film rows, matching on the
# shared `director_name` key.
dataset = dataset.merge(
    director_name_value_counts,
    on='director_name',
    how='left',
)

In [44]:
# Remove the raw director name column; the frequency column added by the
# preceding merge stays in the frame.
dataset = dataset.drop('director_name', axis='columns')

In [45]:
# Count how many films each second-billed actor appears in, as a two-column
# frame (`actor_2_name`, `actor_2_name_value_counts`) ready to be merged back.
# `rename_axis` + `reset_index(name=...)` name both columns explicitly instead
# of renaming whatever `reset_index()` happens to emit: pandas >= 2.0 names the
# value_counts result 'count' and its index after the column, so the previous
# {'index': ..., 'actor_2_name': ...} rename left no `actor_2_name` key column
# and the subsequent merge failed.
actor_2_name_value_counts = (
    dataset['actor_2_name']
    .value_counts()
    .rename_axis('actor_2_name')
    .reset_index(name='actor_2_name_value_counts')
)

In [46]:
# Left-join the per-actor film counts onto the film rows, matching on the
# shared `actor_2_name` key.
dataset = dataset.merge(
    actor_2_name_value_counts,
    on='actor_2_name',
    how='left',
)

In [47]:
# Remove the raw second-actor name column; the frequency column added by the
# preceding merge stays in the frame.
dataset = dataset.drop('actor_2_name', axis='columns')

In [48]:
# Distinct pipe-separated genre strings and how many there are.
genres_column = dataset['genres']
genres_column.unique(), genres_column.nunique()

(array(['Action|Adventure|Fantasy|Sci-Fi', 'Action|Adventure|Fantasy',
        'Action|Adventure|Thriller', 'Action|Thriller',
        'Action|Adventure|Sci-Fi', 'Action|Adventure|Romance',
        'Adventure|Animation|Comedy|Family|Fantasy|Musical|Romance',
        'Adventure|Family|Fantasy|Mystery', 'Action|Adventure',
        'Action|Adventure|Western', 'Action|Adventure|Family|Fantasy',
        'Action|Adventure|Comedy|Family|Fantasy|Sci-Fi',
        'Adventure|Fantasy', 'Action|Adventure|Drama|History',
        'Adventure|Family|Fantasy', 'Action|Adventure|Drama|Romance',
        'Drama|Romance', 'Action|Adventure|Sci-Fi|Thriller',
        'Action|Adventure|Fantasy|Romance',
        'Action|Adventure|Fantasy|Sci-Fi|Thriller',
        'Adventure|Animation|Comedy|Family|Fantasy',
        'Adventure|Animation|Comedy|Family|Sport', 'Action|Crime|Thriller',
        'Action|Adventure|Horror|Sci-Fi|Thriller',
        'Adventure|Animation|Family|Sci-Fi',
        'Action|Comedy|Crime|Thril

In [51]:
# Preview the first rows of the frame after the encoding steps above.
dataset.head()

Unnamed: 0,color,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,director_name_value_counts,actor_2_name_value_counts
0,1,723.0,178.0,0.0,855.0,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,7,3
1,1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,7,7
2,1,602.0,148.0,0.0,161.0,11000.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,8,2
3,1,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,8,5
4,1,462.0,132.0,475.0,530.0,640.0,73058679.0,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,...,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000,3,3
