In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import the library we are going to use
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# read the data
df = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

In [None]:
# look at the first five rows to understand the data
df.head()

**Let's start cleaning the data.**

In [None]:
# does the data have the correct data types?
df.dtypes

In [None]:
# we only need to change the data type of one column: release_date.
# `infer_datetime_format` is intentionally not passed: the argument is deprecated
# since pandas 2.0 (format inference is the default there) and only triggers a warning.
df['release_date'] = pd.to_datetime(df['release_date'])

In [None]:
# let's see if it works
df.dtypes

In [None]:
# could there be some unreleased movies?
df['status'].value_counts()

# We will drop some unnecessary columns:
* 'original_title' - there's already a 'title' column
* 'status' - not needed since almost all movies are released; only 5 are rumored and 3 are in post-production
* 'id' - not needed
* 'spoken_languages' - same as the original language
* 'homepage' and 'vote_average'

In [None]:
# Remove the columns that are not needed for this analysis
cols_to_drop = ['original_title', 'homepage', 'status', 'vote_average', 'id', 'spoken_languages']
df = df.drop(columns=cols_to_drop)

In [None]:
# Report, for every column, the rounded percentage and the absolute count of missing values
total_rows = df.shape[0]
for column in df.columns:
    null_count = df[column].isnull().sum()
    percent_missing = round((null_count/total_rows) * 100)
    print(f'{column} — {percent_missing}% MISSING and total missing is: {null_count}')

### `tagline` has the most missing values. Let's fill them with "NO TAGLINE".

In [None]:
# Assign the filled column back to the frame instead of calling
# `fillna(..., inplace=True)` on `df.tagline`: that attribute access yields an
# intermediate Series, and under pandas copy-on-write (FutureWarning in 2.x,
# default in 3.0) an in-place fill on it is not written back to `df`.
df['tagline'] = df['tagline'].fillna('NO TAGLINE')

### Let's drop the remaining rows that have missing values — 6 rows in total.

In [None]:
# Discard every row that still contains at least one missing value
df = df.dropna(axis=0)

In [None]:
# let's take another look at the dataset
df.head(3)

##  genres
### Each value in this column is a string holding a list of dictionaries. The value of the key 'name' is what we want.

In [None]:
df['genres'][1]

In [None]:
def extract(column):
    """Return the ``name`` values stored in each row of a JSON-encoded column.

    Every row is decoded with ``json.loads`` instead of being scrubbed with a
    regular expression, so names come back exactly as stored: digits and
    punctuation are kept, and letters such as the "id" in "spider" or the
    "name" in "tournament" are no longer removed.

    NOTE: assumes each row is a valid JSON list of objects, e.g.
    ``'[{"id": 28, "name": "Action"}]'`` -- confirm against the CSV if the
    data source changes.

    Parameters
    ----------
    column : iterable of str
        One JSON-encoded list of objects per row.

    Returns
    -------
    list of list of str
        For each row, the ``name`` of every object in its original order.
        An empty JSON list (``'[]'``) yields an empty list.

    Raises
    ------
    json.JSONDecodeError
        If a row is not valid JSON.
    KeyError
        If an object in a row has no ``name`` key.
    """
    return [[entry['name'] for entry in json.loads(row)] for row in column]

The function above processes the column one row at a time: it discards everything that is not part of a name (the JSON punctuation, the numeric ids, and the `id`/`name` keys) and stores the names that remain in a list, one list per row.

In [None]:
# update the column
df['genres'] = extract(df['genres'])

In [None]:
df['genres'][0]

Now we need a function to strip the surrounding whitespace and drop the empty strings.

In [None]:
def strip(col):
    """Trim spaces from every string of every row and drop the empty results.

    Parameters
    ----------
    col : iterable of list of str
        One list of strings per row.

    Returns
    -------
    list of list of str
        One list per input row, in the same order, containing the
        space-trimmed strings that are non-empty after trimming.
    """
    column_list = []
    for row in col:
        # Trim first, filter second: filtering before trimming would let
        # whitespace-only items such as ' ' through as '' entries.
        trimmed = (item.strip(' ') for item in row)
        column_list.append([item for item in trimmed if item])
    return column_list

In [None]:
df['genres'] = strip(df['genres'])

In [None]:
df['genres'][0]

In [None]:
# let's look at keywords column
df['keywords'][0]

### Too dense and messy. Let's apply the previous function here too.

In [None]:
df['keywords'] = extract(df['keywords'])

In [None]:
# strip the white spaces and empty string
df['keywords'] = strip(df['keywords'])

In [None]:
df['keywords'][1]

In [None]:
# let's take a closer look at this column
df['production_companies'][1]

In [None]:
def extract_values(column):
    """Return the ``name`` values stored in each row of a JSON-encoded column.

    Rows are decoded with ``json.loads`` rather than cleaned with a regular
    expression, so company names keep their digits and punctuation
    (e.g. "1492 Pictures") and no letters are removed from inside a name.

    NOTE: assumes each row is a valid JSON list of objects that each carry a
    ``name`` key -- confirm against the CSV if the data source changes.

    Parameters
    ----------
    column : iterable of str
        One JSON-encoded list of objects per row.

    Returns
    -------
    list of list of str
        For each row, the ``name`` of every object in its original order.
        An empty JSON list (``'[]'``) yields an empty list.

    Raises
    ------
    json.JSONDecodeError
        If a row is not valid JSON.
    KeyError
        If an object in a row has no ``name`` key.
    """
    rows_list = []
    for row in column:
        rows_list.append([entry['name'] for entry in json.loads(row)])
    return rows_list

In [None]:
# let's apply the function above here
df['production_companies'] = extract_values(df['production_companies'])

In [None]:
# see what the function returns
df['production_companies'][1]

In [None]:
# strip the spaces here too
df['production_companies'] = strip(df['production_companies'])

In [None]:
# see what the function does
df['production_companies'][1]

In [None]:
# now let's look at production countries
df['production_countries'][2]

In [None]:
# The objects in this column have an 'iso...' key in addition to 'name',
# so this variant keeps every value of every object instead of only the name.

# Kept only so that any later cell still reading the global `pattern` sees the
# same value as before; extract_prod_countries itself does not use it.
pattern = r"[^a-zA-Z]+"
def extract_prod_countries(column):
    """Return all values stored in each row of a JSON-encoded column, flattened.

    Rows are decoded with ``json.loads`` rather than cleaned with a regular
    expression, so values come back exactly as stored (apostrophes, accents
    and names containing "iso" or "name", such as "Suriname", stay intact).

    NOTE: assumes each row is a valid JSON list of objects whose values are
    strings (presumably a country code followed by a country name) --
    confirm against the CSV if the data source changes.

    Parameters
    ----------
    column : iterable of str
        One JSON-encoded list of objects per row.

    Returns
    -------
    list of list of str
        For each row, a flat list with the values of every object, in the
        order they appear in the row. An empty JSON list yields an empty list.

    Raises
    ------
    json.JSONDecodeError
        If a row is not valid JSON.
    """
    return [
        [value for entry in json.loads(row) for value in entry.values()]
        for row in column
    ]

In [None]:
# update the column
df['production_countries'] = extract_prod_countries(df['production_countries'])

In [None]:
df['production_countries'][2]

In [None]:
df['production_countries'] = strip(df['production_countries'])

In [None]:
df['production_countries'][2]

Now let's look at the dataframe to see what we've done so far

In [None]:
df.head(4)

In [None]:
# Profit of each movie: revenue minus budget
df['Profit'] = df['revenue'].sub(df['budget'])

In [None]:
# any missing values? 
df.isna().sum()

Now it's time to start doing EDA!

In [None]:
# Split the frame by dtype: numeric columns on one side, every other column
# (strings, lists and the datetime release_date) on the other
num_cols = df.select_dtypes(include='number')
cat_cols = df.select_dtypes(exclude='number')

In [None]:
# All the categorical columns
cat_cols.head(2)

In [None]:
# All the numerical (quantitative) columns
num_cols.head(2)

### uni-variate analysis

In [None]:
df.columns

In [None]:
# what is the highest budget for a movie in the dataset? (five largest budgets, highest first)
df['budget'].sort_values(ascending=False).head()

### The highest budget for a single movie is $380M. let's see the movies

In [None]:
# Show the full rows of the five movies with the largest budgets
top_budget_index = df['budget'].sort_values(ascending=False).head().index
df.loc[top_budget_index]

### Do any of these generate the highest revenue in the whole dataset? 