In [None]:
# Importing all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.pyplot import figure
# Default figure size for every plot in the notebook, then the ggplot theme.
plt.rcParams.update({'figure.figsize': (12, 8)})
plt.style.use('ggplot')

In [None]:
# Load the movies dataset from a CSV file resolved against the current
# working directory.
movies = pd.read_csv(filepath_or_buffer='movies.csv')

In [None]:
# Preview the first five rows of the loaded frame.
movies.head(n=5)

In [None]:
# Missing Value Treatment
# Print the share of missing values in every column as a percentage.
# isnull().mean() yields a fraction between 0 and 1, so it is scaled by 100
# to agree with the '%' sign that is printed next to it.
for col in movies.columns:
    missing_pct = movies[col].isnull().mean() * 100
    print('{} - {}%'.format(col, missing_pct))

In [None]:
# So as we can see we do not really have any missing values in this dataset

In [None]:
# Show the dtype pandas assigned to each column of `movies`.
movies.dtypes

In [None]:
# Store 'budget' as whole int64 amounts: missing entries become 0 and the
# remaining values are rounded before the cast.
movies['budget'] = (
    movies['budget']
    .fillna(0)
    .round()
    .astype('int64')
)

In [None]:
# Preview the first five rows after the 'budget' conversion.
movies.head(n=5)

In [None]:
# Store 'gross' as whole int64 amounts: missing entries become 0 and the
# remaining values are rounded before the cast.
movies['gross'] = (
    movies['gross']
    .fillna(0)
    .round()
    .astype('int64')
)

In [None]:
# Preview the first five rows after the 'gross' conversion.
movies.head(n=5)

In [None]:
# Same five-row preview of `movies` once more.
movies.head(n=5)

In [None]:
# Display the films ordered from highest to lowest gross revenue.
# sort_values returns a sorted copy here; `movies` itself keeps its order.
movies.sort_values(by='gross', ascending=False)

In [None]:
# Lift the row limit so pandas renders every row of a displayed object.
pd.options.display.max_rows = None

In [None]:
# Dropping any duplicates
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the duplicate rows to really be
# removed from `movies`.
movies = movies.drop_duplicates()
# Preview the de-duplicated frame.
movies.head()

In [None]:
# List the 'company' column in descending order.
movies.loc[:, 'company'].sort_values(ascending=False)

In [None]:
# Assumptions for high revenue
# The larger the budget spent on a movie, the more revenue it should generate
# The more renowned the company, the more revenue its films should generate

In [None]:
# Scatter plot to check the correlation between Budget and Gross
# Budget is drawn on the x-axis and gross on the y-axis, so each axis label
# has to name the series that is actually plotted on it.
plt.scatter(x=movies['budget'], y=movies['gross'])
plt.title('Budget vs Gross Correlation')
plt.xlabel('Budget for the film')
plt.ylabel('Gross Earnings')
plt.show()

In [None]:
# We could reconfirm the same using a regplot
# Budget is drawn on the x-axis and gross on the y-axis, so each axis label
# has to name the series that is actually plotted on it.
sns.regplot(
    x='budget',
    y='gross',
    data=movies,
    scatter_kws={'color': 'red'},
    line_kws={'color': 'black'},
)
plt.title('Budget vs Gross Correlation')
plt.xlabel('Budget for the film')
plt.ylabel('Gross Earnings')
plt.show()

In [None]:
# Pearson correlation between the numeric columns
# Only numeric columns are passed to corr(): any non-numeric column cannot be
# correlated, and pandas >= 2.0 raises on such columns instead of silently
# skipping them as earlier versions did.
movies.select_dtypes(include='number').corr(method='pearson')
# Types of correlation - pearson, kendall, spearman

In [None]:
# So it is clearly evident that there is a high correlation between Budget and Gross

In [None]:
# Heatmap of the Pearson correlations between the numeric columns.
# Only numeric columns are passed to corr(): any non-numeric column cannot be
# correlated, and pandas >= 2.0 raises on such columns instead of silently
# skipping them as earlier versions did.
corrmat = movies.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(corrmat, annot=True)
plt.title('Correlation Matrix of Numerical Features')
plt.xlabel('Movie dataset features')
plt.ylabel('Movie dataset features')
plt.show()

In [None]:
# Now let's convert the 'categorical' features to numeric values
# Work on an independent copy: a plain `movies_numeric = movies` only creates
# a second name for the same frame, so the loop below would overwrite the
# original text columns of `movies` as well.
movies_numeric = movies.copy()

for col_name in movies_numeric.columns:
    if movies_numeric[col_name].dtype == 'object':
        # Replace each distinct value with its integer category code
        # (pandas assigns the code -1 to missing values).
        movies_numeric[col_name] = movies_numeric[col_name].astype('category').cat.codes
movies_numeric

In [None]:
# Display the `movies` frame to inspect its contents after the encoding step.
movies

In [None]:
# Heatmap of the Pearson correlations across every column of the encoded frame.
corrmat_categoric = movies_numeric.corr(method='pearson')
heatmap_ax = sns.heatmap(corrmat_categoric, annot=True)
heatmap_ax.set_title('Correlation Matrix of Categorical Features')
heatmap_ax.set_xlabel('Movie dataset features')
heatmap_ax.set_ylabel('Movie dataset features')
plt.show()

In [None]:
# Flatten the correlation matrix into a Series with one entry per
# (column, column) pair so the pairs can be ranked.
correlation_mat = movies_numeric.corr(method='pearson')
corr_pairs = correlation_mat.unstack(level=-1)
corr_pairs

In [None]:
# Rank the pairs from the lowest to the highest correlation.
sorted_pairs = corr_pairs.sort_values(ascending=True)
sorted_pairs

In [None]:
# Keep only the pairs whose correlation is above 0.5.
high_corr = sorted_pairs[sorted_pairs > 0.5]
# Every column correlates perfectly (1.0) with itself; drop those trivial
# self-pairs so that only relationships between different columns remain.
high_corr = high_corr[[first != second for first, second in high_corr.index]]
high_corr

In [None]:
# So we can see Budget and Gross have high correlations as we stated earlier