In [None]:
# Import libraries

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) # Adjusts the configuration of the plots to be created


df = pd.read_csv('D:\PortfolioProjects\Movie Project\movies.csv')

In [None]:
# Preview the first five rows to get a feel for the columns and their values
df.head(n=5)

In [None]:
# For each column, print how many entries are missing (True) versus present (False)

for column_name in df.columns:
    missing_flags = df[column_name].isnull()
    print(missing_flags.value_counts(), "\n")

In [None]:
# Data types for each column
# (displays the dtype pandas currently holds for every column of df)
df.dtypes

In [None]:
# Discard every row that contains a missing value, then store the budget
# and gross figures as 64-bit integers
df = df.dropna().astype({'budget': 'int64', 'gross': 'int64'})
df.head()

In [None]:

# Build 'yearcorrect' from the first run of four digits found in the 'released' text
df['yearcorrect'] = (
    df['released']
    .str.extract(pat='([0-9]{4})', expand=False)
    .astype(int)
)
df.head()

In [None]:
# Order the movies from the highest gross earnings down to the lowest
df = df.sort_values(by='gross', ascending=False)
df.head()

In [None]:
# Drop any duplicates
# drop_duplicates() returns a new DataFrame and leaves df itself untouched,
# so the result must be assigned back for the de-duplication to take effect.
df = df.drop_duplicates()
df.head()

In [None]:
# Find correlations with respect to Gross revenue
# Hypothesis: high budget -> high gross revenue

In [None]:
# Scatter plot budget vs gross
plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget vs Gross Earnings')
plt.ylabel('Gross Earnings')
plt.xlabel('Film Budget')
# show must be called; the bare name `plt.show` only echoes the function object
plt.show()

In [None]:
# Budget vs gross again, this time drawn with seaborn's regplot
# (orange scatter points, blue line)

sns.regplot(
    data=df,
    x='budget',
    y='gross',
    scatter_kws=dict(color="orange"),
    line_kws=dict(color="blue"),
)

In [None]:
# Looking at correlation
# Types of correlations: pearson, kendall, spearman
# df also holds text columns (e.g. 'released'), and newer pandas versions raise
# instead of silently skipping them, so restrict the frame to numeric columns first.
df.select_dtypes(include='number').corr(method='pearson')

# There looks to be high correlations between (budget vs gross) and (votes vs gross)

In [None]:
# Use correlation matrix to visualize relationships between two variables
# Only numeric columns are correlated: df also contains text columns, which
# newer pandas versions refuse to pass through corr().
correlation_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True)  # annot=True writes each coefficient in its cell
plt.title('Correlation Matrix for Numeric Features')
plt.ylabel('Movie Features')
plt.xlabel('Movie Features')
plt.show()

In [None]:
# View correlations in number pairs
# Restrict to numeric columns first: df also holds text columns (e.g. 'released'),
# which newer pandas versions refuse to correlate.
correlation_mat = df.select_dtypes(include='number').corr()

# unstack() flattens the square matrix into a Series indexed by (column, column) pairs.
# NOTE: the misspelled name `corr_paris` is kept because the sorting cell reads it.
corr_paris = correlation_mat.unstack()

corr_paris

In [None]:
# Arrange the correlation pairs in ascending order of their coefficient
sorted_pairs = corr_paris.sort_values(ascending=True)

sorted_pairs

In [None]:
# Keep only the pairs whose correlation coefficient is above 0.5
high_corr = sorted_pairs.loc[sorted_pairs.gt(0.5)]
high_corr

In [None]:
# Conclusion: Film Budget vs Gross Earnings
# and User Votes vs Gross Earnings have the highest correlation