**All imports**

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from IPython.display import Image, display
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter 
import csv
import random

**Functions are defined here to improve readability**

In [75]:
def remove_outliers(dataframe, columnName, factor=1.5):
    """Return the rows of ``dataframe`` whose ``columnName`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter; it is not modified.
    columnName : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the interquartile range.

    Returns
    -------
    pandas.DataFrame
        The rows whose value is within the fences. Rows with a missing value
        in ``columnName`` are never kept, because NaN fails both comparisons.
    """
    values = dataframe[columnName]

    # nanquantile skips missing values. np.quantile would return NaN as soon as
    # one value is missing, which turns both fences into NaN and drops every row.
    q1 = np.nanquantile(values, 0.25)
    q3 = np.nanquantile(values, 0.75)
    iqr = q3 - q1

    lower_bound = q1 - (factor * iqr)
    upper_bound = q3 + (factor * iqr)

    return dataframe[(values >= lower_bound) & (values <= upper_bound)]

def boxplot_gross(dataframe, columnName):
    """Show a boxplot of ``dataframe[columnName]`` with dollar-formatted y-axis ticks.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to plot.
    columnName : str
        Column to plot; it is also used in the title and as the x-axis label.
    """
    def dollar_formatter(x, _):
        # Always return a string. The previous formatter returned None for
        # values below one million, so those ticks got no usable label.
        sign = "-" if x < 0 else ""
        magnitude = abs(x)
        if magnitude >= 1000000:
            return f"{sign}${magnitude / 1000000:.0f}M"
        if magnitude >= 1000:
            return f"{sign}${magnitude / 1000:.0f}K"
        return f"{sign}${magnitude:.0f}"

    fig1, ax1 = plt.subplots()
    ax1.boxplot(dataframe[columnName])
    # Format the axes created above directly instead of looking them up via plt.gca().
    ax1.yaxis.set_major_formatter(FuncFormatter(dollar_formatter))
    ax1.set_title(f"Boxplot from {columnName}")
    ax1.set_xlabel(columnName)
    ax1.set_ylabel('Gross')
    plt.show()

def histogram_gross(dataframe, columnName):
    """Show a 100-bin histogram of ``dataframe[columnName]`` with dollar-formatted x-axis ticks.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to plot.
    columnName : str
        Column to plot; it is also used in the title.
    """
    def dollar_formatter(x, _):
        # Always return a string. The previous formatter returned None for
        # values below one million, so those ticks got no usable label.
        sign = "-" if x < 0 else ""
        magnitude = abs(x)
        if magnitude >= 1000000:
            return f"{sign}${magnitude / 1000000:.0f}M"
        if magnitude >= 1000:
            return f"{sign}${magnitude / 1000:.0f}K"
        return f"{sign}${magnitude:.0f}"

    plt.hist(dataframe[columnName], bins=100, color='skyblue', edgecolor='black')
    plt.gca().xaxis.set_major_formatter(FuncFormatter(dollar_formatter))
    plt.title(f"The frequency of {columnName}")
    plt.xlabel('Gross')
    plt.ylabel('Frequency')
    plt.show()

def boxplot_likes(dataframe, columnName):
    """Show a boxplot of the like counts stored in ``dataframe[columnName]``.

    ``columnName`` is also used in the title and as the x-axis label.
    """
    figure, axes = plt.subplots()
    axes.boxplot(dataframe[columnName])
    # Set all text properties in a single call.
    axes.set(
        title=f"Boxplot from {columnName}",
        xlabel=columnName,
        ylabel='Amount of likes',
    )
    plt.show()

def histogram_likes(dataframe, columnName):
    """Show a 100-bin histogram of the like counts in ``dataframe[columnName]``."""
    likes = dataframe[columnName]
    plt.hist(likes, bins=100, color='skyblue', edgecolor='black')

    # Label the axes that plt.hist just drew on.
    axes = plt.gca()
    axes.set_title(f"Frequency of {columnName}")
    axes.set_xlabel('Likes')
    axes.set_ylabel('Frequency')
    plt.show()

def correlation_plot(df, targetVariable, featureVariable):
    """Scatter ``targetVariable`` (x-axis) against ``featureVariable`` (y-axis).

    The title shows the correlation coefficient of the two columns as computed
    by ``DataFrame.corr``. The x-axis ticks are formatted as dollar amounts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    targetVariable : str
        Column drawn on the x-axis.
    featureVariable : str
        Column drawn on the y-axis.
    """
    def dollar_formatter(x, _):
        # Always return a string. The previous formatter returned None for
        # values below one million, so those ticks got no usable label.
        sign = "-" if x < 0 else ""
        magnitude = abs(x)
        if magnitude >= 1000000:
            return f"{sign}${magnitude / 1000000:.0f}M"
        if magnitude >= 1000:
            return f"{sign}${magnitude / 1000:.0f}K"
        return f"{sign}${magnitude:.0f}"

    fig1, ax1 = plt.subplots()
    ax1.scatter(df[targetVariable], df[featureVariable], s=3)
    # Format the axes created above directly instead of looking them up via plt.gca().
    ax1.xaxis.set_major_formatter(FuncFormatter(dollar_formatter))

    ax1.set_xlabel(targetVariable)
    ax1.set_ylabel(featureVariable)

    # Off-diagonal entry of the 2x2 correlation matrix of the two columns.
    correlation = df[[targetVariable, featureVariable]].corr().iloc[0, 1]

    ax1.set_title(f"{targetVariable} vs {featureVariable} (r = {correlation:.2f})")
    # Render the figure, as every other plot helper in this notebook does.
    plt.show()

**Load the dataset**

In [76]:
# Read the raw movie data from disk.
dfr = pd.read_csv('data/movie-1.csv')

# Show the maximum info the dataframe can give: no cap on displayed columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Format all big numbers for better readability (thousands separator, two decimals).
pd.options.display.float_format = '{:,.2f}'.format

**Copy of the dataset**

Making a copy of the data to work with so we will not alter the actual dataset.

In [77]:
# Work on an independent (deep) copy so the loaded frame `dfr` stays unchanged.
df = dfr.copy(deep=True)

**Taking a look at the data we're given.**

In [None]:
# Preview the first five rows.
df.head(n=5)

**Get the column names to see what kind of variables we're working with**

In [None]:
# Print every column name together with its dtype.
for col, dtype in df.dtypes.items():
    print(f"column: {col}, dtype: {dtype}")

**Measurement levels**

In [None]:
# Render the local image file "meet.png" in the notebook output.
measurement_levels_image = Image(filename="meet.png")
display(measurement_levels_image)

**Target and Feature variables**

In [81]:
# Every person linked to a movie contributes a name column and a Facebook-likes column.
featureVariables = [
    f'{person}_{suffix}'
    for person in ('director', 'actor_1', 'actor_2', 'actor_3')
    for suffix in ('name', 'facebook_likes')
]
# Movie-level features.
featureVariables += ['cast_total_facebook_likes',
                     'movie_facebook_likes',
                     'imdb_score']

targetVariable = ['gross']

# All numeric columns: the features that are not names, followed by the target.
numVariable = [name for name in featureVariables if not name.endswith('_name')] + targetVariable

**Make another DF with only the columns we're interested in.**

In [None]:
# Keep only the feature and target columns; print the shape before and after to check the impact.
selected_columns = [*featureVariables, *targetVariable]
print(df.shape)
df = df.loc[:, selected_columns]
print(df.shape)

In [None]:
# Show a single row to confirm which columns are left.
df.head(n=1)

**Check NaN values**

For now, we drop every row that has a NaN value in any of the feature or target variables. Later on we will do more research on how to properly handle these.

In [None]:
# Number of missing values per column.
df.isnull().sum()

We will drop all NaN values. We cannot replace them because the vast majority of them occur in our target variable.

In [None]:
# Drop every row that has at least one missing value; print the shape before and after to check the impact.
print(df.shape)
df = df.dropna(axis='index', how='any')
print(df.shape)

**Now we will analyse all columns to check for anomalies.**

All the statistics look clean. 

All columns containing 'names' have NaN and number where this is expected. 

All columns containing 'likes' have NaN and a number where this is expected. There are also no weird min or max values.

The column 'imdb_score' has values only between 0 and 10 which is expected.

And lastly the column 'gross' has a natural order of magnitude from min to max.

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
summary_statistics = df.describe(include="all")
summary_statistics

We will not delete any of the outliers from our target variable 'gross'. However, for our feature variables there are some weird outliers and a lot of directors and actors getting 0 likes on facebook. We will need to dive deeper and see if we can fill those 0 values with something else.

In [None]:
# Outlier removal is not applied in this cell; the distributions are plotted as they are.
# remove_outliers(df, <column>) can be called here if that decision changes.

# Distribution of the target variable.
boxplot_gross(df, 'gross')
histogram_gross(df, 'gross')

# Distribution of every feature whose name contains 'like'.
like_features = [feature for feature in featureVariables if 'like' in feature]
for feature in like_features:
    boxplot_likes(df, feature)
    histogram_likes(df, feature)

**Calculate the correlation between the numeric variables and the target variable, so we can see which features are (somewhat) important to the target variable**

In [None]:
# Correlation matrix of the numeric columns.
correlation_matrix = df[numVariable].corr()

# Keep the 'gross' column of the matrix, strongest positive correlation first.
correlation = correlation_matrix['gross'].sort_values(ascending=False)

print(correlation)

In [None]:
# One scatter plot of 'gross' against each numeric feature.
for feature_name in (
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
    'imdb_score',
):
    correlation_plot(df, 'gross', feature_name)

**Scatterplots**

Here we will plot some scatterplots to get better insight into our feature variables.

In [90]:
def scatter(variable, x_label, data=None):
    """Show a scatter plot of ``variable`` (x-axis) against ``imdb_score`` (y-axis).

    Parameters
    ----------
    variable : str
        Column drawn on the x-axis.
    x_label : str
        Human-readable name of ``variable``, used in the title and x-axis label.
    data : pandas.DataFrame, optional
        Frame to plot from. When omitted, the notebook-global ``df`` is used, as
        before. Passing the frame explicitly keeps the plot independent of
        whatever ``df`` is bound to when the function is called.
    """
    frame = df if data is None else data
    plt.scatter(frame[variable], frame['imdb_score'])
    plt.title(f'IMDb Score vs. {x_label}')
    plt.xlabel(x_label)
    plt.ylabel('IMDb Score')
    plt.show()

In [None]:
# (column, axis label) pairs, plotted against the IMDb score in this order.
scatter_inputs = [
    ('movie_facebook_likes', 'Movie Facebook Likes'),
    ('gross', 'Gross Revenue'),
    ('cast_total_facebook_likes', 'Total Cast Facebook Likes'),
    ('actor_1_facebook_likes', 'Actor 1 Facebook Likes'),
    ('actor_2_facebook_likes', 'Actor 2 Facebook Likes'),
    ('actor_3_facebook_likes', 'Actor 3 Facebook Likes'),
]
for column_name, axis_label in scatter_inputs:
    scatter(column_name, axis_label)

**Assign dummy values to all the names. This is needed to calculate our model.**

In [None]:
#Select numeric feature variables
X_numeric = df[['director_facebook_likes', 
                'actor_1_facebook_likes', 
                'actor_2_facebook_likes', 
                'actor_3_facebook_likes', 
                'cast_total_facebook_likes', 'movie_facebook_likes', 'imdb_score',
                'gross' # Comment this one out when applying a model
                ]]

#We assign all other column values to dummy values 
X_dummies = pd.get_dummies(df[['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']], dtype = int)

#Assign the 2 dataframes to 1 dataframe again
X = pd.concat([X_numeric, X_dummies], axis = 1)
y = df[['gross']]

df = X

df.head(1)

In [None]:
# Check the impact of the dummy encoding on the frame's size.
row_count, column_count = X.shape
print((row_count, column_count))

**We will now look for all the likes statistics per director and actor**

As you can see, there are a lot of directors and actors with major movies (high gross) that get zero likes on Facebook; this is especially true for directors. We will need to fill those in. This will be done by assigning the mean likes value of similar movies, grouped by their 'gross'. A visual representation with the actual implementation will follow next week.

When this is done, we think our correlation plots will show a straighter line, which indicates a better correlation and will most likely improve our model.

In [None]:
def _director_summary(rows, value_column, prefix, label):
    """Describe one column of `rows` as a single prefixed row tagged with `label`."""
    summary = rows[[value_column]].describe(include='all').transpose()
    summary = summary.add_prefix(prefix)
    summary['director'] = label
    return summary


likeList = []
grossList = []

# Every column whose name contains 'director_name' is handled as one director's 0/1 dummy column.
director_columns = [name for name in df.columns if 'director_name' in name]

for column in director_columns:
    # Rows (movies) flagged with this director.
    rowsWithDirector = df.loc[df[column] == 1]

    likeList.append(_director_summary(rowsWithDirector, 'director_facebook_likes', 'likes_', column))
    grossList.append(_director_summary(rowsWithDirector, 'gross', 'gross_', column))

directorLikeStats = pd.concat(likeList).reset_index(drop=True)
directorGrossStats = pd.concat(grossList).reset_index(drop=True)

# One row per director: like statistics next to gross statistics.
directorStats = pd.merge(directorLikeStats, directorGrossStats, on='director', how='outer')

# Directors with the lowest mean like count come first.
directorStats = directorStats.sort_values(by='likes_mean', ascending=True)

directorStats.head(10)
        

In [None]:
# Per-actor summary of 'actor_1_facebook_likes' and 'gross', one row per actor_1 dummy column.
likeActor1List = []
grossActor1List = []

for column in df.columns:
    # pd.get_dummies names its columns '<source column>_<value>', so match that
    # prefix exactly. A plain "'actor_1' in column" test also matches the numeric
    # 'actor_1_facebook_likes' column and adds a bogus "actor" row built from the
    # movies whose like count happens to equal 1.
    if not column.startswith('actor_1_name_'):
        continue

    # Rows (movies) flagged with this actor.
    rowsWithActor1 = df[df[column] == 1]

    infoLikes = rowsWithActor1[['actor_1_facebook_likes']]
    infoGross = rowsWithActor1[['gross']]

    # One row of summary statistics each, with prefixed column names so the
    # like and gross statistics can sit side by side after the merge.
    describeLikeStats = infoLikes.describe(include='all').transpose().add_prefix('likes_')
    describeGrossStats = infoGross.describe(include='all').transpose().add_prefix('gross_')

    describeLikeStats['actor1'] = column
    describeGrossStats['actor1'] = column

    likeActor1List.append(describeLikeStats)
    grossActor1List.append(describeGrossStats)

actor1LikeStats = pd.concat(likeActor1List).reset_index(drop=True)
actor1GrossStats = pd.concat(grossActor1List).reset_index(drop=True)

actor1Stats = pd.merge(actor1LikeStats, actor1GrossStats, on='actor1', how='outer')

# Actors with the lowest mean like count come first.
actor1Stats = actor1Stats.sort_values(by='likes_mean', ascending=True)

actor1Stats.head(10)

In [None]:
# Per-actor summary of 'actor_2_facebook_likes' and 'gross', one row per actor_2 dummy column.
likeActor2List = []
grossActor2List = []

for column in df.columns:
    # pd.get_dummies names its columns '<source column>_<value>', so match that
    # prefix exactly. A plain "'actor_2' in column" test also matches the numeric
    # 'actor_2_facebook_likes' column and adds a bogus "actor" row built from the
    # movies whose like count happens to equal 1.
    if not column.startswith('actor_2_name_'):
        continue

    # Rows (movies) flagged with this actor.
    rowsWithActor2 = df[df[column] == 1]

    infoLikes = rowsWithActor2[['actor_2_facebook_likes']]
    infoGross = rowsWithActor2[['gross']]

    # One row of summary statistics each, with prefixed column names so the
    # like and gross statistics can sit side by side after the merge.
    describeLikeStats = infoLikes.describe(include='all').transpose().add_prefix('likes_')
    describeGrossStats = infoGross.describe(include='all').transpose().add_prefix('gross_')

    describeLikeStats['actor2'] = column
    describeGrossStats['actor2'] = column

    likeActor2List.append(describeLikeStats)
    grossActor2List.append(describeGrossStats)

actor2LikeStats = pd.concat(likeActor2List).reset_index(drop=True)
actor2GrossStats = pd.concat(grossActor2List).reset_index(drop=True)

actor2Stats = pd.merge(actor2LikeStats, actor2GrossStats, on='actor2', how='outer')

# Actors with the lowest mean like count come first.
actor2Stats = actor2Stats.sort_values(by='likes_mean', ascending=True)

actor2Stats.head(10)

In [None]:
# Per-actor summary of 'actor_3_facebook_likes' and 'gross', one row per actor_3 dummy column.
likeActor3List = []
grossActor3List = []

for column in df.columns:
    # pd.get_dummies names its columns '<source column>_<value>', so match that
    # prefix exactly. A plain "'actor_3' in column" test also matches the numeric
    # 'actor_3_facebook_likes' column and adds a bogus "actor" row built from the
    # movies whose like count happens to equal 1.
    if not column.startswith('actor_3_name_'):
        continue

    # Rows (movies) flagged with this actor.
    rowsWithActor3 = df[df[column] == 1]

    infoLikes = rowsWithActor3[['actor_3_facebook_likes']]
    infoGross = rowsWithActor3[['gross']]

    # One row of summary statistics each, with prefixed column names so the
    # like and gross statistics can sit side by side after the merge.
    describeLikeStats = infoLikes.describe(include='all').transpose().add_prefix('likes_')
    describeGrossStats = infoGross.describe(include='all').transpose().add_prefix('gross_')

    describeLikeStats['actor3'] = column
    describeGrossStats['actor3'] = column

    likeActor3List.append(describeLikeStats)
    grossActor3List.append(describeGrossStats)

actor3LikeStats = pd.concat(likeActor3List).reset_index(drop=True)
actor3GrossStats = pd.concat(grossActor3List).reset_index(drop=True)

actor3Stats = pd.merge(actor3LikeStats, actor3GrossStats, on='actor3', how='outer')

# Actors with the lowest mean like count come first.
actor3Stats = actor3Stats.sort_values(by='likes_mean', ascending=True)

actor3Stats.head(10)