In [None]:
import kaggle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

1. Download the data programatically with python.

In [2]:
kaggle.api.authenticate()
kaggle.api.dataset_download_files('PromptCloudHQ/imdb-data', path='imdb-data', unzip=True)

2. Fit the training-data into a model, present it visually and reflect on wether it is a good fit or not.

In [3]:
data = pd.read_csv('imdb-data/IMDB-Movie-Data.csv')
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [4]:
revenue = data.pop('Revenue (Millions)').values
revenue

array([3.3313e+02, 1.2646e+02, 1.3812e+02, 2.7032e+02, 3.2502e+02,
       4.5130e+01, 1.5106e+02,        nan, 8.0100e+00, 1.0001e+02,
       2.3402e+02, 1.6927e+02, 5.3217e+02, 2.4875e+02, 2.8700e+00,
       3.6831e+02, 6.7120e+01, 1.6216e+02, 5.1690e+01, 1.0050e+02,
       7.2200e+00, 4.7700e+01,        nan, 1.5369e+02, 1.0314e+02,
              nan, 6.5000e+00, 1.0000e-02, 1.1308e+02, 5.4650e+01,
       6.0310e+01, 1.0640e+01, 1.5533e+02, 3.6302e+02, 2.6840e+01,
       4.0808e+02, 1.8799e+02, 2.3260e+02, 9.3380e+01,        nan,
       9.7660e+01, 2.7850e+01,        nan, 1.2790e+01, 4.2100e+00,
       2.4106e+02, 3.4400e+00,        nan, 1.5880e+02,        nan,
       9.3663e+02, 3.0350e+01, 3.2460e+01, 4.3000e+01, 5.3332e+02,
       7.0800e+00, 8.9210e+01, 5.6230e+01, 3.1800e+00, 1.2507e+02,
       3.3025e+02,        nan, 7.5310e+01, 1.6615e+02, 5.3080e+01,
       1.2825e+02, 3.1860e+01, 1.5363e+02, 1.0000e-02, 6.1280e+01,
              nan, 4.0070e+01, 3.7300e+00, 3.0980e+01, 3.4126e

In [5]:
from sklearn.preprocessing import OneHotEncoder

def encode(frame, column):
    column_train = frame[[column]].copy()
    ohe = OneHotEncoder(sparse=False)
    column_train_transformed = ohe.fit_transform(column_train)
    return column_train_transformed, ohe

In [6]:
encoded_titles, title_ohe = encode(data, "Title")
encoded_directors, director_ohe = encode(data, "Director")
encoded_genres, genre_ohe = encode(data, "Genre")
encoded_descriptions, description_ohe = encode(data, "Description")
encoded_actors, actors_ohe= encode(data, "Actors")

It’s good to verify that our estimator is working properly. Let’s look at the first row of encoded data.

In [7]:
feature_names = title_ohe.get_feature_names()
first = feature_names[encoded_titles[0] == 1]
print(first)
print(data['Title'][0])

['x0_Guardians of the Galaxy']
Guardians of the Galaxy


Just like most transformer objects, there is an inverse_transform method that will get you back your original data. Here we must wrap row0 in a list to make it a 2D array.

In [8]:
title_ohe.inverse_transform([encoded_titles[0]])

array([['Guardians of the Galaxy']], dtype=object)

In [9]:
from_data = data['Title'].values
from_train = list(map(lambda x: x[0], title_ohe.inverse_transform(encoded_titles)))
np.unique(from_data == from_train)

array([ True])

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def find_text_columns(frame):
    values = [i for (i, t) in enumerate(frame.dtypes.values) if t.name == 'object']
    return frame.columns[values]

def find_numeric_columns(frame):
    values = [i for (i, t) in enumerate(frame.dtypes.values) if t.name != 'object']
    return frame.columns[values]
    

text_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))
text_ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
text_pipe = Pipeline([text_si_step, text_ohe_step])
text_cols = find_text_columns(data)

numeric_si_step = ('si', SimpleImputer(strategy='median'))
numeric_ss_step = ('ss', StandardScaler())
numeric_pipe = Pipeline([numeric_si_step, numeric_ss_step])
numeric_cols = find_numeric_columns(data)

transformers = [
    ('text', text_pipe, text_cols),
    ('num', numeric_pipe, numeric_cols)
]
ct = ColumnTransformer(transformers=transformers)

In [13]:
from sklearn.linear_model import Ridge
import inspect

def normalize(array):
    median = 0
    nans = []
    for (i, value) in enumerate(array):
        if np.isnan(value) or value == 0.0:
            nans.append(i)
        else:
            median += value
    
    median /= len(array)
    for i in nans:
        array[i] = median
        
    return array



revenue = normalize(revenue)
ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])

In [17]:
ml_pipe.fit(data, revenue)
ml_pipe.score(data, revenue)

0.9831394214622724

In [31]:
prediction = ml_pipe.predict(data)
diff = abs(prediction - revenue)
enumerated = list(enumerate(diff))
biggest_diff = sorted(enumerated, key=lambda t: t[1], reverse=True)
print(biggest_dif)
# percentage_difference = abs(prediction - revenue) / revenue) * 100
# percentage_difference
# avg_percentage_difference = sum(percentage_difference) / len(percentage_difference)
# avg_percentage_difference

[(50, 134.3234886029236),
 (85, 85.09049704111294),
 (12, 73.1087630639363),
 (87, 72.85687664247496),
 (119, 66.93139188697415),
 (80, 50.82017736312747),
 (125, 48.02067573824297),
 (84, 44.23827786551078),
 (710, 43.35897555961327),
 (331, 41.96820882133318),
 (76, 41.65527806826617),
 (35, 39.89562602161362),
 (547, 39.768819063068975),
 (15, 37.71530871172348),
 (400, 37.33322969259223),
 (578, 36.94320415094421),
 (78, 36.30048131083436),
 (310, 34.61226307745042),
 (94, 33.67466569337785),
 (279, 32.47046614744602),
 (3, 31.216286248012324),
 (797, 30.789414157201634),
 (64, 29.824950774125213),
 (144, 29.781007373626466),
 (77, 29.53065395620827),
 (566, 28.970315935122358),
 (67, 28.729130600208777),
 (133, 28.724436957551806),
 (941, 28.68102341834981),
 (249, 28.34330387970555),
 (242, 27.9033900383535),
 (344, 27.8313693120798),
 (733, 27.79480512223941),
 (789, 27.407985592586613),
 (82, 27.274519586320764),
 (182, 27.106989244490705),
 (136, 26.76337886861363),
 (33, 25.7