In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import

In [3]:
movies_df = pd.read_csv('../raw_data/IMDb movies.csv')
names = pd.read_csv('../raw_data/IMDb names.csv')
titles = pd.read_csv('../raw_data/IMDb title_principals.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
df = movies_df.loc[(movies_df['budget'].isna()==False) & (movies_df['worlwide_gross_income'].isna()==False)].copy()
df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
165,tt0010323,Il gabinetto del dottor Caligari,Das Cabinet des Dr. Caligari,1920,1920-02-27,"Fantasy, Horror, Mystery",76,Germany,German,Robert Wiene,...,"Werner Krauss, Conrad Veidt, Friedrich Feher, ...","Hypnotist Dr. Caligari uses a somnambulist, Ce...",8.1,55601,$ 18000,$ 8811,$ 8811,,237.0,160.0
210,tt0011440,Markens grøde,Markens grøde,1921,1921-12-02,Drama,107,Norway,,Gunnar Sommerfeldt,...,"Amund Rydland, Karen Poulsen, Ragna Wettergree...",After the Nobel prize winning Knut Hamsun-nove...,6.6,195,NOK 250000,,$ 4272,,3.0,3.0
245,tt0012190,I quattro cavalieri dell'Apocalisse,The Four Horsemen of the Apocalypse,1921,1923-04-16,"Drama, Romance, War",150,USA,,Rex Ingram,...,"Pomeroy Cannon, Josef Swickard, Bridgetta Clar...",An extended family split up in France and Germ...,7.2,3058,$ 800000,$ 9183673,$ 9183673,,45.0,16.0
251,tt0012349,Il monello,The Kid,1921,1923-11-26,"Comedy, Drama, Family",68,USA,"English, None",Charles Chaplin,...,"Carl Miller, Edna Purviance, Jackie Coogan, Ch...","The Tramp cares for an abandoned child, but ev...",8.3,109038,$ 250000,,$ 26916,,173.0,105.0
348,tt0014624,La donna di Parigi,A Woman of Paris: A Drama of Fate,1923,1927-06-06,"Drama, Romance",82,USA,"None, English",Charles Chaplin,...,"Edna Purviance, Clarence Geldart, Carl Miller,...",A kept woman runs into her former fiancé and f...,7.0,4735,$ 351000,,$ 11233,,37.0,24.0


In [5]:
df.reset_index(inplace=True)

In [6]:
# df.budget.replace(value='', regex="\$")

### Cleaning

#### Budget

In [7]:
df['budget'] = df['budget'].str.strip() # supprime les espaces à la fin et au début
df['budget'] = df['budget'].str.split() # split la string en mots

df['currency'] = df['budget'].apply(lambda x: x[0])
df['budget'] = df['budget'].apply(lambda x: x[1]).astype('int64')

In [8]:
df.currency.unique()

array(['$', 'NOK', 'GBP', 'DEM', 'FRF', 'SEK', 'ITL', 'JPY', 'RUR', 'AUD',
       'HKD', 'CAD', 'ESP', 'IEP', 'DKK', 'BEF', 'INR', 'EUR', 'ISK',
       'PHP', 'FIM', 'BRL', 'CZK', 'NLG', 'ATS', 'CNY', 'BND', 'HUF',
       'THB', 'ZAR', 'SGD', 'KRW', 'PLN', 'EGP', 'MXN', 'NZD', 'CHF',
       'SKK', 'BGL', 'EEK', 'TWD', 'LVL', 'TRL', 'YUM', 'ARS', 'MYR',
       'IDR', 'UAH', 'RON', 'VEB', 'LTL', 'CLP', 'PKR', 'NGN', 'COP',
       'HRK', 'IRR', 'DOP', 'ILS', 'AMD', 'BDT', 'AED'], dtype=object)

#### Income 

In [9]:
df['worlwide_gross_income'] = df['worlwide_gross_income'].str.strip() # supprime les espaces à la fin et au début
df['worlwide_gross_income'] = df['worlwide_gross_income'].str.split() # split la string en mots

df['worlwide_gross_income'] = df['worlwide_gross_income'].apply(lambda x: x[1]).astype('int64')
df['worlwide_gross_income']

0           8811
1           4272
2        9183673
3          26916
4          11233
          ...   
12757       9306
12758       2568
12759      83857
12760     524061
12761       4791
Name: worlwide_gross_income, Length: 12762, dtype: int64

#### Baseline (linear regression)

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score

In [25]:
# Declaring X and y
reg_df = df[df.currency == '$'].copy()
reg_df.reset_index(inplace=True)
X = reg_df[['budget', 'avg_vote', 'duration', 'year']]
y = reg_df['worlwide_gross_income']
# X.avg_vote.hist()
X.shape, y.shape

((9025, 4), (9025,))

In [32]:
n_splits = 5
tscv = TimeSeriesSplit(n_splits)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


In [27]:
# Scaling avg_vote
scaler = RobustScaler()
X['avg_vote'] = scaler.fit_transform(X[['avg_vote']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['avg_vote'] = scaler.fit_transform(X[['avg_vote']])


In [41]:
def baseline(model, X, y):
    """ Returns a list of 5 r2 scores"""
    r2 = []
    tscv = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tscv.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2.append(r2_score(y_test, y_pred))
    return r2

print(baseline(LinearRegression(), X, y))

#### Baseline (GradientBoostingRegressor)

In [50]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(
    n_estimators=100, 
    learning_rate=0.1,
    max_depth=3
)
baseline(model, X, y)

TRAIN: [   0    1    2 ... 1502 1503 1504] TEST: [1505 1506 1507 ... 3006 3007 3008]
TRAIN: [   0    1    2 ... 3006 3007 3008] TEST: [3009 3010 3011 ... 4510 4511 4512]
TRAIN: [   0    1    2 ... 4510 4511 4512] TEST: [4513 4514 4515 ... 6014 6015 6016]
TRAIN: [   0    1    2 ... 6014 6015 6016] TEST: [6017 6018 6019 ... 7518 7519 7520]
TRAIN: [   0    1    2 ... 7518 7519 7520] TEST: [7521 7522 7523 ... 9022 9023 9024]


[0.26964979747280793,
 0.5212853600932246,
 0.47579248513284045,
 0.5985829033746941,
 0.6817552087900579]

#### Plots

In [None]:
# Income vs Budget
sns.scatterplot(x=df.worlwide_gross_income, y=df.budget)
plt.xlim(left=0, right=500000)
plt.ylim(bottom=0, top=1000000000)

In [None]:
# Income vs year
sns.scatterplot(y=df.worlwide_gross_income, x=df.year)

In [None]:
df.year = df.year.astype('int64')

In [None]:
df.date_published = pd.to_datetime(df.date_published)

In [None]:
df[df.year==2020].shape

In [None]:
sns.scatterplot(y=df.worlwide_gross_income, x=df[df.year>1980].date_published)

In [None]:
df[df.worlwide_gross_income < 100].shape