# 0.0 Imports

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn import model_selection as ms
from sklearn import preprocessing

## 0.1 Load data

In [20]:
# Load the dataset registered as "exploratory_data_analysis" from the data catalog.
# NOTE(review): `catalog` is not defined anywhere in this notebook — presumably it is
# injected by the project's notebook session; confirm before running on a plain kernel.
exploratory_data_analysis = catalog.load("exploratory_data_analysis")

## 0.2 Helper Functions

In [21]:
def notebook_settings():
    """Configure pandas display options and matplotlib/seaborn plotting defaults.

    Reads the module-level ``exploratory_data_analysis`` frame, so the data
    must already be loaded when this function is called.

    Returns:
        None
    """
    pd.set_option('display.max_columns', None)
    # NOTE(review): shape[1] is the number of *columns*, used here as the row
    # display limit — presumably so per-column summaries print untruncated;
    # confirm this is intended.
    pd.set_option('display.max_rows', exploratory_data_analysis.shape[1])
    pd.set_option('display.float_format', lambda x: '%.3f' % x)

    plt.style.use('bmh')
    # sns.set() applies seaborn's default theme and overwrites font-related
    # rcParams (including 'font.size'), so the explicit overrides below must
    # come after it to take effect.
    sns.set()
    plt.rcParams['figure.figsize'] = [25, 10]
    plt.rcParams['font.size'] = 24


notebook_settings()

# 4.0 Data preparation

In [22]:
# Preview the first rows of the loaded frame before any preparation step is applied.
exploratory_data_analysis.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,41,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09-08,2017,57,99 min,Comedies,When nerdy high schooler Dani finally attracts...
2,70304989,Movie,Automata,Gabe Ibáñez,"Antonio Banderas, Dylan McDermott, Melanie Gri...","Bulgaria, United States, Spain, Canada",2017-09-08,2014,61,110 min,"International Movies, Sci-Fi & Fantasy, Thrillers","In a dystopian future, an insurance adjuster f..."
3,80164077,Movie,Fabrizio Copano: Solo pienso en mi,"Rodrigo Toro, Francisco Schultz",Fabrizio Copano,Chile,2017-09-08,2017,49,60 min,Stand-Up Comedy,Fabrizio Copano takes audience participation t...
4,70304990,Movie,Good People,Henrik Ruben Genz,"James Franco, Kate Hudson, Tom Wilkinson, Omar...","United States, United Kingdom, Denmark, Sweden",2017-09-08,2014,56,90 min,"Action & Adventure, Thrillers",A struggling couple can't believe their luck w...


## 4.1 Split Dataframe into training and validation dataset

In [23]:
# Fixed seed so the train/validation split is identical on every Restart & Run All.
SPLIT_RANDOM_STATE = 42

# `errors='ignore'` keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
exploratory_data_analysis = exploratory_data_analysis.drop(
    columns=['date_added'], errors='ignore'
)

# `rating` is the response variable; every remaining column is a feature.
X = exploratory_data_analysis.drop(columns='rating')
y = exploratory_data_analysis['rating'].copy()

# Hold out 20% of the rows for validation.
x_train, x_val, y_train, y_val = ms.train_test_split(
    X, y, test_size=0.20, random_state=SPLIT_RANDOM_STATE
)

# Re-attach the response so each split is handled as a single frame.
train_data = pd.concat([x_train, y_train], axis=1)
validation_data = pd.concat([x_val, y_val], axis=1)

# Sanity check: the two splits together account for every input row.
assert len(train_data) + len(validation_data) == len(exploratory_data_analysis)


## 4.3 Transformation

### 4.3.1 Encoding

In [24]:
cols_to_label = ['type', 'title', 'director', 'cast', 'country',
    'duration', 'listed_in', 'description']

# One fitted encoder per column, kept so each mapping can be reused or inverted later.
label_encoders = {}

for col in cols_to_label:
    # A fresh encoder per column; reusing a single instance would keep only the
    # mapping of the last column processed.
    le = preprocessing.LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])

    # Values that never appeared in the training split cannot be transformed, so
    # they are routed to a shared 'unknown' class. `isin` is a vectorized
    # membership test, replacing a per-row scan of `le.classes_`.
    seen_in_train = validation_data[col].isin(le.classes_)
    validation_data[col] = validation_data[col].where(seen_in_train, "unknown")

    # Register 'unknown' unless the training data already contains that literal
    # value (the fitted classes are already unique, so no de-duplication is needed).
    if "unknown" not in list(le.classes_):
        le.classes_ = np.append(le.classes_, "unknown")

    validation_data[col] = le.transform(validation_data[col])
    label_encoders[col] = le

### 4.3.2 Response Variable Transformation

A model's predictions can be biased by the distribution of the response variable. As we saw in the exploratory data analysis notebook, we need to transform the response variable so that it follows a distribution closer to normal.

In [25]:
#TODO

### 4.3.3 Nature Transformation

Nature transformation preserves the cyclical nature of a variable. For example, when months are stored as numbers, the model can see that month 1 is close to month 2, but month 1 appears very far from month 12 even though the two are adjacent in the calendar. In this case we will apply techniques that allow the model to understand that both months are very close.

In [26]:
# TODO

In [27]:
# Persist both prepared splits through the data catalog, training split first.
for dataset_name, prepared_frame in (
    ("train_data_preparation", train_data),
    ("validation_data_preparation", validation_data),
):
    catalog.save(dataset_name, prepared_frame)