### We load the dataset

In [None]:
import pandas as pd
import numpy as np

# Load the pre-processed movies table and keep only the released titles.
# After the filter every remaining row has status == "Released", so the
# column carries no information and is dropped.
movies = (
    pd.read_csv("data/movies.1.initial_process.csv")
    .loc[lambda frame: frame["status"] == "Released"]
    .drop(columns="status")
)
movies.head()

### Create a Pipeline that processes the dataset. You have to make sure you deal appropriately with numerical, categorical and text variables. (Note: you don't have to use them all!)

In [None]:
# Column groups consumed by the preprocessing pipelines.
numerical_cols = movies.select_dtypes(include=np.number).columns

# Object-typed columns, minus the ones that are not treated as categories.
# Index.drop raises KeyError if any of the listed labels is missing.
categorical_cols = (
    movies.select_dtypes(include=object)
    .columns
    .drop(["belongs_to_collection", "title", "release_date"])
)

date_col = ["release_date"]

In [None]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline, make_union

In [None]:
from mlxtend.feature_selection import ColumnSelector

**Numerical pipeline**

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Building blocks of the numerical branch: fill missing values with the column
# mean, then standardise each column.
# NOTE: make_pipeline does not clone its steps, so any other pipeline built
# from these same `imputer` / `scaler` objects shares their fitted state.
imputer = SimpleImputer(strategy="mean")
scaler = preprocessing.StandardScaler()

# Select the numeric columns, impute, then scale.
numerical_pipeline = make_pipeline(
    ColumnSelector(cols=numerical_cols),
    imputer,
    scaler
)

In [None]:
numerical_pipeline.fit_transform(movies)

**Categorical pipeline**

In [None]:
from category_encoders import OneHotEncoder

# Categorical branch: select the categorical columns and one-hot encode them
# with the category_encoders OneHotEncoder imported above.
categorical_pipeline = make_pipeline(
    ColumnSelector(cols=categorical_cols),
    OneHotEncoder()
)

In [None]:
categorical_pipeline.fit_transform(movies).head()

In [None]:
# Full preprocessing step: run both branches on the same input and
# concatenate their outputs column-wise (categorical columns first).
processing_pipeline = make_union(
    categorical_pipeline,
    numerical_pipeline
)

**Note**: you could probably use the release date year as a categorical like this:

In [None]:
pd.to_datetime(movies.release_date).dt.year.astype("category")

### Transform the dataset

In [None]:
processing_pipeline

In [None]:
processed_data = processing_pipeline.fit_transform(movies)

In [None]:
processed_data.shape

### Create a Ridge estimator to predict a movie's revenue based on the other features. What is the optimal value of alpha to minimize the RMSE? *Hint*: You can use validation curves to figure it out.

In [None]:
# Column we want to predict. It must not be fed to the model as a feature, so
# it is removed before collecting the numeric feature columns.
target = "revenue"
numerical_cols_no_revenue = movies.drop(columns=target).select_dtypes(np.number).columns

# Use fresh imputer/scaler instances (same settings as `imputer` / `scaler`)
# instead of reusing those objects: make_pipeline does not clone its steps, so
# sharing them would refit, and overwrite, the estimators that
# `numerical_pipeline` was fitted with, leaving that pipeline in an
# inconsistent state.
numerical_pipeline_no_revenue = make_pipeline(
    ColumnSelector(cols=numerical_cols_no_revenue),
    SimpleImputer(strategy="mean"),
    preprocessing.StandardScaler()
)

In [None]:
# Only rows with a known revenue can be used to train/evaluate the regressor.
movies_with_revenue = movies.loc[movies["revenue"].notna()]

# Same two-branch preprocessing as before, with the target left out of the
# numeric features.
processing_pipeline_no_revenue = make_union(
    categorical_pipeline, numerical_pipeline_no_revenue
)

# Feature matrix and matching target vector (same rows, same order).
processed_data_no_revenue = processing_pipeline_no_revenue.fit_transform(
    movies_with_revenue
)
target_revenue = movies_with_revenue["revenue"]

In [None]:
processed_data_no_revenue

In [None]:
target_revenue

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.rcParams["figure.figsize"] = (12, 12)

In [None]:
range_alpha = np.linspace(0.001, 100, 100)

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

def rmse_cv(estimator, X, y):
    """Scorer callable: root mean squared error of `estimator` on (X, y).

    Has the ``scorer(estimator, X, y)`` signature, so it can be passed as the
    ``scoring`` argument of scikit-learn's cross-validation helpers.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``.
    X : feature matrix to predict on.
    y : true target values for ``X``.

    Returns
    -------
    float
        The RMSE (non-negative, lower is better).
    """
    y_pred = estimator.predict(X)
    # scikit-learn metrics expect (y_true, y_pred), in that order.
    return np.sqrt(mean_squared_error(y, y_pred))

# Sweep the Ridge regularisation strength `alpha` over `range_alpha`, scoring
# every value with 3-fold cross-validated RMSE on all available cores.
estimator = Ridge()

train_scores, test_scores = validation_curve(
    estimator,
    X=processed_data_no_revenue,
    y=target_revenue,
    param_name="alpha",
    param_range=range_alpha,
    scoring=rmse_cv,
    cv=3,
    n_jobs=-1,
)

In [None]:
# Average the per-fold scores (axis 1) for every alpha; the absolute value
# keeps the result positive whatever sign convention the scorer uses.
train_scores_mean = np.abs(train_scores.mean(axis=1))
test_scores_mean = np.abs(test_scores.mean(axis=1))

In [None]:
# Training vs. cross-validated RMSE as a function of alpha.
for scores, colour, label in (
    (train_scores_mean, "r", "Training score"),
    (test_scores_mean, "g", "Test score"),
):
    plt.plot(range_alpha, scores, 'o-', color=colour, label=label)

plt.title("Validation Curve: Ridge alpha value")
plt.xlabel("Alpha")
plt.ylabel("Root Mean Squared Error (RMSE)")
plt.legend();

We see that the training score (RMSE) increases, but the test score flattens out around $\alpha=90$. Since what we care about most is the test score (the training score is nice, but we care more about how the model generalizes to unseen data), we can use that value.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Mean 3-fold cross-validated RMSE of Ridge at the alpha read off the curve.
cross_val_score(
    Ridge(alpha=90),
    X=processed_data_no_revenue,
    y=target_revenue,
    cv=3,
    scoring=rmse_cv,
    n_jobs=-1,
).mean()

### Remember when we did exploratory data analysis and we grouped the numerical variables into quintiles? That is a valid technique used in Machine Learning to expand a dataset; it is called [Binning or Bucketing](http://blog.yhat.com/tutorials/5-Feature-Engineering.html).

### Create your own transformer that, given a numerical variable and a number of buckets, returns the corresponding quantile bucket (so if we choose buckets = 4, it would return 1, 2, 3 or 4 depending on each observation being in the 1st, 2nd, 3rd or 4th quartile).

### Try putting your bucket transformer into a pipeline to make sure it works, and check if it improves the performance of your model.

**Hint**: You can use `ColumnSelector` as a template, and you can check pandas `qcut` for the actual binning.

In [None]:
movies.head()

In [None]:
import pandas as pd

A transformer must work with numpy arrays, and I would like it also to work with pandas dataframes as inputs. The output will be a numpy array.

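I found that for some columns `pd.qcut` raised an error (`ValueError: Bin edges must be unique`), which happens when several quantile edges coincide.

This can be solved by passing the parameter `duplicates="drop"` to `qcut`.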

In [None]:
from sklearn.base import BaseEstimator

class QuantileBinner(BaseEstimator):
    """
    Replace every column by the index of the quantile bucket its values fall in.

    The bucket edges are learned per column in `fit` (with `pandas.qcut`) and
    reused by `transform`, so unseen data is binned with the edges of the data
    the binner was fitted on. Inputs may be numpy arrays or pandas DataFrames;
    the output is always a numpy array with one integer column per input column.

    Parameters
    ----------
    bins : int
        Number of quantile buckets requested per column (passed to
        `pandas.qcut` as `q`). Columns whose quantile edges repeat end up with
        fewer buckets, because duplicated edges are dropped.

    Attributes
    ----------
    bin_edges_ : list
        One array of bucket edges per input column, set by `fit`.

    Notes
    -----
    Bucket codes are zero-based (``0 .. n_buckets - 1``). Missing values, and
    values lying outside the fitted edges, are coded as ``-1``.
    """
    def __init__(self, bins):
        self.bins = bins

    @staticmethod
    def _to_frame(X):
        """Return `X` as a DataFrame so columns can be handled uniformly."""
        # Only pandas objects expose `.loc`; anything else (e.g. a numpy
        # array) is wrapped in a DataFrame.
        return X if hasattr(X, 'loc') else pd.DataFrame(X)

    def _learn_edges(self, frame):
        """Compute the quantile edges of every column of `frame`."""
        edges = []
        for position in range(frame.shape[1]):
            # duplicates="drop" avoids the "Bin edges must be unique" error on
            # columns where several quantiles coincide.
            _, column_edges = pd.qcut(
                frame.iloc[:, position],
                self.bins,
                retbins=True,
                duplicates="drop")
            edges.append(column_edges)
        return edges

    def fit(self, X, y=None):
        """Learn the per-column quantile edges from `X`; returns `self`."""
        self.bin_edges_ = self._learn_edges(self._to_frame(X))
        return self

    def fit_transform(self, X, y=None):
        """Learn the edges from `X` and return the bucket codes of `X`."""
        return self.fit(X, y).transform(X)

    def transform(self, X, y=None):
        """
        Map every value of `X` to the code of its quantile bucket.

        Uses the edges learned by `fit`. If the binner has not been fitted,
        the edges are derived from `X` itself (the historical behaviour of
        this class), without storing them.

        Raises
        ------
        ValueError
            If `X` does not have as many columns as the data seen in `fit`.
        """
        frame = self._to_frame(X)
        edges = getattr(self, "bin_edges_", None)
        if edges is None:
            edges = self._learn_edges(frame)
        elif len(edges) != frame.shape[1]:
            raise ValueError(
                "X has %d columns, but QuantileBinner was fitted on %d"
                % (frame.shape[1], len(edges)))

        if not edges:
            # No columns to bin: keep the row count, return zero columns.
            return np.empty((len(frame), 0), dtype=np.int8)

        # include_lowest=True mirrors qcut, which puts the minimum of the
        # fitted data into the first bucket. Collecting the code columns in a
        # list and stacking once avoids growing a DataFrame inside the loop.
        codes = [
            pd.cut(frame.iloc[:, position],
                   bins=column_edges,
                   include_lowest=True).cat.codes.values
            for position, column_edges in enumerate(edges)
        ]
        return np.column_stack(codes)

In [None]:
binner = QuantileBinner(bins=5)

We test that it works with both numpy arrays and pandas dataframes.

In [None]:
binner.fit_transform(movies[["budget", "popularity"]].values)

In [None]:
binner.fit_transform(movies[["budget", "popularity"]])

In [None]:
# Same numeric preprocessing as `numerical_pipeline_no_revenue`, followed by
# quantile binning. The imputer and scaler are new instances (same settings as
# `imputer` / `scaler`) rather than those shared objects: make_pipeline does
# not clone its steps, so reusing them would refit, and overwrite, the
# estimators that the earlier pipelines were fitted with.
numerical_pipeline_no_revenue_buckets = make_pipeline(
    ColumnSelector(cols=numerical_cols_no_revenue),
    SimpleImputer(strategy="mean"),
    preprocessing.StandardScaler(),
    binner
)

In [None]:
# Preprocessing variant whose numeric branch ends in the quantile binner;
# the categorical branch is the same one-hot pipeline used before.
processing_pipeline_no_revenue_buckets = make_union(
    categorical_pipeline,
    numerical_pipeline_no_revenue_buckets
)

In [None]:
processed_data_no_revenue_buckets = processing_pipeline_no_revenue_buckets.fit_transform(movies_with_revenue)


In [None]:
processed_data_no_revenue_buckets

In [None]:
# Mean 3-fold cross-validated RMSE of the same Ridge model on the bucketed
# features, to compare against the score obtained without binning.
cross_val_score(
    Ridge(alpha=90),
    X=processed_data_no_revenue_buckets,
    y=target_revenue,
    cv=3,
    scoring=rmse_cv,
    n_jobs=-1,
).mean()

We see that, in this case, binning does not improve the performance of the model.