In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

# Exam Preparation

## Demos

In [3]:
EPSILON = 1e-06

In [4]:
spaceship_data = pd.read_csv("data/train.csv")

In [5]:
spaceship_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [6]:
spaceship_data.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [7]:
spaceship_data.CryoSleep.unique()

array([False, True, nan], dtype=object)

In [8]:
spaceship_data.CryoSleep.value_counts(dropna = False)

False    5439
True     3037
NaN       217
Name: CryoSleep, dtype: int64

In [9]:
spaceship_data.FoodCourt[spaceship_data.FoodCourt == 0]

0       0.0
7       0.0
9       0.0
10      0.0
12      0.0
       ... 
8681    0.0
8684    0.0
8685    0.0
8689    0.0
8690    0.0
Name: FoodCourt, Length: 5456, dtype: float64

In [10]:
spaceship_data.FoodCourt = spaceship_data.FoodCourt.apply(lambda x: EPSILON if x <=0 else x)

In [11]:
spaceship_data.FoodCourt.min()

1e-06

In [12]:
positive_values_transformer = FunctionTransformer(func = lambda x: np.array([EPSILON if elem <= 0 else elem for elem in x]))

In [13]:
log_transformer = FunctionTransformer(func = lambda x: np.log10(x))

In [14]:
log_pipeline = Pipeline([
    ("pos_values", positive_values_transformer),
    ("log", log_transformer)
])

In [15]:
spaceship_data.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [16]:
pd.get_dummies(spaceship_data.HomePlanet, dummy_na = True)

Unnamed: 0,Earth,Europa,Mars,NaN
0,0,1,0,0
1,1,0,0,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
8688,0,1,0,0
8689,1,0,0,0
8690,1,0,0,0
8691,0,1,0,0


In [17]:
def fill_in_string_values(str_values):
    strings = str_values.fillna("Unknown")
    string_value_columns = strings.values.reshape(-1, 1)
    return string_value_columns

In [18]:
spaceship_data.HomePlanet.values.reshape(-1, 1)

array([['Europa'],
       ['Earth'],
       ['Europa'],
       ...,
       ['Earth'],
       ['Europa'],
       ['Europa']], dtype=object)

In [19]:
home_planet_pipeline = Pipeline(steps = [
    ("fillna", FunctionTransformer(func = fill_in_string_values)),
    ("ohe", OneHotEncoder()),
    ("create_dense_matrix", FunctionTransformer(func = lambda x: np.array(x.todense()))),
    ("pca", PCA())
])

In [20]:
home_planet_pipeline.fit(spaceship_data.HomePlanet)

In [21]:
transformed_home_planets = home_planet_pipeline.transform(spaceship_data.HomePlanet)

In [22]:
transformed_home_planets

array([[ 7.42898965e-01, -5.81696193e-01, -3.57600764e-02,
        -3.36536354e-16],
       [-5.66208172e-01, -4.70708684e-02, -1.54065332e-02,
         7.97972799e-17],
       [ 7.42898965e-01, -5.81696193e-01, -3.57600764e-02,
        -3.36536354e-16],
       ...,
       [-5.66208172e-01, -4.70708684e-02, -1.54065332e-02,
         7.97972799e-17],
       [ 7.42898965e-01, -5.81696193e-01, -3.57600764e-02,
        -3.36536354e-16],
       [ 7.42898965e-01, -5.81696193e-01, -3.57600764e-02,
        -3.36536354e-16]])

In [23]:
search = GridSearchCV(home_planet_pipeline, param_grid = {
    "pca__n_components": [1, 2, 3, 4, 5]
})

In [24]:
search.fit(spaceship_data.HomePlanet)

5 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tstoyn\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 730, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "C:\Users\tstoyn\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\tstoyn\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\tstoyn\AppData\Local\an

In [25]:
search.cv_results_

{'mean_fit_time': array([0.00480018, 0.00440011, 0.00451183, 0.0031908 , 0.00260181]),
 'std_fit_time': array([0.00074835, 0.00049   , 0.00044806, 0.0004042 , 0.00049142]),
 'mean_score_time': array([0.00119982, 0.00079985, 0.00101089, 0.00081043, 0.        ]),
 'std_score_time': array([3.99899522e-04, 3.99923453e-04, 1.96807280e-05, 4.05719031e-04,
        0.00000000e+00]),
 'param_pca__n_components': masked_array(data=[1, 2, 3, 4, 5],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'pca__n_components': 1},
  {'pca__n_components': 2},
  {'pca__n_components': 3},
  {'pca__n_components': 4},
  {'pca__n_components': 5}],
 'split0_test_score': array([-1.42556499, -0.13879519, 13.44000103, 17.27602341,         nan]),
 'split1_test_score': array([-1.45059461, -0.25972173, 14.50066971,        -inf,         nan]),
 'split2_test_score': array([-1.53664913, -0.24710998,        -inf,        -inf,         nan]),
 'split3_tes

In [26]:
search.best_params_

{'pca__n_components': 2}

In [27]:
log_pipeline

In [28]:
FeatureUnion([
    ("log_of_price", log_pipeline),
    ("homeplanet_encoding", home_planet_pipeline)
])

In [29]:
ColumnTransformer([
    ("log_of_price", log_pipeline, ["RoomService", "FoodCourt", "Shopping Mall", "Spa", "VRDeck"]),
    ("homeplanet_encoding", home_planet_pipeline, ["HomePlanet"])
])

In [30]:
tfidf = TfidfVectorizer()
tfidf.fit(spaceship_data.Name.fillna("None"))

In [31]:
names_encoding_pipeline = Pipeline([
    ("fill_na", FunctionTransformer(func = fill_in_string_values)),
    ("tdidf", TfidfVectorizer()),
    ("svd", TruncatedSVD())
])

In [32]:
names_encoding_pipeline

In [33]:
column_transformer = ColumnTransformer([
    ("log_of_price", log_pipeline, ["RoomService", "FoodCourt", "Shopping Mall", "Spa", "VRDeck"]),
    ("homeplanet_encoding", home_planet_pipeline, ["HomePlanet"]),
    ("names_encoding", names_encoding_pipeline, ["Name"])
])

In [34]:
pipeline = Pipeline(steps = [
    ("features", column_transformer),
    ("final_pca", PCA()),
    ("model", LogisticRegression())
])

In [35]:
pipeline.steps

[('features',
  ColumnTransformer(transformers=[('log_of_price',
                                   Pipeline(steps=[('pos_values',
                                                    FunctionTransformer(func=<function <lambda> at 0x00000160E78D7A60>)),
                                                   ('log',
                                                    FunctionTransformer(func=<function <lambda> at 0x00000160E7938220>))]),
                                   ['RoomService', 'FoodCourt', 'Shopping Mall',
                                    'Spa', 'VRDeck']),
                                  ('homeplanet_encoding',
                                   Pipeline(steps=[('fillna',
                                                    FunctionTransformer(fun...00160E7938D60>)),
                                                   ('ohe', OneHotEncoder()),
                                                   ('create_dense_matrix',
                                                    Function

In [36]:
# Pipeline which accepts raw data, perform grid search for hyperparameter tunning and return the best model

# search = GridSearchCV(pipeline, param_grid = {})
# search.fit()
# search.best_estimator_