## Imports

In [None]:
# pip install --upgrade scikit-learn

In [612]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

import os
import zipfile
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import TruncatedSVD

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.3.2
Sklearn  1.0.1


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [2]:
os.environ['KAGGLE_USERNAME'] = "anthonyemeka12"
os.environ['KAGGLE_KEY']      = "94aca0f70e36c6996ff3fa3766094158"
!pip install --upgrade kaggle
!kaggle competitions download -c titanic
DATA_PATH = "./titanic/"

#unzip
with zipfile.ZipFile('titanic.zip', 'r') as zip_ref:
    zip_ref.extractall(DATA_PATH)
os.remove('titanic.zip')

Downloading titanic.zip to c:\Users\Asus\Documents\Education\Strive School\AI-Engineering\Chapter 2\10. Robust ML




  0%|          | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████| 34.1k/34.1k [00:00<00:00, 11.6MB/s]


## Load data

In [3]:
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("")
print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)


Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


## Check missings

In [4]:
(df.isnull().sum() / len(df))*100

Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [5]:
(df_test.isnull().sum() / len(df))*100

Pclass       0.000000
Name         0.000000
Sex          0.000000
Age          9.652076
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.112233
Cabin       36.700337
Embarked     0.000000
dtype: float64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

In [6]:
df['Title'] = df.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
df_test['Title'] = df_test.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [7]:
assert df['Title'].values[0] == "Mr"
assert df['Title'].values[1] == "Mrs"
assert df['Title'].values[2] == "Miss"

assert df_test['Title'].values[0] == "Mr"
assert df_test['Title'].values[1] == "Mrs"
assert df_test['Title'].values[414] == "Dona"

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [8]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [9]:
df["Title"] =  df.Title.map(title_dictionary)
df_test["Title"] = df_test.Title.map(title_dictionary)
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs


In [10]:
assert df['Title'].values[886] == "Officer"
assert df_test['Title'].values[417] == "Master"

In [218]:
df.Ticket.values

array(['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
       '330877', '17463', '349909', '347742', '237736', 'PP 9549',
       '113783', 'A/5. 2151', '347082', '350406', '248706', '382652',
       '244373', '345763', '2649', '239865', '248698', '330923', '113788',
       '349909', '347077', '2631', '19950', '330959', '349216',
       'PC 17601', 'PC 17569', '335677', 'C.A. 24579', 'PC 17604',
       '113789', '2677', 'A./5. 2152', '345764', '2651', '7546', '11668',
       '349253', 'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371',
       '14311', '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572',
       '2926', '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651',
       'CA 2144', '2669', '113572', '36973', '347088', 'PC 17605', '2661',
       'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111',
       'CA 2144', 'S.O.C. 14879', '2680', '1601', '348123', '349208',
       '374746', '248738', '364516', '345767', '345779', '330932',
       '113059',

# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

In [484]:
unique_tickets = set()
num_tickets = set()
def cat_tic(inp):
    if inp < 0:
        return 0
    elif inp >=0 and inp < 1000:
        return 1
    elif inp >= 1000 and inp < 10000:
        return 3
    elif inp >= 10000 and inp < 200000:
        return 3
    return 4
    

def pro_ticket(col_row):
    col_row_lst = [i.strip() for i in col_row.split(" ")]
    unique_ticket = None
    num_ticket = None
    if len(col_row_lst) > 1:
        unique_ticket = col_row_lst[0].replace('.','').lower()
        # num_ticket = str(cat_tic( int(col_row_lst[-1].replace('.','')) ))
        num_ticket = int(col_row_lst[-1].replace('.','')) > 600
    else:
        try:
            # num_ticket = str(cat_tic( int(col_row_lst[0].replace('.','')) ))
            num_ticket = int(col_row_lst[-1].replace('.','')) > 600
            # unique_ticket = 'NOTSPECIFIED'
        except:
            unique_ticket = col_row_lst[0].replace('.','').lower()
            # num_ticket = str(cat_tic( -1 ))
    
    return unique_ticket, num_ticket

df['Ticket_str'] = df.Ticket.apply(lambda x: pro_ticket(x)[0])
df['Ticket_num'] = df.Ticket.apply(lambda x: pro_ticket(x)[1])

df_test['Ticket_str'] = df_test.Ticket.apply(lambda x: pro_ticket(x)[0])
df_test['Ticket_num'] = df_test.Ticket.apply(lambda x: pro_ticket(x)[1])



print(df.groupby('Ticket_num').count())
print('\n')
print(df['Ticket_str'])

            Survived  Pclass  Name  Sex  Age  SibSp  Parch  Ticket  Fare  \
Ticket_num                                                                 
False              3       3     3    3    3      3      3       3     3   
True             884     884   884  884  707    884    884     884   884   

            Cabin  Embarked  Title  Ticket_str  
Ticket_num                                      
False           2         3      3           3  
True          202       882    884         223  


PassengerId
1          a/5
2           pc
3      ston/o2
4         None
5         None
        ...   
887       None
888       None
889        w/c
890       None
891       None
Name: Ticket_str, Length: 891, dtype: object


# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

In [485]:
df.Cabin

PassengerId
1       NaN
2       C85
3       NaN
4      C123
5       NaN
       ... 
887     NaN
888     B42
889     NaN
890    C148
891     NaN
Name: Cabin, Length: 891, dtype: object

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [486]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

In [487]:
cat_vars  = ['Sex', 'Embarked', 'Title', 'Ticket_str', 'Ticket_num']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title', 'Ticket_str', 'Ticket_num']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [644]:
# Drop other vars not specified in num_vars or cat_vars

cat_vars  = ['Sex', 'Embarked', 'Title']#, 'Ticket_str'
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median', add_indicator=False))#, # mean, median
])

cat_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
])

num_4_nonTreeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False)), # mean, median
    ('scaler', preprocessing.QuantileTransformer(n_quantiles=100))
    # ('pca', TruncatedSVD (n_components=2))
])

cat_4_nonTreeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    # ('ordinal', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))
    # ('pca', TruncatedSVD (n_components=2))
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

nonTree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_nonTreeModels, num_vars),
    ('cat', cat_4_nonTreeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

nonTree_prepro


In [645]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [646]:
#Tree Models
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

#NonTree Models
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [647]:
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(),
  "Random Forest": RandomForestClassifier(),
  "AdaBoost": AdaBoostClassifier(),
  "Skl GBM": GradientBoostingClassifier(),
  "Skl HistGBM": HistGradientBoostingClassifier(),
  "XGBoost": XGBClassifier(use_label_encoder=False,eval_metric='error'),
  "LightGBM": LGBMClassifier(),
  "CatBoost": CatBoostClassifier()
}
nonTree_classifiers = {
  "Linear Regression": LinearRegression(),
  "KNeighbors Classifier": KNeighborsClassifier(n_neighbors=10),
  "Gaussian NB": GaussianNB(),
  "Logistic Regression": LogisticRegression(),
  "Svc": SVC()
}

nonTree_classifiers = {name: pipeline.make_pipeline(nonTree_prepro, model) for name, model in nonTree_classifiers.items()}
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["LightGBM"]

In [648]:
for pipe in tree_classifiers.values():
    assert type(pipe) is pipeline.Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [649]:
# Train Test Split
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    x,y,
    test_size=.2,
    stratify=y,
    random_state=0
)

#Train model
results_test_split = []

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results_test_split.append(
        {
            "Model": model_name,
            "Accuracy": metrics.accuracy_score(y_val, pred)*100,
            "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
            "Time": total_time
        }
        )

for model_name, model in nonTree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results_test_split.append(
        {
            "Model": model_name,
            "Accuracy": metrics.accuracy_score(y_val, np.round(abs(pred)). astype(int))*100,
            "Bal Acc.": metrics.balanced_accuracy_score(y_val, np.round(abs(pred)). astype(int))*100,
            "Time": total_time
        }
        )

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.008911
0:	learn: 0.6896767	total: 1.06ms	remaining: 1.06s
1:	learn: 0.6840123	total: 3.03ms	remaining: 1.51s
2:	learn: 0.6785069	total: 4.67ms	remaining: 1.55s
3:	learn: 0.6727526	total: 6.43ms	remaining: 1.6s
4:	learn: 0.6660879	total: 7.99ms	remaining: 1.59s
5:	learn: 0.6603185	total: 9.37ms	remaining: 1.55s
6:	learn: 0.6545448	total: 10.7ms	remaining: 1.52s
7:	learn: 0.6501322	total: 11.9ms	remaining: 1.48s
8:	learn: 0.6449121	total: 13.4ms	remaining: 1.47s
9:	learn: 0.6420056	total: 14.3ms	remaining: 1.42s
10:	learn: 0.6369110	total: 15.9ms	remaining: 1.43s
11:	learn: 0.6316935	total: 17.5ms	remaining: 1.44s
12:	learn: 0.6264142	total: 19.1ms	remaining: 1.45s
13:	learn: 0.6217946	total: 20.6ms	remaining: 1.45s
14:	learn: 0.6174358	total: 22ms	remaining: 1.45s
15:	learn: 0.6125738	total: 23.4ms	remaining: 1.44s
16:	learn: 0.6085813	total: 25.2ms	remaining: 1.46s
17:	learn: 0.6039203	total: 26.9ms	remaining: 1.47s
18:	learn: 0.5993951	total: 28ms	remaining: 1.4

In [650]:
results_ord_train_test_split = pd.DataFrame.from_dict(results_test_split).sort_values('Accuracy',ascending=False)
results_ord_train_test_split

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
4,Skl GBM,85.47486,83.050066,0.091994
10,KNeighbors Classifier,83.240223,80.6917,0.024003
6,XGBoost,82.681564,80.237154,0.082
8,CatBoost,81.564246,79.057971,1.815536
13,Svc,81.564246,78.517787,0.037992
5,Skl HistGBM,81.005587,78.603426,0.588995
3,AdaBoost,79.888268,77.964427,0.099004
7,LightGBM,79.888268,77.424242,0.077003
12,Logistic Regression,79.329609,76.429513,0.025002
9,Linear Regression,78.77095,75.974967,0.016997


In [651]:
assert results_ord_train_test_split["Accuracy"].min() > 75
assert results_ord_train_test_split["Bal Acc."].min() > 75
assert len(results_ord_train_test_split) == 9

AssertionError: 

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [652]:
skf = model_selection.StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0
)

#Train model
results_cv = []

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = cross_val_predict(model, x, y, cv=skf)

    total_time = time.time() - start_time

    results_cv.append(
        {
            "Model": model_name,
            "Accuracy": metrics.accuracy_score(y, pred)*100,
            "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
            "Time": total_time
        }
        )

Learning rate set to 0.009371
0:	learn: 0.6897520	total: 971us	remaining: 970ms
1:	learn: 0.6859991	total: 2.01ms	remaining: 1s
2:	learn: 0.6793872	total: 3.58ms	remaining: 1.19s
3:	learn: 0.6732314	total: 5.16ms	remaining: 1.28s
4:	learn: 0.6684068	total: 6.03ms	remaining: 1.2s
5:	learn: 0.6620659	total: 7.86ms	remaining: 1.3s
6:	learn: 0.6558435	total: 9.66ms	remaining: 1.37s
7:	learn: 0.6512660	total: 11.2ms	remaining: 1.39s
8:	learn: 0.6454009	total: 12.7ms	remaining: 1.4s
9:	learn: 0.6398426	total: 14.5ms	remaining: 1.44s
10:	learn: 0.6345826	total: 16.1ms	remaining: 1.45s
11:	learn: 0.6291410	total: 17.7ms	remaining: 1.45s
12:	learn: 0.6237012	total: 19.2ms	remaining: 1.46s
13:	learn: 0.6193753	total: 20.4ms	remaining: 1.43s
14:	learn: 0.6155657	total: 21.2ms	remaining: 1.39s
15:	learn: 0.6131940	total: 22ms	remaining: 1.35s
16:	learn: 0.6083842	total: 23.5ms	remaining: 1.36s
17:	learn: 0.6041952	total: 25ms	remaining: 1.36s
18:	learn: 0.5992284	total: 26.6ms	remaining: 1.38s
19:

In [653]:
for model_name, model in nonTree_classifiers.items():
    start_time = time.time()
    
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = cross_val_predict(model, x, y, cv=skf)

    total_time = time.time() - start_time

    results_cv.append(
        {
            "Model": model_name,
            # "Accuracy": metrics.accuracy_score(y, pred)*100,
            # "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
            "Accuracy": metrics.accuracy_score(y, np.round(abs(pred)). astype(int))*100,
            "Bal Acc.": metrics.balanced_accuracy_score(y, np.round(abs(pred)). astype(int))*100,
            "Time": total_time
        }
        )

In [654]:
results_ord_cv = pd.DataFrame.from_dict(results_cv).sort_values('Accuracy',ascending=False)
results_ord_cv

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
4,Skl GBM,84.399551,82.214073,0.912982
8,CatBoost,83.38945,81.339277,19.111238
10,KNeighbors Classifier,83.05275,80.294315,0.219998
5,Skl HistGBM,82.491582,80.8863,7.079518
7,LightGBM,82.267116,80.483654,0.777003
13,Svc,81.705948,78.650177,0.387992
6,XGBoost,81.481481,80.011504,0.879001
2,Random Forest,81.369248,80.085802,1.696688
9,Linear Regression,80.808081,79.079187,0.199986
1,Extra Trees,80.695847,79.373981,1.447062


In [280]:
assert results_ord_cv["Accuracy"].min() > 75
assert results_ord_cv["Bal Acc."].min() > 75
assert len(results_ord_cv) == 9

# Exercise 7.1
Train with all data the best model

In [655]:
best_Accuracy = -np.inf
best_model_name = None
for d in [results_ord_cv, results_ord_train_test_split]:
    if d.Accuracy.max() > best_Accuracy:
        best_model_name = d[d.Accuracy == d.Accuracy.max()].Model.values[0]


best_model = tree_classifiers[best_model_name]
best_model.fit(x,y)

# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [656]:
test_pred = best_model.predict(x_test)

In [657]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [658]:
sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [659]:
sub.to_csv("sub.csv")

In [661]:
!kaggle competitions submit -c titanic -f sub.csv -m "Another submit"

401 - Unauthorized
