## Imports

In [2]:
# pip install --upgrade scikit-learn

In [3]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.3.4
Sklearn  1.0.2


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [4]:
DATA_PATH = "/Users/talvinderjohal/Desktop/Talvinder Strive Course/ai_nov21/Chapter 2/10. Robust ML"

## Load data

In [6]:
df      = pd.read_csv(DATA_PATH + "/train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "/test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


## Check missings

In [7]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [8]:
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [9]:
# CODE HERE get_Title_from_Name funtion
# Create this function using lambda (not def)

get_Title_from_Name = lambda name: name.split(',')[1].split('.')[0].strip()
df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)


In [10]:
assert df['Title'].values[0] == "Mr"
assert df['Title'].values[1] == "Mrs"
assert df['Title'].values[2] == "Miss"

assert df_test['Title'].values[0] == "Mr"
assert df_test['Title'].values[1] == "Mrs"
assert df_test['Title'].values[414] == "Dona"

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [12]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [13]:
# Use map to apply the prevous dict

df["Title"] =  df.Title.map(title_dictionary)
df_test["Title"] = df_test.Title.map(title_dictionary)

In [14]:
assert df['Title'].values[886] == "Officer"
assert df_test['Title'].values[417] == "Master"

# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

PassengerId
892                 330911
893                 363272
894                 240276
895                 315154
896                3101298
               ...        
1305             A.5. 3236
1306              PC 17758
1307    SOTON/O.Q. 3101262
1308                359309
1309                  2668
Name: Ticket, Length: 418, dtype: object

# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

PassengerId
892      NaN
893      NaN
894      NaN
895      NaN
896      NaN
        ... 
1305     NaN
1306    C105
1307     NaN
1308     NaN
1309     NaN
Name: Cabin, Length: 418, dtype: object

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [15]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

In [16]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [17]:
"""
num_preprocessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
])

cat_preporcessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  # Some Encoder here. Remember to handle_unknown
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro
""";

### BEGIN SOLUTION
num_4_treeModels = pipeline.Pipeline(steps=[('imputer', impute.SimpleImputer(strategy='mean')),])
cat_4_treeModels = pipeline.Pipeline(steps=[('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),('ordinal', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1))])
tree_prepro = compose.ColumnTransformer(transformers=[('num', num_4_treeModels, num_vars),('cat', cat_4_treeModels, cat_vars),], remainder='drop')

tree_prepro


In [18]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [20]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier



In [22]:
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":   ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost":      AdaBoostClassifier(n_estimators=100),
  "Skl GBM":       GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM":   HistGradientBoostingClassifier(max_iter=100),
  "XGBoost":       XGBClassifier(n_estimators=100),
  "LightGBM":      LGBMClassifier(n_estimators=100),
  "CatBoost":      CatBoostClassifier(n_estimators=100),
}
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["LightGBM"]





In [23]:
for pipe in tree_classifiers.values():
    assert type(pipe) is pipeline.Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [25]:
"""
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    # CODE HERE
)
"""
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

"""
for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    
    # GET PREDICTIONS USING x_val
    pred = # CODE HERE

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
"""

### BEGIN SOLUTION
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, train_size=0.3, stratify=y, random_state=0)
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})
for model_name, model in tree_classifiers.items():

    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_test)
    
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_test, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_test, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)




results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Learning rate set to 0.048423
0:	learn: 0.6795697	total: 63.5ms	remaining: 6.28s
1:	learn: 0.6666736	total: 63.9ms	remaining: 3.13s
2:	learn: 0.6550832	total: 64ms	remaining: 2.07s
3:	learn: 0.6435277	total: 64.3ms	remaining: 1.54s
4:	learn: 0.6344353	total: 64.6ms	remaining: 1.23s
5:	learn: 0.6229371	total: 64.8ms	remaining: 1.01s
6:	learn: 0.6108504	total: 65.1ms	remaining: 865ms
7:	learn: 0.6010858	total: 65.4ms	remaining: 752ms
8:	learn: 0.5911963	total: 65.6ms	remaining: 664ms
9:	learn: 0.5810550	total: 65.9ms	remaining: 593ms
10:	learn: 0.5717799	total: 66.2ms	remaining: 535ms
11:	learn: 0.5646933	total: 66.7ms	remaining: 489ms
12:	learn: 0.5561787	total: 67.3ms	remaining: 450ms
13:	learn: 0.5509177	total: 67.5ms	remaining: 415ms
14:	learn: 0.5424444	total: 67.8ms	remaining: 384ms
15:	learn: 0.5360288	total: 68.2ms	remaining: 358ms
16:	learn: 0.5294343	total: 68.5ms	remaining: 334ms
17:	learn: 0.5233169	total: 68.9ms	remaining: 314ms
18:	learn: 0.5179194	total: 69.2ms	remaining: 

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,CatBoost,82.051282,78.385417,0.173446
2,Skl HistGBM,81.730769,79.21875,1.475877
3,Skl GBM,81.089744,77.916667,0.040242
4,Random Forest,80.769231,78.125,0.075697
5,LightGBM,80.608974,77.994792,0.284893
6,XGBoost,79.647436,77.447917,0.260892
7,Extra Trees,78.846154,76.640625,0.06116
8,Decision Tree,78.205128,76.822917,0.012327
9,AdaBoost,77.724359,75.104167,0.0679


In [26]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [29]:
"""
skf = model_selection.StratifiedKFold(
    # CODE HERE
)
"""
skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)



results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

"""
for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = # CODE HERE

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
"""
for model_name, model in tree_classifiers.items():

    start_time = time.time()
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)
    total_time = time.time() - start_time
            
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)



results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')







































Learning rate set to 0.077408
0:	learn: 0.6700511	total: 249us	remaining: 24.7ms
1:	learn: 0.6455330	total: 1.03ms	remaining: 50.4ms
2:	learn: 0.6280579	total: 1.42ms	remaining: 45.9ms
3:	learn: 0.6057286	total: 2.42ms	remaining: 58.1ms
4:	learn: 0.5893753	total: 2.94ms	remaining: 55.9ms
5:	learn: 0.5711346	total: 3.34ms	remaining: 52.3ms
6:	learn: 0.5559002	total: 3.84ms	remaining: 51ms
7:	learn: 0.5416820	total: 4.49ms	remaining: 51.6ms
8:	learn: 0.5288831	total: 4.94ms	remaining: 50ms
9:	learn: 0.5181395	total: 5.37ms	remaining: 48.4ms
10:	learn: 0.5084239	total: 5.98ms	remaining: 48.4ms
11:	learn: 0.4978915	total: 6.41ms	remaining: 47ms
12:	learn: 0.4889679	total: 6.85ms	remaining: 45.8ms
13:	learn: 0.4839974	total: 7.26ms	remaining: 44.6ms
14:	learn: 0.4763755	total: 7.73ms	remaining: 43.8ms
15:	learn: 0.4693834	total: 8.15ms	remaining: 42.8ms
16:	learn: 0.4625048	total: 8.6ms	remaining: 42ms
17:	learn: 0.4575405	total: 9.04ms	remaining: 41.2ms
18:	learn: 0.4530417	total: 9.46ms	r

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,83.277217,81.027706,0.64003
2,CatBoost,82.716049,80.241588,0.930662
3,Skl HistGBM,82.491582,80.831176,37.452023
4,LightGBM,82.491582,80.8863,10.093571
5,AdaBoost,81.930415,80.927044,0.885057
6,XGBoost,81.930415,80.430927,3.423682
7,Random Forest,81.369248,79.920429,1.038946
8,Extra Trees,81.032548,79.592081,0.874821
9,Decision Tree,78.900112,77.641166,0.097799


In [30]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 7.1
Train with all data the best model

In [31]:
# best_model = tree_classifiers["SELECT MY BEST MODEL HERE"]

# Fit best model with all data

best_model = tree_classifiers[results_ord.iloc[0].Model]
best_model.fit(x,y)

# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [34]:
# test_pred = # Get the predictions for x_test
test_pred = best_model.predict(x_test)


In [35]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

AssertionError: 

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [36]:
"""sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()"""

sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
207,0
438,1
313,1
868,0
683,0


In [37]:
'''sub.to_csv("sub.csv")'''

'sub.to_csv("sub.csv")'

In [38]:
!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"

zsh:1: command not found: kaggle
