In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

In [2]:
CLOUD = False

if CLOUD:
    import os
    os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
    os.environ['KAGGLE_KEY']      = "your_kaggle_api_key"  # See https://www.kaggle.com/docs/api
    !pip install --upgrade kaggle
    !kaggle competitions download -c titanic
    DATA_PATH = "./"

else:
    DATA_PATH = "../../Datasets/Tabular/titanic/"

In [3]:
DATA_PATH = "../../Datasets/Tabular/titanic/"
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


In [4]:
get_Title_from_Name = lambda name: name.split(",")[1].strip().split(" ")[0][:-1]
get_Title_from_Name = lambda name: name.split(',')[1].split('.')[0].strip() 


df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)

# print(df_test["Name"].values[414])
# print(df_test["Title"].values[410:415])
df_test["Title"].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dr          1
Dona        1
Ms          1
Name: Title, dtype: int64

In [5]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

df["Title"] =  df["Title"].map(title_dictionary)
df_test["Title"] = df_test["Title"].map(title_dictionary)

df["Title"].isnull().sum()

0

In [10]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


In [33]:
# The cabin info can be used to figure out what deck a passenger was on,
# Despite having a lot of NaNs, we can still use it as only first class
# passengers would have had a cabin, so the missing data likely doesn't exist

df["Deck"] = df["Cabin"].fillna("?").str[:1]

# cabins = cabins.fillna("?")
# cabins.str[:1]

df["Deck"].value_counts()

?    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Deck, dtype: int64

In [34]:
# We could make another feature by adding data together, for example...
df['Family_Size']=df['SibSp']+df['Parch']

df["Family_Size"].value_counts()

0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: Family_Size, dtype: int64

In [96]:
# Alternatively we could see if people were travelling in a group instead of
# a family specifically by seeing what ticket numbers appear more than  once.

multis = df["Ticket"].value_counts() > 2
#df.sort_values(by="Ticket").head(50)
multis = multis[multis == True]
multis = list(multis.index)

tickets = df["Ticket"]
data_i_want = []
for row in df.values:
    #print(row[7])
    if row[7] in multis:
        data_i_want.append(row)
        
#pd.DataFrame(data_i_want, columns = df.columns).sort_values(by="Ticket").head(50)

In [143]:
# Copied from the other day
num_preprocessing = pipeline.Pipeline(steps=[
    # I went with median just so we don't end up with classes/families with fractional values
    ("imputer", impute.SimpleImputer(strategy="median")),
    # As we are using a tree classifier, the rule of thumb is not to do anything to the numeric values
    #("scaler", preprocessing.StandardScaler())
])

cat_preprocessing = pipeline.Pipeline(steps=[
    # By default, imputer uses 0/"missing_value" when filling things in with the constant strategy, I think that's fine for this
    ("imputer", impute.SimpleImputer(strategy="constant")),
    # Rule of thumb for categorical data when using trees is "Ordinal Encoding", where classes become numeric values.
    ("ordinal", preprocessing.OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)  )
])

# This is a nice pipeline, very concise and organised.
tree_prepro = compose.ColumnTransformer(transformers=[
    # Apply the imputer to our numeric data
    ("num", num_preprocessing, num_vars),
    # Apply the imputer and ordinal encoding to the categorical data
    ("cat", cat_preprocessing, cat_vars),
], remainder="drop") # And finally get rid of everything else


In [102]:
from lightgbm              import LGBMClassifier
set_config(display='diagram')
pipe = pipeline.make_pipeline(tree_prepro, LGBMClassifier())
pipe

In [132]:
df = df.dropna()
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes


x_train, x_val, y_train, y_val = model_selection.train_test_split(x,y, 
                                                                  train_size=0.8,
                                                                  stratify=y,
                                                                  random_state=0)



In [133]:
# Let's use the new features we made
cat_vars  = ['Sex', 'Embarked', 'Title', "Cabin"]         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', "Family_Size", 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()


In [142]:
tree_prepro
#LGBMClassifier().fit(x_train, y_train)


In [None]:
# Why it no work omg

In [156]:
DATA_PATH = "../../Datasets/Tabular/titanic/"
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)



get_Title_from_Name = lambda name: name.split(",")[1].strip().split(" ")[0][:-1]
get_Title_from_Name = lambda name: name.split(',')[1].split('.')[0].strip() 


df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)

title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

df["Title"] =  df["Title"].map(title_dictionary)
df_test["Title"] = df_test["Title"].map(title_dictionary)


df["Deck"] = df["Cabin"].fillna("?").str[:1]
df['Family_Size']=df['SibSp']+df['Parch']

print(df.columns)

x = df.drop(columns=["Survived"]) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes


cat_vars  = ['Sex', 'Embarked', 'Title', "Deck"]         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ["Family_Size", 'Fare', 'Age'] # x.

print("Pipeline after")
#print(df.isnull().sum())

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)
Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'Title', 'Deck', 'Family_Size'],
      dtype='object')
Pipeline after


In [158]:
tree_prepro
#pipe
# pipe = pipeline.make_pipeline(tree_prepro, LGBMClassifier())
# pipe

In [191]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from catboost              import CatBoostClassifier
# I just want this one
from lightgbm              import LGBMClassifier

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(random_state=0),
    # Extra trees needed a bit of tuning to get past 75% accuracy, rest where fine with default settings
  "Extra Trees": ExtraTreesClassifier(random_state=0, n_estimators=1000, bootstrap=True, max_samples=10),
  "Random Forest": RandomForestClassifier(random_state=0),
  "AdaBoost": AdaBoostClassifier(random_state=0),
  "Skl GBM": GradientBoostingClassifier(random_state=0),
  "Skl HistGBM": HistGradientBoostingClassifier(random_state=0),
  "XGBoost": XGBClassifier(random_state=0),
  "LightGBM": LGBMClassifier(random_state=0),
  "CatBoost": CatBoostClassifier(random_state=0),
}
tree_classifiers= {
     "LightGBM": LGBMClassifier(random_state=0)
}


tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

#tree_classifiers["LightGBM"]
# This is the same thing I had earlier???
one_pipe = pipeline.make_pipeline(tree_prepro, LGBMClassifier())

In [163]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(x,y, 
                                                                  train_size=0.8,
                                                                  stratify=y,
                                                                  random_state=0)


results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

In [210]:
for model_name, model in tree_classifiers.items():
    print(model_name)
    print(model)
    print("="*10)
    start_time = time.time()
    model.fit(x_train, y_train)
    pred = model.predict(x_val)
    total_time = time.time() - start_time
    results = results.append({"Model":    model_name,
                          "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                          "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                          "Time":     total_time},
                          ignore_index=True)


# results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
# results_ord.index += 1 
# # One of these models is spewing out information about it's learning rate, my guess is one of them is "verbose"?
# # This is my small-brain fix
# clear_output()
# results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



LightGBM
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['Pclass', 'Family_Size',
                                                   'Fare', 'Age']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ordinal',
                                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                  unknown_value=nan))]),
                              

In [218]:
one_pipe

In [216]:
one_pipe
one_pipe.fit(x_train, y_train)
one_pipe.score(x_val, y_val)

0.8212290502793296

In [217]:
for thing in one_pipe:
    print(thing)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['Pclass', 'Family_Size', 'Fare', 'Age']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('ordinal',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=nan))]),
                                 ['Sex', 'Embarked', 'Title', 'Cabin'])])
LGBMClassifier()


In [212]:
#tree_classifiers.items()
#tree_classifiers["LightGBM"]
for thing, other_thing in one_pipe:
    print(thing)
    print(other_thing)

TypeError: cannot unpack non-iterable ColumnTransformer object

In [211]:
for thing, other_thing in [one_pipe]:
    print(thing)
    print(other_thing)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['Pclass', 'Family_Size', 'Fare', 'Age']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('ordinal',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=nan))]),
                                 ['Sex', 'Embarked', 'Title', 'Cabin'])])
LGBMClassifier()
