<a href="https://colab.research.google.com/github/SkyChen1009/ML-project/blob/main/Titanic%20-%20ML%20Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Install package
!pip install h2o

Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=246b5e98f86289035b46db354977e55fbecb3c126eb88e8f19499c7e9523d9e2
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


In [None]:
# Mount Google Drive at /content/drive; later cells read and write CSV files
# under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data preprocessing
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# metric
from sklearn.metrics import accuracy_score

In [None]:
# Load the labelled training set, the unlabelled test set and the sample
# submission file, in that order.
train, testdata, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/test.csv',
        '/content/drive/MyDrive/Colab Notebooks/gender_submission.csv',
    )
)

# EDA

In [None]:
# Report the (rows, columns) shape of each loaded frame.
for shape_label, frame in (
    ('train_shape: ', train),
    ('test_shape: ', testdata),
    ('sub_shape: ', sub),
):
    print(shape_label, frame.shape)

train_shape:  (891, 12)
test_shape:  (418, 11)
sub_shape:  (418, 2)


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line after each summary.
train.info()
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [None]:
# Count the distinct non-null values in every training column.
for column, unique_count in train.nunique().items():
    print(f"{column}: {unique_count}")

PassengerId: 891
Survived: 2
Pclass: 3
Name: 891
Sex: 2
Age: 88
SibSp: 7
Parch: 7
Ticket: 681
Fare: 248
Cabin: 147
Embarked: 3


In [None]:
# Missing-value counts per column: training set first, then the test set.
for frame in (train, testdata):
    print(frame.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


# Data Preprocessing

In [None]:
# Fill missing values.
# Both fill statistics are computed from the training set only, so nothing is
# learned from the test set.
age_mean = train['Age'].mean()
fare_mean = train['Fare'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, which
# pandas warns about and which no longer updates the frame under Copy-on-Write.
train['Age'] = train['Age'].fillna(age_mean)
testdata['Age'] = testdata['Age'].fillna(age_mean)

# Fare has no missing values in train; only the test set needs filling.
testdata['Fare'] = testdata['Fare'].fillna(fare_mean)

In [None]:
# Impute the categorical columns with their most frequent value.
# The imputer is fitted on the training column only and then applied to both
# frames, matching how the Age/Fare fill values are derived from train alone,
# so test-set values never influence the statistic used for training.
imputer = SimpleImputer(strategy='most_frequent')
missing_columns = ['Cabin', 'Embarked']

for column in missing_columns:
  imputer.fit(train[[column]])

  # transform() returns an (n_rows, 1) array; flatten it before assigning it
  # back as a single column.
  train[column] = imputer.transform(train[[column]]).ravel()
  testdata[column] = imputer.transform(testdata[[column]]).ravel()

In [None]:
# Encode Sex as an integer flag: male -> 1, female -> 0.
sex_codes = {'male': 1, 'female': 0}
for frame in (train, testdata):
    frame['Sex'] = frame['Sex'].replace(sex_codes)

In [None]:
# Integer-encode the remaining text columns. For each column the encoder is
# refitted on the train and test values together, so every label that occurs
# in either frame has a code.
label_encoder = LabelEncoder()
object_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']

for column in object_columns:
    combined_data = pd.concat([train[column], testdata[column]], axis=0)
    label_encoder.fit(combined_data)

    for frame in (train, testdata):
        frame[column] = label_encoder.transform(frame[column])

In [None]:
# Correlation of every column with the target (all columns are numeric by now).
correlation_matrix = train.corr()
correlation_with_target = correlation_matrix.loc[:, 'Survived']
print(correlation_with_target)

PassengerId   -0.005007
Survived       1.000000
Pclass        -0.338481
Name          -0.057487
Sex           -0.543351
Age           -0.069809
SibSp         -0.035322
Parch          0.081629
Ticket        -0.166734
Fare           0.257307
Cabin          0.103213
Embarked      -0.167675
Name: Survived, dtype: float64


In [None]:
# Confirm that no missing values remain in the test set after imputation.
testdata.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [None]:
# Drop unimportant features.
features_to_drop = ['PassengerId', 'Name']

# errors='ignore' makes this cell safe to re-run: on a second run the columns
# are already gone and a plain drop would raise KeyError.
# drop() returns a new frame, so no extra .copy() is needed.
train = train.drop(columns=features_to_drop, errors='ignore')
testdata = testdata.drop(columns=features_to_drop, errors='ignore')

In [None]:
# Separate the target from the features, then hold out 20% of the labelled
# rows for evaluation (fixed seed).
y = train['Survived']
X = train.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: fit on the training split and score accuracy on the
# held-out split.
model = LogisticRegression(fit_intercept=True, max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.8100558659217877


# Ensemble Learning: Stacking

In [None]:
# Stacking
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Base learners whose cross-validated predictions feed the meta model.
# Add more (name, estimator) pairs here if needed.
base_models = [
    ('rf_model', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm_model', SVC()),
    ('xgb_model', XGBClassifier(objective='binary:logistic', learning_rate=0.5, max_depth=3)),
]
base_model = base_models  # second name bound to the same list

# Meta model that combines the base learners' predictions.
meta_model = LogisticRegression(fit_intercept=True, max_iter=1000)

# Shuffled 10-fold cross-validation with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Fit the stacked ensemble on the training split and predict the held-out split.
model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=cv,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out accuracy, displayed as the cell's output.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8212290502793296

## Test and Submission

In [None]:
# Predict the test set with the current `model` and save the submission file.
pred = model.predict(testdata)
sub = sub.assign(Survived=pred)
sub.to_csv('/content/drive/MyDrive/Colab Notebooks/sub.csv', header=True, index=False)

# H2O AutoML

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Initialise the H2O connection used by the cells below.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 0 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_unknownUser_2kd6d9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.092 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
# Convert the preprocessed pandas frames to H2OFrames.
trainframe = h2o.H2OFrame(train)
testframe = h2o.H2OFrame(testdata)

# NOTE: from here on `y` and `X` hold H2O column names; they are rebound from
# the pandas target/feature objects used by the scikit-learn cells.
y = 'Survived'

# Mark the target as categorical BEFORE splitting so that all three splits
# carry a factor target. Converting only `trainf` after the split leaves the
# other two splits with a numeric target that does not match the training one.
trainframe[y] = trainframe[y].asfactor()

# Three-way split: 60% / 20% / remainder, with a fixed seed.
trainf, testf, validf = trainframe.split_frame([0.6, 0.2], seed=42)

# Every column except the target is a predictor.
X = list(trainframe.columns)
X.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
# AutoML run capped at 20 models or 180 seconds, with a fixed seed.
automl_settings = dict(max_models=20, max_runtime_secs=180, seed=42)
aml = H2OAutoML(**automl_settings)
aml.train(training_frame=trainf, x=X, y=y)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,31.0,31.0,20054.0,6.0,6.0,6.0,27.0,62.0,46.80645

Unnamed: 0,0,1,Error,Rate
0,327.0,6.0,0.018,(6.0/333.0)
1,15.0,190.0,0.0732,(15.0/205.0)
Total,342.0,196.0,0.039,(21.0/538.0)

metric,threshold,value,idx
max f1,0.3799143,0.9476309,159.0
max f2,0.265615,0.9555985,179.0
max f0point5,0.5511183,0.9699893,146.0
max accuracy,0.3799143,0.9609665,159.0
max precision,0.9721592,1.0,0.0
max recall,0.0936529,1.0,298.0
max specificity,0.9721592,1.0,0.0
max absolute_mcc,0.3799143,0.9171316,159.0
max min_per_class_accuracy,0.2874128,0.9512195,174.0
max mean_per_class_accuracy,0.265615,0.9558998,179.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0111524,0.9669096,2.6243902,2.6243902,1.0,0.968658,1.0,0.968658,0.0292683,0.0292683,162.4390244,162.4390244,0.0292683
2,0.0204461,0.9648086,2.6243902,2.6243902,1.0,0.9661927,1.0,0.9675374,0.0243902,0.0536585,162.4390244,162.4390244,0.0536585
3,0.0315985,0.9644266,2.6243902,2.6243902,1.0,0.9646318,1.0,0.9665119,0.0292683,0.0829268,162.4390244,162.4390244,0.0829268
4,0.0408922,0.9633217,2.6243902,2.6243902,1.0,0.963785,1.0,0.9658921,0.0243902,0.1073171,162.4390244,162.4390244,0.1073171
5,0.0501859,0.9609135,2.6243902,2.6243902,1.0,0.9621625,1.0,0.9652014,0.0243902,0.1317073,162.4390244,162.4390244,0.1317073
6,0.1003717,0.9486439,2.6243902,2.6243902,1.0,0.955144,1.0,0.9601727,0.1317073,0.2634146,162.4390244,162.4390244,0.2634146
7,0.1505576,0.938561,2.6243902,2.6243902,1.0,0.9440538,1.0,0.9547997,0.1317073,0.395122,162.4390244,162.4390244,0.395122
8,0.2007435,0.8981746,2.6243902,2.6243902,1.0,0.9231341,1.0,0.9468833,0.1317073,0.5268293,162.4390244,162.4390244,0.5268293
9,0.3011152,0.7513192,2.6243902,2.6243902,1.0,0.8247764,1.0,0.906181,0.2634146,0.7902439,162.4390244,162.4390244,0.7902439
10,0.3996283,0.2659255,1.7330879,2.4046739,0.6603774,0.4852166,0.9162791,0.8024084,0.1707317,0.9609756,73.3087897,140.4673851,0.9069216

Unnamed: 0,0,1,Error,Rate
0,282.0,51.0,0.1532,(51.0/333.0)
1,43.0,162.0,0.2098,(43.0/205.0)
Total,325.0,213.0,0.1747,(94.0/538.0)

metric,threshold,value,idx
max f1,0.3534992,0.7751196,183.0
max f2,0.1705035,0.8198925,253.0
max f0point5,0.7848433,0.820029,97.0
max accuracy,0.5586479,0.8327138,139.0
max precision,0.9784272,1.0,0.0
max recall,0.0388793,1.0,395.0
max specificity,0.9784272,1.0,0.0
max absolute_mcc,0.5586479,0.6401774,139.0
max min_per_class_accuracy,0.2809488,0.8048048,198.0
max mean_per_class_accuracy,0.3534992,0.8185454,183.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0111524,0.9710671,2.6243902,2.6243902,1.0,0.974418,1.0,0.974418,0.0292683,0.0292683,162.4390244,162.4390244,0.0292683
2,0.0204461,0.9653763,2.6243902,2.6243902,1.0,0.9688929,1.0,0.9719066,0.0243902,0.0536585,162.4390244,162.4390244,0.0536585
3,0.0315985,0.959758,2.6243902,2.6243902,1.0,0.9633871,1.0,0.9688997,0.0292683,0.0829268,162.4390244,162.4390244,0.0829268
4,0.0408922,0.9548548,2.6243902,2.6243902,1.0,0.9560651,1.0,0.9659828,0.0243902,0.1073171,162.4390244,162.4390244,0.1073171
5,0.0501859,0.9489526,2.0995122,2.5271906,0.8,0.9509923,0.962963,0.9632068,0.0195122,0.1268293,109.9512195,152.7190605,0.1238263
6,0.1003717,0.9310478,2.6243902,2.5757904,1.0,0.9406585,0.9814815,0.9519327,0.1317073,0.2585366,162.4390244,157.5790425,0.2555336
7,0.1505576,0.8892931,2.3327913,2.4947907,0.8888889,0.9131719,0.9506173,0.9390124,0.1170732,0.3756098,133.2791328,149.4790726,0.3635977
8,0.2007435,0.8300426,2.2355917,2.429991,0.8518519,0.8612722,0.9259259,0.9195773,0.1121951,0.4878049,123.5591689,142.9990967,0.4637809
9,0.3011152,0.5704902,1.7981933,2.2193917,0.6851852,0.7157051,0.845679,0.8516199,0.1804878,0.6682927,79.8193315,121.9391749,0.5932176
10,0.3996283,0.3396738,1.2379199,1.9774475,0.4716981,0.4552304,0.7534884,0.7539053,0.1219512,0.7902439,23.7919926,97.7447533,0.6310847

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8438733,0.0120085,0.8518519,0.8333333,0.8425926,0.8317757,0.8598131
auc,0.8716613,0.04406,0.8874776,0.8269021,0.9377503,0.8380231,0.8681534
err,0.1561267,0.0120085,0.1481482,0.1666667,0.1574074,0.1682243,0.1401869
err_count,16.8,1.3038405,16.0,18.0,17.0,18.0,15.0
f0point5,0.7980603,0.0176153,0.8071749,0.801105,0.7676349,0.8018868,0.8125
f1,0.7922687,0.0235572,0.8181818,0.7631579,0.8131868,0.7906977,0.7761194
f2,0.7890592,0.0574331,0.8294931,0.7286432,0.864486,0.7798165,0.7428572
lift_top_group,2.3936105,0.6803336,2.511628,2.6341465,2.6341465,1.2159091,2.9722223
logloss,0.4203849,0.0754569,0.3840818,0.4848482,0.3244429,0.5081251,0.4004266
max_per_class_error,0.2309108,0.0548318,0.1627907,0.2926829,0.1940299,0.2272727,0.2777778

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2023-11-02 08:16:21,0.885 sec,0.0,0.4856426,0.6645714,0.5,0.3810409,1.0,0.6189591
,2023-11-02 08:16:21,0.905 sec,5.0,0.3820904,0.4738651,0.9539735,0.9504356,2.6243902,0.0762082
,2023-11-02 08:16:21,0.924 sec,10.0,0.3180599,0.3679931,0.9591738,0.9548346,2.6243902,0.063197
,2023-11-02 08:16:21,0.942 sec,15.0,0.280556,0.3045402,0.962045,0.9607397,2.6243902,0.0650558
,2023-11-02 08:16:21,0.961 sec,20.0,0.2515848,0.2560586,0.9722991,0.9700749,2.6243902,0.0576208
,2023-11-02 08:16:21,0.983 sec,25.0,0.2305055,0.2197298,0.9812129,0.9785853,2.6243902,0.0483271
,2023-11-02 08:16:21,1.002 sec,30.0,0.2086636,0.186007,0.9903171,0.9875352,2.6243902,0.0427509
,2023-11-02 08:16:21,1.012 sec,31.0,0.2042195,0.1794724,0.9912327,0.9886298,2.6243902,0.0390335

variable,relative_importance,scaled_importance,percentage
Sex,161.1938171,1.0,0.297401
Ticket,91.7126846,0.5689591,0.169209
Age,78.0731125,0.4843431,0.1440441
Fare,77.5553284,0.4811309,0.1430888
Cabin,39.911953,0.2476023,0.0736372
Pclass,39.5088654,0.2451016,0.0728935
SibSp,30.3301811,0.1881597,0.0559589
Parch,16.7653542,0.1040074,0.0309319
Embarked,6.9569626,0.043159,0.0128355


In [None]:
# Show the AutoML leader's variable importances as a pandas table.
best_model = aml.leader

varimp = best_model.varimp()
varimp_columns = ["Feature", "Relative Importance", "Scaled Importance", "Percentage"]
varimp_df = pd.DataFrame(data=varimp, columns=varimp_columns)
varimp_df

Unnamed: 0,Feature,Relative Importance,Scaled Importance,Percentage
0,Sex,117.131447,1.0,0.277895
1,Age,67.005905,0.572057,0.158972
2,Ticket,57.92738,0.49455,0.137433
3,Pclass,49.09816,0.419171,0.116486
4,Fare,48.412014,0.413314,0.114858
5,Cabin,34.071083,0.290879,0.080834
6,SibSp,20.81465,0.177703,0.049383
7,Parch,15.681441,0.133879,0.037204
8,Embarked,11.353739,0.096932,0.026937


## Test and Submission

In [None]:
# Score the test frame with the AutoML leader and pull the result into pandas.
pred = aml.leader.predict(testframe)
predd = pred.as_data_frame()

# Write the leader's predicted class into the submission before saving.
# Without this assignment `sub` is saved unchanged, so sub2.csv would not
# contain the AutoML predictions at all.
sub['Survived'] = predd['predict'].astype(int).to_numpy()
sub.to_csv('/content/drive/MyDrive/Colab Notebooks/sub2.csv', index=False, header=True)

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
