In [39]:
# Import Library
import pandas as pd
import numpy as np

# Import Feature Engineering/Preprocessing Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Import models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Import cross val and tuning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# Import warnings and install a blanket 'ignore' filter.
# NOTE(review): no category is given, so every warning is silenced, not just a specific one.
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Location of the Titanic CSV on GitHub (split across two literals for readability)
DATA_URL = (
    'https://raw.githubusercontent.com/FTDS-learning-materials/phase-1/master/w1/'
    'P1W1D3AM%20-%20Feature%20Engineering%20-%20Part%201%20-%20Titanic.csv'
)

# Read the raw data once and keep it untouched in df_ori
df_ori = pd.read_csv(DATA_URL)

# All later steps operate on this working copy
df = df_ori.copy()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Check dataset 1: print column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Dari hasil `df.info()` terlihat ada missing value pada kolom `Age`, `Cabin`, dan `Embarked`. Mari lihat jumlah dan persentasenya untuk tiap kolom.

In [10]:
# Missing values per column: absolute count first, then percentage of rows
null_flags = df.isnull()
print(null_flags.sum())
print((null_flags.mean() * 100).round(2))

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64


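Karena kebutuhan kita sekarang hanya untuk belajar, seluruh baris yang memiliki missing value di-drop saja. Catatan: kolom `Cabin` kosong pada 77,10% baris, sehingga langkah ini membuang sebagian besar data.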

In [11]:
# Remove every row that contains at least one missing value
df = df.dropna()

In [12]:
# Verify the drop: both the counts and the percentages should now be zero
remaining_nulls = df.isnull()
print(remaining_nulls.sum())
print((remaining_nulls.mean() * 100).round(2))

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64


In [13]:
# Check data duplicate: count rows flagged as duplicates of an earlier row
df.duplicated().sum()

0

In [14]:
# Display the column labels of the working frame
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [15]:
# Separate the target (Survived) from the feature columns
y = df['Survived']
X = df.drop(columns=['Survived'])

In [31]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=70,
)
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
717,718,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S
263,264,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S
710,711,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24.0,0,0,PC 17482,49.5042,C90,C
97,98,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C
310,311,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C


Saya berasumsi kolom `PassengerId`, `Name`, dan `Ticket` memiliki korelasi yang rendah dengan target (`Survived`), sehingga ketiga kolom tersebut tidak dipakai sebagai fitur.

In [32]:
# Display the column labels of df before choosing which features to drop
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [33]:
# Feature selection: remove the same columns from both splits.
# The list is defined once so train and test cannot drift apart.
dropped_features = ['PassengerId', 'Name', 'Ticket']
X_train = X_train.drop(columns=dropped_features)
X_test = X_test.drop(columns=dropped_features)
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'], dtype='object')

In [35]:
# Partition the training features by dtype: object columns are treated as
# categorical, every other column as numerical
cat_col = X_train.select_dtypes(include=['object']).columns.tolist()
num_col = X_train.select_dtypes(exclude=['object']).columns.tolist()

print(f'numerical :{num_col}')
print(f'categorical:{cat_col}')

numerical :['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical:['Sex', 'Cabin', 'Embarked']


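Setelah berhasil memisahkan kolom categorical dan numerical, sekarang saya akan melakukan scaling pada kolom numerical menggunakan `StandardScaler()` karena beberapa model yang dibandingkan (misalnya SVC, KNN, dan Logistic Regression) sensitif terhadap perbedaan skala antar fitur. Lalu kolom categorical di-encode dengan `OneHotEncoder()` karena nilainya bersifat nominal (tidak memiliki urutan). Kedua langkah tersebut digabung dalam variabel `preprocess` menggunakan `ColumnTransformer`, sehingga tiap transformasi hanya diterapkan pada kolom yang sesuai.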

In [36]:
# Preprocessing: scale the numerical columns and one-hot encode the categorical ones
scaler = StandardScaler()
# handle_unknown='ignore' lets transform() accept categories not seen during fit;
# sparse_output=False makes the encoder return a dense array
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# (name, transformer, columns) triples consumed by the ColumnTransformer
column_steps = [
    ('numerical', scaler, num_col),
    ('categorical', encoder, cat_col),
]

preprocess = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [37]:
# Model definition using pipeline: every candidate classifier is chained after
# the same `preprocess` transformer so all models see identically prepared data.
pipe_log = make_pipeline(preprocess, LogisticRegression())
pipe_svc = make_pipeline(preprocess, SVC())
pipe_knn = make_pipeline(preprocess, KNeighborsClassifier())
pipe_nb = make_pipeline(preprocess, GaussianNB())
pipe_dt = make_pipeline(preprocess, DecisionTreeClassifier(random_state=70))
pipe_rf = make_pipeline(preprocess, RandomForestClassifier(random_state=70))
# AdaBoost is seeded with the same value as the tree and the forest above, so
# all randomized estimators in this comparison are configured consistently.
pipe_ada = make_pipeline(preprocess, AdaBoostClassifier(random_state=70))

In [38]:
# Cross-validate every pipeline on the training split with identical settings:
# 5 folds, F1 as the metric, n_jobs=-1 for parallel execution
cv_settings = dict(cv=5, scoring='f1', n_jobs=-1)

cv_log = cross_val_score(pipe_log, X_train, y_train, **cv_settings)
cv_svc = cross_val_score(pipe_svc, X_train, y_train, **cv_settings)
cv_knn = cross_val_score(pipe_knn, X_train, y_train, **cv_settings)
cv_nb = cross_val_score(pipe_nb, X_train, y_train, **cv_settings)
cv_dt = cross_val_score(pipe_dt, X_train, y_train, **cv_settings)
cv_rf = cross_val_score(pipe_rf, X_train, y_train, **cv_settings)
cv_ada = cross_val_score(pipe_ada, X_train, y_train, **cv_settings)

In [41]:
# Finding best model based on Cross_val_score (mean)
# Prints a per-model F1 summary and remembers the model with the highest mean.

# Sentinels: no model chosen yet, and any real mean (including 0.0) beats -inf.
name_model = None
cv_scores = float('-inf')

# Per-model arrays of fold scores, in the order they are reported
cv_by_name = {
    'log': cv_log,
    'svc': cv_svc,
    'knn': cv_knn,
    'nb': cv_nb,
    'dt': cv_dt,
    'rf': cv_rf,
    'ada': cv_ada,
}

for name, cv in cv_by_name.items():
    # Compute the summary statistics once and reuse them below
    cv_mean = cv.mean()
    cv_std = cv.std()

    print(name)
    print('f1-score - All - Cross Validation :', cv)
    print('f1-score - Mean - Cross Validation :', cv_mean)
    print('f1-score - Std - Cross Validation :', cv_std)
    print('f1-score - Range of test set :', (cv_mean - cv_std), '-', (cv_mean + cv_std))
    print('-'*50)

    # Strict '>' keeps the earliest model when two means tie
    if cv_mean > cv_scores:
        cv_scores = cv_mean
        name_model = name

print(f'Best Model: {name_model}')
print(f'Cross Val Mean from best model: {cv_scores}')

log
f1-score - All - Cross Validation : [0.76190476 0.87179487 0.92682927 0.75675676 0.76923077]
f1-score - Mean - Cross Validation : 0.8173032855959684
f1-score - Std - Cross Validation : 0.06929807971867405
f1-score - Range of test set : 0.7480052058772944 - 0.8866013653146425
--------------------------------------------------
svc
f1-score - All - Cross Validation : [0.79069767 0.86486486 0.8372093  0.66666667 0.76923077]
f1-score - Mean - Cross Validation : 0.7857338555012973
f1-score - Std - Cross Validation : 0.06838921710377244
f1-score - Range of test set : 0.7173446383975248 - 0.8541230726050697
--------------------------------------------------
knn
f1-score - All - Cross Validation : [0.77272727 0.85714286 0.7804878  0.73684211 0.73684211]
f1-score - Mean - Cross Validation : 0.7768084290548989
f1-score - Std - Cross Validation : 0.04399642725151764
f1-score - Range of test set : 0.7328120018033812 - 0.8208048563064165
--------------------------------------------------
nb
f1-s