# Titanic - Machine Learning from Disaster

In [None]:
# Core libraries used throughout the notebook (data handling and plotting).
# They are imported here, in the first code cell, so that a fresh kernel can
# run the notebook top to bottom.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the train/test CSV files, using PassengerId as the row index.
train_df = pd.read_csv("./data/train.csv", index_col="PassengerId")
test_df = pd.read_csv("./data/test.csv", index_col="PassengerId")

In [None]:
train_df.columns

In [None]:
test_df.columns

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
# Columns that hold categorical information and are cast to the pandas
# `category` dtype. The train list previously named "Embarked" twice; the
# duplicate only repeated the same cast, so dropping it changes nothing.
# NOTE(review): the duplicate may have been meant to be "Survived" — confirm
# before adding it, since later cells use Survived as a numeric target.
train_df_features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
test_df_features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
def convert_cat(df, features):
    """Cast every column named in ``features`` to the ``category`` dtype.

    The frame is modified in place and nothing is returned. A column listed
    more than once is simply cast again, which leaves it unchanged.
    """
    for column_name in features:
        as_category = df[column_name].astype("category")
        df[column_name] = as_category
convert_cat(train_df, train_df_features)
convert_cat(test_df, test_df_features)

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.describe(include=["category"])

## 3. Exploratory Data Analysis (EDA)
### 3.1. Correlating categorical features
- Categorical features: `Survived`, `Pclass` (ordinal), `Sex`, `SibSp`, `Parch`, `Embarked`
  

Target Variable: Survived

In [None]:
train_df["Survived"].value_counts().to_frame()

In [None]:
train_df["Survived"].value_counts(normalize=True).to_frame()

In [None]:
sns.countplot(data=train_df, x='Sex', hue='Survived', palette='Blues');

In [None]:
# One count plot per categorical feature, split by the Survived flag.
cols = ['Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch']

n_rows = 2
n_cols = 3

# squeeze=False keeps `ax` a 2-D array whatever the grid size, so the loop
# below also works for a single row or a single column of panels.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.5, n_rows*3.5), squeeze=False)

# ax.flat walks the grid row by row, so panel i is paired with cols[i].
for i, ax_i in enumerate(ax.flat):
    if i >= len(cols):
        # Hide every unused panel (not just the last one), so the figure stays
        # clean if `cols` or the grid size changes.
        ax_i.set_visible(False)
        continue
    sns.countplot(data=train_df, x=cols[i], hue="Survived", palette="Blues", ax=ax_i)
    ax_i.set_title(f"Figure {i+1}: Survival Rate vs {cols[i]}")
    ax_i.legend(title='', loc='upper right', labels=['Not Survival', 'Survival'])

plt.tight_layout()
plt.show()

## Observation:
- Survival rate
    - Fig 1: Females had a higher survival rate than males.
    - Fig 2: Most passengers embarked at Southampton, which also accounts for the largest number of passengers who did not survive.
    - Fig 3: 1st-class passengers had a higher survival rate.
    - Fig 4: Passengers travelling with 0 SibSp mostly did not survive. Passengers with 1-2 family members had a better chance of survival.
    - Fig 5: Passengers travelling with 0 Parch mostly did not survive.
# 3.2. EDA for Numerical Features
- Numerical Features: (continuous) `Age`, `Fare`

## Age

In [None]:
sns.histplot(data=train_df, x='Age', hue='Survived', bins = 40, kde=True);

- The majority of passengers were between 18 and 40 years old.
- Children had a better chance of survival than other age groups.

## Fare

In [None]:
train_df["Fare"].describe()

In [None]:
sns.histplot(data=train_df, x='Fare', hue = 'Survived', bins=40, palette="Blues");

In [None]:
# Split Fare into four equal-sized quantile bins (0-25%, 25-50%, 50-75%,
# 75-100%) and give each bin a readable label, from cheapest to most expensive.

fare_categories = ['Economic', 'Standard', 'Expensive', 'Luxury']
quartile_data = pd.qcut(train_df['Fare'], 4 , labels=fare_categories)
# Count passengers per fare bin, split by the Survived flag.
sns.countplot(x=quartile_data, hue=train_df['Survived'], palette='Blues');

In [None]:
train_df['Fare']

- Distribution of Fare
    - Fare does not follow a normal distribution and has a huge spike in the price range `(0-100$]`.
    - The distribution is skewed to the right (a long tail of expensive tickets), with 75% of fares under \$31 and a maximum fare of \$512.
- Quartile plot:
    - Passengers with Luxury & Expensive fares had a better chance of survival.
 
## Feature Engineering & Data Wrangling
### Name
- Regular Expression

In [None]:
train_df["Name"].head(10)

In [None]:
import re

# Captures the text between the first comma and the next period, e.g. the
# " Mr" in "Braund, Mr. Owen Harris". Compiled once instead of on every call.
_TITLE_PATTERN = re.compile(r",([\w\s]+)\.")


def extract_title(name, default=None):
    """Return the honorific embedded in a passenger name.

    Parameters
    ----------
    name : str
        Name in the form "Surname, Title. Given names".
    default : object, optional
        Value returned when ``name`` contains no ", <title>." segment
        (previously this case raised ``AttributeError``).

    Returns
    -------
    str or ``default``
        The title with surrounding whitespace removed, e.g. "Mr".
    """
    match = _TITLE_PATTERN.search(name)
    if match is None:
        return default
    # group(1) is the captured title; the old `.groups(1)[0]` only worked
    # because the argument of groups() is a default for unmatched groups.
    return match.group(1).strip()

train_df['Title']=train_df['Name'].apply(lambda name: extract_title(name))

In [None]:
train_df['Title'].value_counts()

In [None]:
test_df['Title']=test_df['Name'].apply(lambda name: extract_title(name))

In [None]:
test_df['Title'].value_counts()

In [None]:
def group_title(title):
    """Collapse a raw title into one of five groups.

    'Mr', 'Mrs', 'Miss' and 'Master' are kept, 'Ms' is merged into 'Miss',
    and every other value becomes 'Others'.
    """
    title_groups = {
        'Mr': 'Mr',
        'Mrs': 'Mrs',
        'Miss': 'Miss',
        'Master': 'Master',
        'Ms': 'Miss',
    }
    return title_groups.get(title, 'Others')

train_df['Title'] = train_df['Title'].apply(lambda title: group_title(title))
test_df['Title'] = test_df['Title'].apply(lambda title: group_title(title))
        

In [None]:
sns.countplot(data=train_df, x='Title', hue='Survived')

### Family
- `SibSp`, `Parch`

In [None]:
train_df['Family_Size'] = train_df['SibSp'].astype('int') + train_df['Parch'].astype('int') +1

In [None]:
test_df['Family_Size'] = test_df['SibSp'].astype('int') + test_df['Parch'].astype('int') +1

In [None]:
train_df['Family_Cat'] = pd.cut(train_df['Family_Size'], bins=[0,1,4,6,20], labels=['Solo', 'Small', 'Medium', 'Large'])
test_df['Family_Cat'] = pd.cut(test_df['Family_Size'], bins=[0,1,4,6,20], labels=['Solo', 'Small', 'Medium', 'Large'])

In [None]:
sns.countplot(data=train_df, x='Family_Cat', hue='Survived');

## 4.2 Data Wrangling

In [None]:
num_features = ['Age', 'Fare']
cat_features = ['Sex', 'Pclass', 'Embarked', 'Title', 'Family_Cat']
feature_cols = num_features + cat_features
print(feature_cols)

In [None]:
def display_missing(df, feature_cols):
    """Print the share of missing values for each listed column that has any.

    Columns without missing values produce no output. Nothing is returned.
    """
    total_rows = len(df)
    for name in feature_cols:
        n_missing = df[name].isnull().sum()
        if n_missing <= 0:
            continue
        pct_missing = n_missing*100/total_rows
        print(f"{name} has {pct_missing:.2f}% missing values.")

display_missing(train_df, feature_cols)
display_missing(test_df, feature_cols)

## Filling missing values
#### Age
- Filling missing values with the median `Age` of each (`Sex`, `Pclass`) group

In [None]:
age_by_sex_pclass = train_df.groupby(['Sex', 'Pclass'], observed=True)['Age'].median()

In [None]:
age_by_sex_pclass

In [None]:
# Filling the missing values in Age
# Both splits are imputed with the (Sex, Pclass) medians computed from the
# training data (age_by_sex_pclass), so no statistic is taken from the test
# set. For train_df this gives the same values as a per-group median fill.
age_median_lookup = age_by_sex_pclass.to_dict()  # {(Sex, Pclass): median Age}


def fill_age_by_group(df):
    """Return df['Age'] with gaps filled by the training median of each row's (Sex, Pclass) group.

    Rows whose group has no training median stay missing.
    """
    group_median = pd.Series(
        [age_median_lookup.get(key) for key in zip(df['Sex'], df['Pclass'])],
        index=df.index,
        dtype='float64',
    )
    return df['Age'].fillna(group_median)


train_df['Age'] = fill_age_by_group(train_df)
test_df['Age'] = fill_age_by_group(test_df)

In [None]:
display_missing(train_df, feature_cols)
display_missing(test_df, feature_cols)

In [None]:
x = train_df[feature_cols]
y = train_df['Survived']

In [None]:
x_test = test_df[feature_cols]

In [None]:
# preprocess pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: fill any remaining gaps with the column median, then
# standardise each column (zero mean, unit variance).
num_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes a category that was not seen during
# fit as all zeros instead of raising an error.
cat_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [None]:
# Fit on the raw training columns taken straight from train_df, so this cell
# can be re-run even after `x` has been replaced by its transformed array.
preprocessor.fit(train_df[feature_cols])

In [None]:
# Transform from train_df rather than from `x` itself: `x = transform(x)` only
# works once, because the second run would receive the already-encoded array.
x = preprocessor.transform(train_df[feature_cols])

In [None]:
# Transform from test_df rather than from `x_test` itself so the cell stays
# re-runnable (the encoded array no longer has the named columns).
x_test = preprocessor.transform(test_df[feature_cols])

## 5. Model training

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# A fixed random_state makes the hold-out split, and every score computed from
# it, reproducible. stratify=y keeps the survived / not-survived ratio the
# same in the training and validation parts.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=2022, stratify=y)

In [None]:
x_train.shape, x_val.shape

In [None]:
x_test.shape

In [None]:
# binary classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

In [None]:
log_reg = LogisticRegression(solver='liblinear', max_iter=1000)
log_reg.fit(x_train, y_train)

In [None]:
log_reg.score(x_val, y_val)

In [None]:
y_pred = log_reg.predict(x_val)

In [None]:
precision_score(y_val, y_pred), recall_score(y_val, y_pred)

In [None]:
print(classification_report(y_val, y_pred))

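- Polynomial features add powers and interaction terms to a linear model, e.g. for degree 2: $y = a x_1 + b x_2 + \text{bias}$ → $y = a x_1^2 + b x_2^2 + c\,x_1 x_2 + \text{bias}$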

In [None]:
# Expand the features with all polynomial and interaction terms up to degree 5.
# NOTE(review): the number of generated columns grows combinatorially with the
# degree, and the formula in the note above this cell illustrates degree 2 —
# confirm that degree=5 is intended.
poly = PolynomialFeatures(degree=5)
# Fit the expansion on the training part only, then reuse it for validation.
poly_features_x_train = poly.fit_transform(x_train)
poly_features_x_val = poly.transform(x_val)
                                    

In [None]:
poly_log_reg = LogisticRegression(solver='liblinear', max_iter=1000)
poly_log_reg.fit(poly_features_x_train, y_train)

In [None]:
poly_log_reg.score(poly_features_x_val, y_val)

In [None]:
# decision tree
decision_tree = DecisionTreeClassifier(criterion= 'entropy', max_depth=8, random_state=2022)
decision_tree.fit(x_train, y_train)

In [None]:
decision_tree.score(x_val, y_val)

## 5.1 Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Fresh, unfitted estimators for cross-validation.
log_reg_cv = LogisticRegression(solver='liblinear', max_iter=1000)
# NOTE(review): dt_cv is created here but not scored in this cell.
dt_cv = DecisionTreeClassifier(criterion= 'entropy', max_depth=8, random_state=2022)

# 5-fold cross-validated accuracy of logistic regression on the full training set.
lr_scores = cross_val_score(log_reg_cv, x, y, scoring='accuracy', cv=5)

In [None]:
lr_scores.mean(), lr_scores.std()

## 5.2 Baseline Model Comparison

In [None]:
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
# One seed for every estimator that accepts random_state, so the baseline
# comparison gives the same numbers on every run.
seed = 2023
models = [
    LinearSVC(max_iter=12000, random_state=seed),
    SVC(random_state=seed),
    KNeighborsClassifier(metric='minkowski', p=2),  # no random_state parameter
    LogisticRegression(solver='liblinear', max_iter=1000, random_state=seed),
    DecisionTreeClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    ExtraTreesClassifier(random_state=seed),
    AdaBoostClassifier(algorithm='SAMME', random_state=seed),
    XGBClassifier( eval_metric='logloss', random_state=seed)
]
    

In [None]:
from sklearn.model_selection import StratifiedKFold
def generate_baseline_results(models, x, y, metrics, cv =5, plot_results=False):
    """Cross-validate every model on the same folds and collect per-fold scores.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted estimators to compare.
    x, y : array-like
        Feature matrix and target passed to ``cross_val_score``.
    metrics : str or callable
        Forwarded to ``cross_val_score`` as ``scoring``.
    cv : int, default 5
        Number of stratified folds.
    plot_results : bool, default False
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    pandas.DataFrame
        One row per (model, fold) with columns ``model_name``, ``fold_id`` and
        the score column. The score column is ``"<metrics>_score"`` when
        ``metrics`` is a string (``"accuracy_score"`` for ``"accuracy"``) and
        ``"score"`` otherwise.

    Notes
    -----
    The fold shuffling uses the notebook-level ``seed`` variable.
    """
    # The same shuffled, stratified folds are used for every model, so the
    # fold-wise scores are directly comparable.
    kfold = StratifiedKFold(cv, shuffle=True, random_state=seed)
    # Name the score column after the metric instead of always "accuracy_score".
    score_col = f"{metrics}_score" if isinstance(metrics, str) else 'score'

    entries = []
    for model in models:
        model_name = type(model).__name__
        scores = cross_val_score(model, x, y, scoring=metrics, cv=kfold)
        entries.extend((model_name, fold_idx, score) for fold_idx, score in enumerate(scores))

    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_id', score_col])
    return cv_df

generate_baseline_results(models, x, y, metrics='accuracy', cv = 5, plot_results=False)