In [210]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import robust_scale
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [211]:
def clean(df):
    """Return a cleaned, numerically encoded copy of a raw passenger frame.

    The input must contain the columns "Name", "Ticket", "Cabin", "Age",
    "Embarked", "Fare" and "Sex". The caller's frame is not modified.

    Steps:
      * drop the "Name", "Ticket" and "Cabin" columns;
      * fill missing "Age" and "Fare" with that column's median and missing
        "Embarked" with the placeholder category "U";
      * label-encode "Sex" and "Embarked" as integers;
      * scale "Fare" with ``robust_scale`` and "Age" with ``StandardScaler``.

    Note: the encoders and scalers are fitted on the frame passed in, so two
    separate calls (e.g. one for train, one for test) use statistics and
    category mappings derived independently from each frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from CSV.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # drop(columns=...) returns a new frame, so everything below operates on
    # a copy and re-running the cell on the same input is idempotent.
    df = df.drop(columns=["Name", "Ticket", "Cabin"])

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form is deprecated in pandas 2.x
    # and silently does nothing once Copy-on-Write is enabled.
    df["Age"] = df["Age"].fillna(df["Age"].median())
    df["Embarked"] = df["Embarked"].fillna("U")
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # One fresh encoder per column; each learns its own category -> int mapping.
    df["Sex"] = LabelEncoder().fit_transform(df["Sex"])
    df["Embarked"] = LabelEncoder().fit_transform(df["Embarked"])

    df["Fare"] = robust_scale(df["Fare"])
    # StandardScaler expects 2-D input, hence the double brackets.
    df["Age"] = StandardScaler().fit_transform(df[["Age"]])

    return df

In [212]:
# Read the train and test CSVs and pass each through clean().
# NOTE(review): clean() fits its label encoders and scalers on whichever frame
# it receives, so the two frames below are encoded/scaled independently of
# each other — confirm that is intended.
training_df = clean(pd.read_csv("data/train.csv"))
testing_df = clean(pd.read_csv("data/test.csv"))

## END CLEANING

In [213]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC


import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix


In [214]:
def printStats(y_true, y_pred):
    """Print a classification report and display the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    The heatmap tick labels are fixed to two classes ('class 0', 'class 1').
    """
    print(classification_report(y_true, y_pred))

    class_names = ['class 0', 'class 1']
    matrix = confusion_matrix(y_true, y_pred)

    # heatmap() returns the Axes it drew on; label that Axes directly.
    ax = sns.heatmap(
        matrix,
        annot=True,
        cmap='Blues',
        fmt='g',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title('Confusion matrix')
    plt.show()

In [224]:
def tuning(X, y, seed=None):
    """Tune a RandomForestClassifier with Bayesian optimization and fit it.

    The optimizer maximizes mean 5-fold cross-validated accuracy over
    ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``max_features``. The best point found is converted to valid
    RandomForest arguments with the same conversion used while scoring, and
    a classifier with those arguments is fitted on all of ``X``/``y``.

    Parameters
    ----------
    X : array-like or DataFrame
        Training features.
    y : array-like
        Training labels.
    seed : int, optional
        ``random_state`` for every forest built here. When omitted a random
        seed is drawn (the historical behavior), so results differ between
        runs; pass a value for reproducible tuning.

    Returns
    -------
    RandomForestClassifier
        The classifier fitted on ``X``/``y`` with the best parameters found.
    """
    print("HYPERPARAMETERS TUNING ONGOING...")
    SEED = np.random.randint(1, 999999) if seed is None else seed

    # Kept local so the notebook's other cells do not require bayes_opt.
    from sklearn.model_selection import cross_val_score
    from bayes_opt import BayesianOptimization

    def _to_rf_params(n_estimators, max_depth, min_samples_split, max_features):
        """Map the optimizer's continuous sample onto RandomForest keyword arguments."""
        return {
            "n_estimators": int(n_estimators),
            "max_depth": int(max_depth),
            "min_samples_split": int(min_samples_split) + 1,
            "max_features": max_features,
        }

    # Objective function to be maximized.
    def rf_cv(n_estimators, max_depth, min_samples_split, max_features):
        clf = RandomForestClassifier(
            **_to_rf_params(n_estimators, max_depth, min_samples_split, max_features),
            random_state=SEED,
        )
        scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
        return scores.mean()

    # Search ranges for each hyperparameter.
    pbounds = {'n_estimators': (10, 500),
            'max_depth': (2, 20),
            'min_samples_split': (2, 12),
            'max_features': (0.1, 0.999)}

    optimizer = BayesianOptimization(f=rf_cv, pbounds=pbounds, random_state=42)

    # 5 random initial points followed by 5 guided steps: 10 evaluations total.
    optimizer.maximize(init_points=5, n_iter=5)

    # Apply the same conversion the objective used, so the final model is built
    # from exactly the hyperparameters that were scored. (Previously
    # min_samples_split was overwritten with max_depth here.)
    best_params = dict(optimizer.max['params'])
    best_params.update(_to_rf_params(**best_params))

    print(best_params)

    clf = RandomForestClassifier(**best_params,
                                random_state=SEED)
    clf.fit(X, y)

    return clf




In [225]:
def train(df, test_size=0.3, tuned=False):
    """Fit a random forest on ``df`` and optionally report hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing a "Survived" target column; every other
        column is used as a feature.
    test_size : float, default 0.3
        Fraction of rows held out for evaluation. A value strictly between
        0.01 and 1 produces a train/test split followed by ``printStats`` on
        the held-out rows. Any other value (for example 0) means "no
        hold-out": the model is trained on every row and nothing is printed.
    tuned : bool, default False
        When true the model comes from ``tuning``; otherwise a default
        ``RandomForestClassifier(random_state=30)`` is used.

    Returns
    -------
    The fitted classifier.
    """
    # Previously an unusable test_size was replaced by 0.01, which still held
    # out 1% of the rows without ever training on or evaluating them. Treat it
    # as "no hold-out" instead so a final fit really uses all the data.
    holdout = test_size if 0.01 < test_size < 1 else 0
    print(f"TRAINING WITH {holdout*100}% TEST SIZE...")

    X = df.drop("Survived", axis=1)
    y = df["Survived"]

    if holdout:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=holdout)
    else:
        X_train, y_train = X, y

    if tuned:
        # tuning() already returns a classifier fitted on the data it is given,
        # so it is not fitted a second time here.
        model = tuning(X_train, y_train)
    else:
        model = RandomForestClassifier(random_state=30)
        model.fit(X_train, y_train)

    if holdout:
        y_pred = model.predict(X_test)
        printStats(y_test, y_pred)

    return model

In [233]:
# Final fit: test_size=0 asks for no evaluation report, and tuned=True takes
# the model from tuning() instead of the default random forest.
model = train(training_df, test_size=0, tuned=True)

TRAINING WITH 0% TEST SIZE...
HYPERPARAMETERS TUNING ONGOING...
|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.8232   [0m | [0m8.742    [0m | [0m0.9547   [0m | [0m9.32     [0m | [0m303.3    [0m |
| [0m2        [0m | [0m0.8117   [0m | [0m4.808    [0m | [0m0.2402   [0m | [0m2.581    [0m | [0m434.4    [0m |
| [95m3        [0m | [95m0.8333   [0m | [95m12.82    [0m | [95m0.7366   [0m | [95m2.206    [0m | [95m485.3    [0m |
| [95m4        [0m | [95m0.8379   [0m | [95m16.98    [0m | [95m0.2909   [0m | [95m3.818    [0m | [95m99.87    [0m |
| [0m5        [0m | [0m0.8277   [0m | [0m7.476    [0m | [0m0.5718   [0m | [0m6.319    [0m | [0m152.7    [0m |
| [0m6        [0m | [0m0.8265   [0m | [0m15.69    [0m | [0m0.7081   [0m | [0m5.01     [0m | [0m100.2    [0m |
| [0m7        [0m | [0m0.7596   [0m | [

In [234]:
# Predict on the cleaned test frame. train() uses every column except
# "Survived" as a feature (clean() does not drop "PassengerId"), so this
# assumes test.csv carries the same remaining columns — TODO confirm.
output = model.predict(testing_df)

In [235]:
# Pair each test passenger id with its predicted label, then write the
# two-column result to output.csv without the index.
submission_df = testing_df[["PassengerId"]].assign(Survived=output)
submission_df.to_csv("output.csv", index=False)

In [68]:
def findNans(df, column="Age"):
    """Print a missing-value summary for ``df``.

    Output, in order:
      * one ``"<column name>: <count>"`` line per column of ``df``;
      * the number of missing values in ``column``;
      * the number of rows that contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str, default "Age"
        Column whose missing count is printed on its own line. It must exist
        in ``df``, otherwise a ``KeyError`` is raised.
    """
    col_missing_rows = df[column].isna().sum()
    missing_rows = df.isna().any(axis=1).sum()

    missing_values = df.isna().sum()

    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent
    # and is also available in older releases.
    for col, num_missing in missing_values.items():
        print(f"{col}: {num_missing}")

    print(col_missing_rows)
    print(missing_rows)

In [69]:
# Sanity check that no missing values remain in the cleaned test frame.
# NOTE(review): the saved output of this cell lists "CC" and "CN" columns that
# clean() as defined in this notebook does not create — presumably left over
# from an earlier run; re-run the notebook top to bottom to refresh it.
findNans(testing_df)

PassengerId: 0
Pclass: 0
Sex: 0
Age: 0
SibSp: 0
Parch: 0
Fare: 0
Embarked: 0
CC: 0
CN: 0
0
0
