In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve, learning_curve
from sklearn.ensemble import RandomForestClassifier

from my_lib.plotting import plot_probability_distributions, plot_learning_curve, plot_validation_curve

In [None]:
# Load the prepared Titanic table; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='data/titanic_newAge_withEncoded.csv')

In [None]:
# NOTE(review): disabled experiment, kept for reference only.
# When enabled, it maps every negative 'NewAge' value to `mean_age`, which is
# hard-coded to -50 here; the trailing comment on that line shows the alternative
# of using the mean of the positive 'NewAge' values instead.
# Negative ages presumably mark missing values -- TODO confirm against data prep.

# mean_age = -50 # data[(data['NewAge']>0)]['NewAge'].mean()

# def replace_age(age):
#     return mean_age if age < 0 else age

# data['NewAge'] = data['NewAge'].map(replace_age)

In [None]:
# Target vector: the 'Survived' column.
y = data['Survived']
# Feature matrix: every column except the target and the 'Unnamed: 0' /
# 'PassengerId' columns (presumably row identifiers -- TODO confirm).
X = data.drop(['Unnamed: 0', 'PassengerId', 'Survived'], axis=1)

In [None]:
# Hold out 35% of the rows as a test split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=1,
)

In [None]:
# Random forest using the entropy split criterion, trees capped at depth 4,
# and a fixed seed so repeated runs build the same forest.
clf = RandomForestClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=1,
)

In [None]:
# Train on the training split only; the test split stays untouched until scoring.
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted classifier on the held-out split; `test_score` is reused by
# the curve-plotting cells below as their `expected_score`.
test_score = clf.score(X=X_test, y=y_test)
print(f'Score: {test_score}')

In [None]:
# Project helper from my_lib.plotting, called with the fitted classifier and the
# test split. Presumably plots the predicted-probability distributions per class
# -- TODO confirm against the helper's implementation.
plot_probability_distributions(clf, X_test, y_test)

In [None]:
# Learning curve on the training split only: 5-fold cross-validation at ten
# training-set fractions evenly spaced from 10% to 100%.
# `test_scores` here are the per-fold validation scores returned by
# learning_curve, not scores on the held-out X_test.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1, num=10),
)

In [None]:
# Project helper from my_lib.plotting: receives the learning-curve scores and
# sizes computed above, plus the held-out test score as `expected_score`
# (presumably drawn as a reference level -- TODO confirm in the helper).
plot_learning_curve(train_scores, test_scores, train_sizes, expected_score=test_score, stat_error=False)

## Validation Curve

In [None]:
# Validation curve over tree depth: 5-fold cross-validation on the training
# split for max_depth = 2..9 (the value set on `clf` is overridden per run).
# NOTE: this rebinds `train_scores` / `test_scores`, which the learning-curve
# cell also assigns; re-run that cell before re-plotting the learning curve.
param_range = range(2, 10)
train_scores, test_scores = validation_curve(
    clf,
    X_train,
    y_train,
    param_name='max_depth',
    param_range=param_range,
    cv=5,
)

In [None]:
# Project helper from my_lib.plotting: receives the validation-curve scores and
# the swept max_depth values, plus the held-out test score as `expected_score`
# (presumably drawn as a reference level -- TODO confirm in the helper).
plot_validation_curve(train_scores, test_scores, param_range, expected_score=test_score, stat_error=False)