## Полный эксперимент с заполнением возраста из обученной модели DecissionTreeRegressions

Цель сравнить точность предсказания с возрастом заполненным median'ым значением и предсказанным значением

### Заполнение возраста где он отсутствует

In [22]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

In [23]:
titanic_data = pd.read_csv("./train.csv")
# titanic_data = titanic_data.set_index('PassengerId')

# Change type of values for Sex column
titanic_data['Sex'] = np.where(titanic_data['Sex'] == 'female', 0, 1)

# Remove rows with NaN in age
rows_witn_unknown_age = titanic_data[np.isnan(titanic_data.Age)]
rows_witn_known_age = titanic_data[np.isnan(titanic_data.Age) == False]

### Обучение дерева_регресии для предсказания возраста

In [24]:
# Split of data
X = rows_witn_known_age.get(['Survived', 'Sex', 'Pclass'])
Y = rows_witn_known_age.Age

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Learn ML model (DecisionTreeRegressor)
regressor_clf = DecisionTreeRegressor()
regressor_clf.fit(X_train, Y_train)

# Predcit age for missing rows
predicted_ages = regressor_clf.predict(rows_witn_unknown_age.get(['Survived', 'Sex', 'Pclass']))
rows_witn_unknown_age.loc[:,'Age'] = predicted_ages

# Create a titanic DataFrame with all data
titanic_data_with_age = rows_witn_known_age.merge(rows_witn_unknown_age, how='outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


### Usage of DecisionTreeClassifier with predicted value in age column

In [32]:
X = titanic_data_with_age.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data_with_age.Survived

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision: {precision_score(Y_test, predicted)}')
print(f'Recall равен {recall_score(Y_test, predicted)}')
print(f'F1-мера {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision: 0.8247422680412371
Recall равен 0.6896551724137931
F1-мера 0.7511737089201878
Accuracy 0.8203389830508474


### Usage of DecisionTreeClassifier with median value in age column

In [27]:
titanic_data = pd.read_csv("./train.csv")
titanic_data['Sex'] = np.where(titanic_data['Sex'] == 'female', 0, 1)

X = titanic_data.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data.Survived

X = X.fillna({ 'Age': X.Age.median() })

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(DecisionTreeClassifier(), {'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision: {precision_score(Y_test, predicted)}')
print(f'Recall равен {recall_score(Y_test, predicted)}')
print(f'F1-мера {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision: 0.8840579710144928
Recall равен 0.5083333333333333
F1-мера 0.6455026455026456
Accuracy 0.7728813559322034


### Usage of RandomForest with median value in age column

In [29]:
titanic_data = pd.read_csv("./train.csv")
titanic_data['Sex'] = np.where(titanic_data['Sex'] == 'female', 0, 1)

X = titanic_data.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data.Survived

X = X.fillna({ 'Age': X.Age.median() })

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(RandomForestClassifier(), {'n_estimators': [10,20,30], 'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision: {precision_score(Y_test, predicted)}')
print(f'Recall равен {recall_score(Y_test, predicted)}')
print(f'F1-мера {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision: 0.7387387387387387
Recall равен 0.6833333333333333
F1-мера 0.70995670995671
Accuracy 0.7728813559322034


### Usage of RandomForestClassifier with predicted value in age column

In [31]:
X = titanic_data_with_age.get(['Pclass', 'Sex', 'Age'])
Y = titanic_data_with_age.Survived

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

grid_search_clf = GridSearchCV(RandomForestClassifier(), {'n_estimators': [10,20,30], 'max_depth': [5, 7, 10, 13, 15]})
grid_search_clf.fit(X_train, Y_train)

predicted = grid_search_clf.best_estimator_.predict(X_test)

print(f'Precision: {precision_score(Y_test, predicted)}')
print(f'Recall равен {recall_score(Y_test, predicted)}')
print(f'F1-мера {f1_score(Y_test, predicted)}')
print(f'Accuracy {accuracy_score(Y_test, predicted)}')

Precision: 0.7391304347826086
Recall равен 0.7327586206896551
F1-мера 0.735930735930736
Accuracy 0.7932203389830509
