In [13]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [23]:
train_df = pd.read_csv("../datasets/titanic/preprocessed_train.csv")
test_df = pd.read_csv("../datasets/titanic/preprocessed_test.csv")

In [24]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [25]:
# test train split on train_df
train, test = train_test_split(train_df)

In [62]:
# get the correlation with respect to the survived column
correlation = train_df.corr()['Survived'].abs()
print(correlation)

PassengerId    0.005007
Survived       1.000000
Pclass         0.338481
Sex            0.543351
Age            0.064910
SibSp          0.035322
Parch          0.081629
Fare           0.257307
Embarked       0.163517
Name: Survived, dtype: float64


In [44]:
# get the order of columns based on descending correlation value
sorted_cols = correlation.sort_values(ascending=False).index
print(sorted_cols)

Index(['Survived', 'Sex', 'Pclass', 'Fare', 'Embarked', 'Parch', 'Age',
       'SibSp', 'PassengerId'],
      dtype='object')


In [45]:
# rearrange the df column order
r_train_df = train_df[sorted_cols]
r_train_df.head(2)

Unnamed: 0,Survived,Sex,Pclass,Fare,Embarked,Parch,Age,SibSp,PassengerId
0,0,1,3,7.25,2,0,22.0,1,1
1,1,0,1,71.2833,0,0,38.0,1,2


In [60]:
added_cols = []
best_fit = []
# iterate through the columns, (excluding survived column), grabbing increasing number of columns
for col in r_train_df.columns[1:]:
    # append the col to the list 
    added_cols.append(col)

    # get the decision tree classifier for the dataframe using only the columns in added_cols
    clf = tree.DecisionTreeClassifier()
    clf.fit(train[added_cols], train['Survived'])

    # get the predicted values and the accuracy score
    y_pred = clf.predict(test[added_cols])
    best_fit
    best_fit.append({"Columns under consideration": added_cols.copy(), "Accuracy":accuracy_score(y_pred, test['Survived'])})
    #print(added_cols, ":", accuracy_score(y_pred, test['Survived']))

In [61]:
sorted_best_fit = sorted(best_fit, key=lambda obj: obj['Accuracy'], reverse=True)
for fit in sorted_best_fit:
    print(fit)

{'Columns under consideration': ['Sex', 'Pclass', 'Fare', 'Embarked'], 'Accuracy': 0.8116591928251121}
{'Columns under consideration': ['Sex', 'Pclass', 'Fare'], 'Accuracy': 0.7982062780269058}
{'Columns under consideration': ['Sex'], 'Accuracy': 0.7937219730941704}
{'Columns under consideration': ['Sex', 'Pclass'], 'Accuracy': 0.7937219730941704}
{'Columns under consideration': ['Sex', 'Pclass', 'Fare', 'Embarked', 'Parch'], 'Accuracy': 0.7937219730941704}
{'Columns under consideration': ['Sex', 'Pclass', 'Fare', 'Embarked', 'Parch', 'Age', 'SibSp'], 'Accuracy': 0.7802690582959642}
{'Columns under consideration': ['Sex', 'Pclass', 'Fare', 'Embarked', 'Parch', 'Age'], 'Accuracy': 0.7713004484304933}
{'Columns under consideration': ['Sex', 'Pclass', 'Fare', 'Embarked', 'Parch', 'Age', 'SibSp', 'PassengerId'], 'Accuracy': 0.7174887892376681}


### From the above accuracy list, we see that the best fit is obtained when four columns are considered: 
    - Sex
    - Pclass
    - Fare
    - Embarked

### Predict the answers for the test_df

In [66]:
accurate_clf = tree.DecisionTreeClassifier()
accurate_clf.fit(train_df[sorted_best_fit[0]['Columns under consideration']], train_df['Survived'])

In [70]:
# get the y predicted values for the testing df
y_pred = accurate_clf.predict(test_df[sorted_best_fit[0]['Columns under consideration']])

In [71]:
y_pred[:3]

array([0, 1, 0])

In [77]:
# write the dataframe to the csv file
answer_df = pd.DataFrame({"PassengerId": test_df['PassengerId'], 'Survived': y_pred})

In [78]:
answer_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [80]:
# write answer to csv file
answer_df.to_csv("../datasets/titanic/answers/answer_decision_tree.csv", index=False)