# Titanic Tweak
Målet med denna notebook är att försöka skaffa bra noggrannhet för ett beslutsträd med data från Titanic.

#### Importerar bibliotek

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


#### Läser in data

In [15]:
train_titanic_data = pd.read_csv('train_titanic.csv')


#### Grundläggande överblick av data

In [16]:
train_titanic_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
train_titanic_data.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [18]:
train_titanic_data.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

#### Förbehandling av data
Vi börjar med att titta på vilka attribut som kan sållas bort då de antagligen har liten eller låg påverkan på en passagerares överlevnad i katastrofen. Vi väljer att behålla följande:
* Survived (vårat facit)
* Pclass (Speglar till viss del placeringen på skeppet)
* Sex och Age (Efter uttrycket "Kvinnor och barn först")
* Fare (Likt Pclass kan detta möjligtvis spegla placeringen på skeppet).


In [19]:
ttd_sliced = train_titanic_data[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']]

#### Sex == NaN
Vi gör om så att attributen Sex blir numerisk istället för en sträng, för att förenkla arbetet senare.

In [20]:
ttd_sliced['Sex'] = ttd_sliced['Sex'].replace({'male':0,'female':1})


  ttd_sliced['Sex'] = ttd_sliced['Sex'].replace({'male':0,'female':1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ttd_sliced['Sex'] = ttd_sliced['Sex'].replace({'male':0,'female':1})


#### Age NULL
När vi tittar på vår nya datadel kan vi se att vi har vissa passagerare för vilka det saknas ålder. Vi skapar tre versioner av datan: En utan dessa passagerare (ttd_sliced_short), en där de får genomsnittsåldern för alla andra passagerare (ttd_sliced_ship), och en för alla andra passagerare som delar Pclass och Sex med dem (ttd_sliced_demo).

In [21]:
ttd_sliced.count()

Survived    891
Pclass      891
Sex         891
Age         714
Fare        891
dtype: int64

In [22]:
ttd_sliced_short = ttd_sliced.loc[~ttd_sliced['Age'].isnull()]

ttd_sliced_ship = ttd_sliced[:]
ttd_sliced_ship['Age'] = ttd_sliced_ship['Age'].fillna(ttd_sliced['Age'].mean())

ttd_sliced_demo = ttd_sliced.copy()
for sex in ttd_sliced_demo['Sex'].unique():
    for p_class in ttd_sliced_demo['Pclass'].unique():
        mean_age = ttd_sliced.loc[(ttd_sliced['Sex'] == sex) & (ttd_sliced['Pclass'] == p_class)]['Age'].mean()
        ttd_sliced_demo.loc[(ttd_sliced_demo['Sex'] == sex) & (ttd_sliced_demo['Pclass'] == p_class) & (ttd_sliced_demo['Age'].isna()), ['Age']] = mean_age


#### Träning och test
Eftersom vi har tre versioner av vårat dataset bygger vi en klass som kan hålla reda på alla variabler för dem individuellt åt oss.

In [23]:
class PredictiveModel:
    def __init__(self, df: pd.DataFrame, test_size: bool = 0.3, random_state: int = 309, max_depth: int = None, max_leaf_nodes: int = None) -> None:
        self.df = df
        self.X = df.drop('Survived', axis = 1)
        self.y = df['Survived']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X,
                                                                                self.y,
                                                                                test_size = test_size,
                                                                                random_state = random_state
                                                                                )
        
        self.clf = DecisionTreeClassifier(random_state = random_state, max_depth = max_depth, max_leaf_nodes = max_leaf_nodes)
        self.clf.fit(self.X_train, self.y_train)

        self.y_pred = self.predict(self.X_test)
        self.depth = self.clf.get_depth()
        self.leaves_count = self.clf.get_n_leaves()
        self.feature_importance = pd.DataFrame(self.clf.feature_importances_.reshape(1,self.clf.n_features_in_), columns = self.clf.feature_names_in_)

    def accuracy(self, df: pd.DataFrame = None, builtin: bool = False):
        if builtin:
            df = self.y_pred
        return accuracy_score(self.y_test, df)

    def predict(self, df: pd.DataFrame):
        # col_verifications = df.columns == self.X.columns
        # for verification in col_verifications:
        #     if not verification:
        #         raise KeyError
        
        return self.clf.predict(df)

    def plot(self, figsize = None):
        plt.figure(figsize = figsize)
        plot_tree(self.clf, filled = True, feature_names=self.X.keys())
        plt.show()
    
pm_short = PredictiveModel(ttd_sliced_short)
pm_ship = PredictiveModel(ttd_sliced_ship)
pm_demo = PredictiveModel(ttd_sliced_demo)


#### Demonstrering av resultat

In [24]:
print('Short nogrannhet:', pm_short.accuracy(builtin=True))
print('Ship nogrannhet:', pm_ship.accuracy(builtin=True))
print('Demo nogrannhet:', pm_demo.accuracy(builtin=True))


Short nogrannhet: 0.7534883720930232
Ship nogrannhet: 0.7425373134328358
Demo nogrannhet: 0.7761194029850746
