In [169]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.externals import joblib

In [4]:
data = pd.read_csv("data_titanic_proyecto.csv") 
data.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,Lower,M,N


In [13]:
data.dtypes

PassengerId             int64
Name                   object
Age                   float64
SibSp                   int64
Parch                   int64
Ticket                 object
Fare                  float64
Cabin                  object
Embarked               object
passenger_class        object
passenger_sex          object
passenger_survived     object
dtype: object

In [57]:
data['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [65]:
data.shape[0]

891

# Encode variables to numbers for statistical learning model implementations

First we will check if the dataset contains missing values. If some numeric column contains missing values, they will be replaced with mean imputation of the column.

In [66]:
data.isnull().sum()

PassengerId             0
Name                    0
Age                   177
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Cabin                 687
Embarked                2
passenger_class         0
passenger_sex           0
passenger_survived      0
dtype: int64

In [None]:
##Imputations for missing values variables 

In [85]:
df =data.drop(columns=['Cabin'])

In [86]:
df.shape

(891, 11)

In [87]:
df['Age'].mean()

29.69911764705882

In [88]:
df['Age'].fillna((df['Age'].mean()), inplace=True)

In [89]:
df.isnull().sum()

PassengerId           0
Name                  0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Embarked              2
passenger_class       0
passenger_sex         0
passenger_survived    0
dtype: int64

In [90]:
df = df.dropna()

In [91]:
df.shape

(889, 11)

In [92]:
from sklearn import preprocessing

#Creating the label encoder
encoder = preprocessing.LabelEncoder()

#Converting string labels into numbers
df['Embarked_encoded'] = encoder.fit_transform(np.array(df['Embarked']))
df['passenger_class_encoded'] = encoder.fit_transform(np.array(df['passenger_class']))
df['passenger_sex_encoded'] = encoder.fit_transform(np.array(df['passenger_sex']))
df['passenger_survived_encoded'] = encoder.fit_transform(np.array(df['passenger_survived']))




In [94]:
labels = df['passenger_survived_encoded']
labels.head()
df_features = df[['Age','SibSp','Parch','Fare','Embarked_encoded','passenger_class_encoded','passenger_sex_encoded']]
df_features.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked_encoded,passenger_class_encoded,passenger_sex_encoded
0,22.0,1,0,7.25,2,0,1
1,38.0,1,0,71.2833,0,2,0
2,26.0,0,0,7.925,2,0,0
3,35.0,1,0,53.1,2,2,0
4,35.0,0,0,8.05,2,0,1


In [95]:
df_features.shape[0]

889

In [100]:
from sklearn.model_selection import train_test_split
X_train, X_test_validation, y_train, y_test_validation = train_test_split(df_features, labels,
                                                    stratify=labels, 
                                                    test_size=0.4)

In [104]:
X_train.shape
#X_test.shape

(533, 7)

In [133]:
from sklearn.model_selection import train_test_split
X_validation, X_test, y_validation, y_test = train_test_split(X_test_validation, y_test_validation,
                                                    stratify=y_test_validation, 
                                                    test_size=0.5)

In [115]:
print("Training dataset:", X_train.shape," Validation dataset:",X_validation.shape, "Test dataset:", X_test.shape)

Training dataset: (533, 7)  Validation dataset: (178, 7) Test dataset: (178, 7)


In [120]:
pd.DataFrame(y_train).groupby('passenger_survived_encoded').agg({'passenger_survived_encoded':'count'})

Unnamed: 0_level_0,passenger_survived_encoded
passenger_survived_encoded,Unnamed: 1_level_1
0,329
1,204


In [121]:
pd.DataFrame(y_validation).groupby('passenger_survived_encoded').agg({'passenger_survived_encoded':'count'})

Unnamed: 0_level_0,passenger_survived_encoded
passenger_survived_encoded,Unnamed: 1_level_1
0,110
1,68


In [125]:
pd.DataFrame(y_test).groupby('passenger_survived_encoded').agg({'passenger_survived_encoded':'count'})

Unnamed: 0_level_0,passenger_survived_encoded
passenger_survived_encoded,Unnamed: 1_level_1
0,110
1,68


# Creating a Logbook for all experiments

In [165]:
chronicle = pd.DataFrame(columns=["Model","Accuracy","Recall","Precision","F1_Score"])

# Decision Tree Model

In [127]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [180]:
def decision_tree_model(x_train,y_train,x_test,y_test,run=0):
    model="decision_tree"
    decision_tree = DecisionTreeClassifier(criterion="entropy")
    decision_tree.fit(x_train,y_train)

    y_pred = decision_tree.predict(x_test)
    accuracy = accuracy_score(y_pred,y_test)
    error = 1 - accuracy
    recall = recall_score(y_pred,y_test)
    precision = precision_score(y_pred, y_test)
    f1_sc = f1_score(y_pred,y_test)

    dic = dict()
    dic['Model']=model
    dic['Accuracy']=accuracy
    dic['Recall']= recall
    dic['Precision']=precision
    dic['F1_Score']=f1_sc
    
    joblib.dump(decision_tree,'test.pkl')
    return(dic)

In [181]:
dtm = decision_tree_model(X_train,y_train,X_validation,y_validation)

In [182]:
chronicle = chronicle.append(dtm,ignore_index=True)
chronicle

Unnamed: 0,Model,Accuracy,Recall,Precision,F1_Score
0,decision_tree,0.825843,0.728395,0.867647,0.791946
1,decision_tree,0.825843,0.728395,0.867647,0.791946
2,decision_tree,0.825843,0.728395,0.867647,0.791946
3,decision_tree,0.792135,0.696203,0.808824,0.748299


In [183]:
tree_test = joblib.load("test.pkl")
tree_test.predict(X_validation)

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0])