In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.externals import joblib

In [2]:
data = pd.read_csv("data_titanic_proyecto.csv") 
data.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,Lower,M,N


In [3]:
data.dtypes

PassengerId             int64
Name                   object
Age                   float64
SibSp                   int64
Parch                   int64
Ticket                 object
Fare                  float64
Cabin                  object
Embarked               object
passenger_class        object
passenger_sex          object
passenger_survived     object
dtype: object

In [4]:
data['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [5]:
data.shape[0]

891

# Encode variables to numbers for statistical learning model implementations

First we will check if the dataset contains missing values. If some numeric column contains missing values, they will be replaced with mean imputation of the column.

In [6]:
data.isnull().sum()

PassengerId             0
Name                    0
Age                   177
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Cabin                 687
Embarked                2
passenger_class         0
passenger_sex           0
passenger_survived      0
dtype: int64

In [7]:
##Imputations for missing values variables 

In [8]:
df =data.drop(columns=['Cabin'])

In [9]:
df.shape

(891, 11)

In [10]:
df['Age'].mean()

29.69911764705882

In [11]:
df['Age'].fillna((df['Age'].mean()), inplace=True)

In [12]:
df.isnull().sum()

PassengerId           0
Name                  0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Embarked              2
passenger_class       0
passenger_sex         0
passenger_survived    0
dtype: int64

In [13]:
df = df.dropna()

In [14]:
df.shape

(889, 11)

In [15]:
from sklearn import preprocessing

#Creating the label encoder
encoder = preprocessing.LabelEncoder()

#Converting string labels into numbers
df['Embarked_encoded'] = encoder.fit_transform(np.array(df['Embarked']))
df['passenger_class_encoded'] = encoder.fit_transform(np.array(df['passenger_class']))
df['passenger_sex_encoded'] = encoder.fit_transform(np.array(df['passenger_sex']))
df['passenger_survived_encoded'] = encoder.fit_transform(np.array(df['passenger_survived']))




In [16]:
labels = df['passenger_survived_encoded']
labels.head()
df_features = df[['Age','SibSp','Parch','Fare','Embarked_encoded','passenger_class_encoded','passenger_sex_encoded']]
df_features.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked_encoded,passenger_class_encoded,passenger_sex_encoded
0,22.0,1,0,7.25,2,0,1
1,38.0,1,0,71.2833,0,2,0
2,26.0,0,0,7.925,2,0,0
3,35.0,1,0,53.1,2,2,0
4,35.0,0,0,8.05,2,0,1


In [17]:
df_features.shape[0]

889

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test_validation, y_train, y_test_validation = train_test_split(df_features, labels,
                                                    stratify=labels, 
                                                    test_size=0.4)

In [19]:
X_train.shape
#X_test.shape

(533, 7)

In [20]:
from sklearn.model_selection import train_test_split
X_validation, X_test, y_validation, y_test = train_test_split(X_test_validation, y_test_validation,
                                                    stratify=y_test_validation, 
                                                    test_size=0.5)

In [21]:
print("Training dataset:", X_train.shape," Validation dataset:",X_validation.shape, "Test dataset:", X_test.shape)

Training dataset: (533, 7)  Validation dataset: (178, 7) Test dataset: (178, 7)


In [22]:
pd.DataFrame(y_train).groupby('passenger_survived_encoded').agg({'passenger_survived_encoded':'count'})

Unnamed: 0_level_0,passenger_survived_encoded
passenger_survived_encoded,Unnamed: 1_level_1
0,329
1,204


In [23]:
pd.DataFrame(y_validation).groupby('passenger_survived_encoded').agg({'passenger_survived_encoded':'count'})

Unnamed: 0_level_0,passenger_survived_encoded
passenger_survived_encoded,Unnamed: 1_level_1
0,110
1,68


In [24]:
pd.DataFrame(y_test).groupby('passenger_survived_encoded').agg({'passenger_survived_encoded':'count'})

Unnamed: 0_level_0,passenger_survived_encoded
passenger_survived_encoded,Unnamed: 1_level_1
0,110
1,68


# Creating a Logbook for all experiments

In [25]:
chronicle = pd.DataFrame(columns=["Model","Accuracy","Recall","Precision","F1_Score"])

# Decision Tree Model

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
def decision_tree_model(x_train,y_train,x_test,y_test,run=0):
    model="decision_tree"
    decision_tree = DecisionTreeClassifier(criterion="entropy")
    decision_tree.fit(x_train,y_train)

    y_pred = decision_tree.predict(x_test)
    accuracy = accuracy_score(y_pred,y_test)
    error = 1 - accuracy
    recall = recall_score(y_pred,y_test)
    precision = precision_score(y_pred, y_test)
    f1_sc = f1_score(y_pred,y_test)

    dic = dict()
    dic['Model']=model
    dic['Accuracy']=accuracy
    dic['Recall']= recall
    dic['Precision']=precision
    dic['F1_Score']=f1_sc
    
    joblib.dump(decision_tree,'test.pkl')
    return(dic)

In [28]:
dtm = decision_tree_model(X_train,y_train,X_validation,y_validation)

In [29]:
chronicle = chronicle.append(dtm,ignore_index=True)
chronicle

Unnamed: 0,Model,Accuracy,Recall,Precision,F1_Score
0,decision_tree,0.803371,0.746269,0.735294,0.740741


In [30]:
tree_test = joblib.load("test.pkl")
tree_test.predict(X_validation)

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1])

## Support Vector Machine Model 

In [56]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
def svm_model(x_train,y_train,x_test,y_test):
    model="svm"
    svm = SVC()
    
    
    #svm.fit(x_train, y_train)
    parameters = {'kernel':('linear', 'rbf'), 
                  'C':(1,0.25,0.5,0.75),
                  'gamma': (1,2,3,'auto'),
                  'decision_function_shape':('ovo','ovr'),
                  'shrinking':(True,False)}

    grid = GridSearchCV(svm, parameters)
    grid.fit(x_train,y_train)
    
    
    y_pred = grid.predict(x_test)
    accuracy = accuracy_score(y_pred,y_test)
    error = 1 - accuracy
    recall = recall_score(y_pred,y_test)
    precision = precision_score(y_pred, y_test)
    f1_sc = f1_score(y_pred,y_test)
    
    dic = dict()
    dic['Model']=model
    dic['Accuracy']=accuracy
    dic['Recall']= recall
    dic['Precision']=precision
    dic['F1_Score']=f1_sc
    
    #joblib.dump(svm,'test.pkl')
    print(grid.best_params_)
    return(accuracy)
    

    
    

In [67]:
svm = svm_model(X_train,y_train,X_validation,y_validation)



{'C': 1, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'linear', 'shrinking': True}


In [68]:
svm

0.8258426966292135

In [116]:
yt= y_train.values.reshape((y_train.size, 1))
yt.shape

(533, 1)

In [123]:
train_labels_model_encoded = pd.DataFrame(y_train)
train_labels_model_hot= pd.get_dummies(y_train)
train_labels_model = train_labels_model_hot
np.shape(train_labels_model)

(533, 2)

## Logistic Regression Model 

In [201]:
def multinomial_model(epoch_num,lr,batch_size,x_training,y_training, x_validation, y_validation,beta) :
    
    
    ###one hot encoding ###
    y_training_encoded = pd.DataFrame(y_training)
    y_training_hot= pd.get_dummies(y_training)
    y_training = y_training_hot
    
   # y_validation = y_validation.values.reshape(y_validation.size,1)
    y_validation_encoded = pd.DataFrame(y_validation)
    y_validation_hot = pd.get_dummies(y_validation)
    y_validation = y_validation_hot
    
    
    tf.reset_default_graph()
    ##Hyperparameters
    batch = batch_size
    #y_training = y_training.values.reshape((y_training.size,1))
    m = np.shape(x_training)[1]
    n = np.shape(y_training)[1]
    training_epochs = epoch_num
    learning_rate = lr

    x_train = tf.placeholder(tf.float64, shape =[None,m], name="x_train")
    y_train = tf.placeholder(tf.float64, shape=[None,n], name="y_train")

    #W = tf.Variable(np.random.randn(m,n), name = "W") 
    #b = tf.Variable(np.random.randn(n), name = "b")

    W = tf.Variable(np.zeros([m,n]), name = "W") 
    b = tf.Variable(np.zeros(n), name = "b")

    with tf.name_scope("Hypotesis"):
        logits = tf.matmul(x_train,W) + b
        y_pred = tf.nn.softmax(logits, name="Softmax")

    with tf.name_scope("Cross_Entropy"):
        cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_train * tf.log(y_pred), reduction_indices=[1]))
        #cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=x_training, labels=y_training))
    
    with tf.name_scope("Regularization"):
        regularizer = tf.nn.l2_loss(W)
        cross_entropy = tf.reduce_mean(cross_entropy + beta*regularizer)
        

        
        
        
    with tf.name_scope("Optimizer"):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

    with tf.name_scope("Accuracy"):
        correct_prediction = tf.equal(tf.argmax(y_pred,1), tf.argmax(y_train,1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    init = tf.global_variables_initializer()

    with tf.name_scope("Disturbance"):
        disturbance = tf.summary.scalar(name = "Costfunction", tensor = cross_entropy)
        
    with tf.name_scope("Accuracy"):
        ac = tf.summary.scalar(name = "Costfunction", tensor = accuracy)
        
    #summaries = tf.summary.merge_all()
    
    with tf.Session() as sess:
    
        sess.run(init)
        #writer = tf.summary.FileWriter( './error_graphs/epoch_'+str(epoch_num)+'batch_size'+str(batch_size)+'lr'+str(lr), sess.graph)
        #print('Graph saved at ./error_graphs/epoch_'+str(epoch_num)+'_batch_size_'+str(batch_size)+'_lr_'+str(lr))
        for epoch in range(training_epochs+1):

            batch_num = (epoch * batch ) % (len(x_training) -batch)


            sess.run(optimizer, feed_dict = {x_train: x_training[batch_num:(batch_num+batch)], 
                                             y_train : y_training[batch_num:(batch_num+batch)]})



            if (epoch + 1) % 100 == 0:
                c = sess.run(disturbance, feed_dict = {x_train: x_training[batch_num:(batch_num+batch)], 
                                             y_train : y_training[batch_num:(batch_num+batch)]})
                
                a = sess.run(ac, feed_dict = {x_train: x_training[batch_num:(batch_num+batch)], 
                                             y_train : y_training[batch_num:(batch_num+batch)]})
                print("Epoch: " +str(epoch))
                
                print("train accuracy: " + 
                  str(sess.run(accuracy, feed_dict = {x_train: x_training[batch_num:(batch_num+batch)],
                                                      y_train : y_training[batch_num:(batch_num+batch)]})))
                
                print("validation accuracy: " + 
                  str(sess.run(accuracy, feed_dict = {x_train: x_validation,
                                                      y_train : y_validation})))
                
                print("")





               # writer.add_summary(c,epoch)
                #writer.add_summary(a,epoch)
                
        #weights = W.eval()
        #bias = b.eval()
        return((W.eval(),b.eval()))
        sess.close()
        #writer.close()




In [208]:
multinomial_model(epoch_num=1500, lr=0.01, batch_size=32, x_training=X_train, y_training=y_train,x_validation=X_validation,
                  y_validation=y_validation,beta = 0.01)
#multinomial_model(epoch_num=1000,lr=0.001,batch_size=32,train_image=train_imagesv,train_label=train_labels_model)

Epoch: 99
train accuracy: 0.71875
validation accuracy: 0.6292135

Epoch: 199
train accuracy: 0.71875
validation accuracy: 0.66292137

Epoch: 299
train accuracy: 0.5625
validation accuracy: 0.6292135

Epoch: 399
train accuracy: 0.78125
validation accuracy: 0.6797753

Epoch: 499
train accuracy: 0.75
validation accuracy: 0.6853933

Epoch: 599
train accuracy: 0.71875
validation accuracy: 0.6741573

Epoch: 699
train accuracy: 0.46875
validation accuracy: 0.41011235

Epoch: 799
train accuracy: 0.8125
validation accuracy: 0.69101125

Epoch: 899
train accuracy: 0.8125
validation accuracy: 0.6966292

Epoch: 999
train accuracy: 0.71875
validation accuracy: 0.66853935

Epoch: 1099
train accuracy: 0.53125
validation accuracy: 0.46067417

Epoch: 1199
train accuracy: 0.6875
validation accuracy: 0.6292135

Epoch: 1299
train accuracy: 0.5625
validation accuracy: 0.71348315

Epoch: 1399
train accuracy: 0.5625
validation accuracy: 0.66853935

Epoch: 1499
train accuracy: 0.84375
validation accuracy: 0.75

(array([[ 0.0686543 , -0.0686543 ],
        [ 0.62907704, -0.62907704],
        [ 0.12674051, -0.12674051],
        [ 0.0146421 , -0.0146421 ],
        [ 0.28974744, -0.28974744],
        [-0.41798345,  0.41798345],
        [ 1.00442033, -1.00442033]]), array([-0.02849525,  0.02849525]))

# Comments and Conclusions 

#### Training the Logistic Regression Model
Training the logistic regression 