In [None]:
"""
# Framework of Machine Learning

Purpose:  choose (or construct) the model whose score is the best among others. 

0. Preprocessing (Most important and Most difficult)

Dealing with the missing data.
Feature extraction or feature selection.
For example, what variable is relevant to prediction ? Can you make a new variable (feature) from given variables ? 


1. separate the given data into train data and test data

 When choosing a model, the score or the accuracy is necessary to compare a model to another.
 The ratio len(train data):len(test data) is often set to 7:3 or 8:2. (however, we must set it flexibly in terms of data size)

2. candidates for model

 model_candidates={model_1,model_2,...,model_M}
 A model can be deep learning, SVM or XGBoost and so on.

 for model in {model_1,model_2,...,model_M}:
     Do 3 and 4 as stated below.

3. preparation for k-fold cross validation if you have some hyper parameters in your model

 k is often set to 10.(however, we must set it flexibly in terms of data size)
 We have to divide "train data" into k pieces roughly equally.
 (If you have 101 samples and k=10, then the length of one of ten pieces is 11.)
 Name them D_1, D_2,..., D_k.
 Candidates for hyper parameters {a_1,a_2,...,a_L}

 # PROCEDURE
 for alpha in {a_1,a_2,...,a_L}:
    
     cross_validation_score=0
    
     for i in {1,2,...,k}:
       
         train the model using alpha and {D_1,D_2,...,D_k}-{D_i}
         test the model using alpha and D_i
         cross_validation_score+=test score
        
     memorize cross_varidation_score/k (when using hyper parameter alpha)    
 # END

 A candidate with the highest cross_validation_score is your hyper parameter.


4. training and test

 train and test your model (if any, using your hyper parameter).


5. GOAL

 A model with the highest score is your model.
 
"""

In [117]:
import pandas as pd
import numpy as np

D=pd.read_csv('train.csv',index_col=0)
TEST=pd.read_csv('test.csv')

t=D['Survived']
X=D.drop('Survived',axis=1)
X[0:3]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [118]:
# 0. preprocessing

delete_list=['Name','Ticket','Cabin','Embarked']
X=X.drop(delete_list,axis=1)
print(X.columns)
X=np.array(X)
t=np.array(t)
for i in range(np.shape(X)[0]):
    
    if X[i,1]=='male':
        X[i,1]=1
        
    else:
        X[i,1]=0


# mean substitution

X_mean=np.nanmean(X,axis=0)
        
for i in range(np.shape(X)[0]):
    for j in range(np.shape(X)[1]):
        if np.isnan(X[i,j]):
            X[i,j]=X_mean[j]
            
X=np.asarray(X,dtype=float)            

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [119]:
# 1. separate the given data into train data and test data
from sklearn.model_selection import train_test_split

test_ratio=0.2
X_train, X_test, t_train, t_test =train_test_split(X, t, test_size=test_ratio,shuffle=True) 

In [120]:
# 2. candidates for models
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [121]:
# 3. preparation for k-fold cross validation if you have some hyper parameters in your model
from sklearn.model_selection import KFold

split_size=10
k_fold=KFold(n_splits=split_size,shuffle=True)

train_v=[]
test_v=[]

for train_indices, validation_indices in k_fold.split(X_train):
    train_v.append(train_indices)
    test_v.append(validation_indices)
        
validation_score=-100
alpha_opt_LR=0
for alpha in {0.001,0.01,0.1,1,10,100,1000}:
    
    validation_score_sum=0
    LR=LogisticRegression(C=alpha,solver='lbfgs',max_iter=10000) 
   
    for i in range(split_size):
    
        LR.fit(X_train[train_v[i]],t_train[train_v[i]])
        validation_score_sum+=LR.score(X_train[test_v[i]],t_train[test_v[i]])
        
    if validation_score_sum/split_size>validation_score:
        validation_score=validation_score_sum/split_size
        alpha_opt_LR=alpha        

In [122]:
validation_score=-100
alpha_opt_MLPC=0
for a in {0.001,0.01,0.1,1,10,100,1000}:
    
    validation_score_sum=0
    MLPC=MLPClassifier(hidden_layer_sizes=(5,),activation='relu',max_iter=10000,alpha=a)
   
    for i in range(split_size):
    
        MLPC.fit(X_train[train_v[i]],t_train[train_v[i]])
        validation_score_sum+=MLPC.score(X_train[test_v[i]],t_train[test_v[i]])
        
    if validation_score_sum/split_size>validation_score:
        validation_score=validation_score_sum/split_size
        alpha_opt_MLPC=alpha        

In [123]:
# 4. trainin and test
LR=LogisticRegression(C=alpha_opt_LR,solver='lbfgs',max_iter=10000)
LR.fit(X_train,t_train)
print('LR score:',LR.score(X_test,t_test))

MLPC=MLPClassifier(hidden_layer_sizes=(150,),activation='relu',max_iter=10000,alpha=alpha_opt_MLPC)
MLPC.fit(X_train,t_train)
print('MLPC score:',MLPC.score(X_test,t_test))

LR score: 0.8379888268156425
MLPC score: 0.8100558659217877


In [None]:
# 5. select Logistic Regression model

In [124]:
# submission
TEST=TEST.drop(delete_list,axis=1)
passenser_id=TEST['PassengerId']
TEST=TEST.drop('PassengerId',axis=1)
TEST=np.array(TEST)

for i in range(len(TEST)):
    if TEST[i,1]=='male':
        TEST[i,1]=1
    else:
        TEST[i,1]=0
        
for i in range(np.shape(TEST)[0]):
    for j in range(np.shape(TEST)[1]):
        if np.isnan(TEST[i,j]):
            TEST[i,j]=X_mean[j]
                       
TEST=np.asarray(TEST,dtype=float)
pred=LR.predict(TEST)        

[[3 1 34.5 0 0 7.8292]
 [3 0 47.0 1 0 7.0]
 [2 1 62.0 0 0 9.6875]
 [3 1 27.0 0 0 8.6625]
 [3 0 22.0 1 1 12.2875]]


In [129]:
output=pd.DataFrame(index=passenser_id)
output['Survived']=pred

In [130]:
output[0:10]

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1
901,0


In [131]:
output.to_csv('output.csv')