In [None]:
"""
# Framework of Machine Learning

Purpose: choose (or construct) the model whose score is the best among the candidates.

0. Preprocessing (Most important and Most difficult)

Dealing with the missing data.
Feature extraction or feature selection.
For example, which variables are relevant to the prediction? Can you construct a new variable (feature) from the given variables?


1. separate the given data into train data and test data

 To compare one model with another we need a score (e.g. the accuracy) measured on data that was not used for training.
 The ratio len(train data):len(test data) is often set to 7:3 or 8:2. (however, we must set it flexibly in terms of data size)

2. candidates for model

 model_candidates={model_1,model_2,...,model_M}
 A model can be deep learning, SVM or XGBoost and so on.

 for model in {model_1,model_2,...,model_M}:
     Do 3 and 4 as stated below.

3. preparation for k-fold cross validation if you have some hyper parameters in your model

 k is often set to 10 (however, we must set it flexibly in terms of data size).
 We have to divide "train data" into k pieces roughly equally.
 (If you have 101 samples and k=10, then the length of one of ten pieces is 11.)
 Name them D_1, D_2,..., D_k.
 Candidates for hyper parameters {a_1,a_2,...,a_L}

 # PROCEDURE
 for alpha in {a_1,a_2,...,a_L}:
    
     cross_validation_score=0
    
     for i in {1,2,...,k}:
       
         train the model using alpha and {D_1,D_2,...,D_k}-{D_i}
         test the model using alpha and D_i
         cross_validation_score+=test score
        
     memorize cross_validation_score/k (when using hyper parameter alpha)
 # END

 The candidate with the highest mean cross_validation_score (cross_validation_score/k) is your hyper parameter.


4. training and test

 Train your model on the whole train data (using the selected hyper parameter, if any) and test it on the test data.


5. GOAL

 The model with the highest test score is your model.
 
"""

In [10]:
import pandas as pd
import numpy as np
import scipy 

# 0. preprocessing
#
# Load the labelled data (train.csv) and the data to predict on (test.csv),
# drop the text-like columns, encode Sex as 0/1, replace missing values with
# the column means of the labelled data and standardize both feature
# matrices with the statistics of the labelled data.
#
# Produces:
#   X            float feature matrix built from train.csv
#   t            'Survived' labels of train.csv
#   test         float feature matrix built from test.csv (same columns as X)
#   passenser_id 'PassengerId' column of test.csv

D=pd.read_csv('train.csv',index_col=0)
test=pd.read_csv('test.csv')

t=np.array(D['Survived'])

delete_list=['Name','Ticket','Cabin','Embarked']
X=D.drop('Survived',axis=1).drop(delete_list,axis=1)

# The spelling `passenser_id` is kept because a later cell reads this name.
passenser_id=test['PassengerId']
test=test.drop(delete_list,axis=1).drop('PassengerId',axis=1)
# Select the features by name so that both matrices share the same column
# order; imputation and scaling below are applied column by column.
test=test[list(X.columns)]

print(X.columns)

# Encode Sex by column name instead of by position:
# 'male' -> 1, anything else (including a missing value) -> 0.
X=X.assign(Sex=(X['Sex']=='male').astype(float)).to_numpy(dtype=float)
test=test.assign(Sex=(test['Sex']=='male').astype(float)).to_numpy(dtype=float)

# mean substitution
# Missing entries in BOTH matrices are filled with the column means of X.
X_mean=np.nanmean(X,axis=0)
X=np.where(np.isnan(X),X_mean,X)
test=np.where(np.isnan(test),X_mean,test)

# standardization
# `test` must be scaled with the SAME mean/std as X: the models are fitted on
# features expressed in X's scale, so z-scoring `test` with its own statistics
# would feed them inputs on a different scale.
# (population std, ddof=0, i.e. the same convention as scipy.stats.zscore)
# NOTE(review): the statistics are computed on all labelled rows before the
# hold-out split made in the next cell, so hold-out scores are slightly
# optimistic.
X_center=X.mean(axis=0)
X_scale=X.std(axis=0)
X=(X-X_center)/X_scale
test=(test-X_center)/X_scale

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [13]:
# 1. separate the given data into train data and test data
from sklearn.model_selection import train_test_split

# Hold out a fixed fraction of the labelled data for the final comparison of
# the models. The split is seeded so that it, and every score computed from
# it, is identical after Restart Kernel -> Run All.
test_ratio=0.2
split_seed=0
X_train, X_test, t_train, t_test =train_test_split(X, t, test_size=test_ratio,shuffle=True,random_state=split_seed)

In [29]:
# 2. candidates for models
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Baseline model: Gaussian naive Bayes, fitted with its default settings.
# GaussianNB is not among the imports visible in the notebook's import
# cells, so it is imported here to keep this cell runnable on a fresh kernel.
from sklearn.naive_bayes import GaussianNB

model=GaussianNB()
model.fit(X_train,t_train)
print(model.score(X_train,t_train))  # accuracy on the training split
print(model.score(X_test,t_test))    # accuracy on the held-out split

In [16]:
# 3. preparation for k-fold cross validation if you have some hyper parameters in your model
from sklearn.model_selection import KFold

# Select the C value of logistic regression by k-fold cross validation on the
# training split. Result: `hyperpara_opt` (best candidate) and
# `validation_score` (its mean validation accuracy).
split_size=10
cv_seed=0  # seeded so the folds, and therefore the selected value, are reproducible
k_fold=KFold(n_splits=split_size,shuffle=True,random_state=cv_seed)

# Materialize the folds once so that every candidate is scored on exactly the
# same train/validation partitions.
train_v=[]
test_v=[]

for train_indices, validation_indices in k_fold.split(X_train):
    train_v.append(train_indices)
    test_v.append(validation_indices)

# An ordered tuple (not a set): candidates are tried in ascending order, so
# with the strict '>' below a tie is always resolved in favour of the
# smallest C instead of depending on set iteration order.
hyperpara_candidates=(0.001,0.01,0.1,1,10,100,1000)

validation_score=float('-inf')
# Start from a valid candidate; 0 would not be an acceptable C value.
hyperpara_opt=hyperpara_candidates[0]
for alpha in hyperpara_candidates:

    validation_score_sum=0
    model=LogisticRegression(C=alpha,solver='lbfgs',max_iter=10000)

    for i in range(split_size):

        # fit on all folds but the i-th, score on the i-th
        model.fit(X_train[train_v[i]],t_train[train_v[i]])
        validation_score_sum+=model.score(X_train[test_v[i]],t_train[test_v[i]])

    mean_validation_score=validation_score_sum/split_size
    if mean_validation_score>validation_score:
        validation_score=mean_validation_score
        hyperpara_opt=alpha

In [37]:
# 4. training and test
# Refit logistic regression on the whole training split with the C value
# stored in hyperpara_opt, then print its accuracy on the held-out split.
model=LogisticRegression(max_iter=10000,solver='lbfgs',C=hyperpara_opt)
model.fit(X_train,t_train)
held_out_accuracy=model.score(X_test,t_test)
print('score:',held_out_accuracy)

In [7]:
# Predict a label for every row of `test` with the current `model` and store
# the predictions in a one-column frame ('Survived') indexed by passenser_id.
pred=model.predict(test)
output=pd.DataFrame({'Survived':pred},index=passenser_id)

In [38]:
# Multi-layer perceptron with two hidden layers of 100 ReLU units each.
# The weight initialization is seeded so the scores are reproducible.
mlp_seed=0
model=MLPClassifier(hidden_layer_sizes=(100,100),alpha=0.001,max_iter=1000000,activation='relu',random_state=mlp_seed)
model.fit(X_train,t_train)
print(model.score(X_train,t_train))  # accuracy on the training split
# The held-out accuracy is the number that is comparable with the other
# model cells; the training accuracy alone says nothing about generalization.
print(model.score(X_test,t_test))    # accuracy on the held-out split

In [39]:
# Support vector classifier with an RBF kernel; the value of the cell is its
# accuracy on the held-out split.
# NOTE(review): per the scikit-learn docs `degree` only affects the 'poly'
# kernel, so degree=2 looks inert here - confirm before removing it.
model=SVC(kernel='rbf',gamma='auto',C=10,degree=2).fit(X_train,t_train)
model.score(X_test,t_test)

In [40]:
# Random forest of 100 trees; the value of the cell is its accuracy on the
# held-out split. The forest is seeded so that the reported score does not
# change between runs of the notebook.
forest_seed=0
model=RandomForestClassifier(n_estimators=100,random_state=forest_seed)
model.fit(X_train,t_train)
model.score(X_test,t_test)