## 資料匯入與預處理

In [145]:
import numpy as np
import pandas as pd
# Load the train/test CSVs from the current working directory.
test = pd.read_csv('test.csv')
# Pristine copy of the raw test split: later cells drop PassengerId from
# `test`, and the submission cell reads the ids back from `test2`.
# Copying the parsed frame avoids reading and parsing the same CSV twice.
test2 = test.copy()
train = pd.read_csv('train.csv')

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import warnings
# Plain-text preview of the first five training rows.
print(train.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


### Fill missing data

In [146]:
# Stack both splits only to count missing values per column over all rows.
# The test split has no `Survived` column, so its rows count as missing there.
tt = pd.concat((train, test), sort=False)
tt.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [147]:
# Impute missing ages with the TRAINING-set mean for both splits.
# Using the test set's own mean would derive a preprocessing statistic from
# the evaluation data and would be inconsistent with the Fare and Embarked
# imputation cells, which both fill `test` from training statistics.
train_age_mean = train.Age.mean()
train.Age = train.Age.fillna(train_age_mean)
test.Age = test.Age.fillna(train_age_mean)

In [148]:
# Fill missing fares in both splits with the mean fare of the training split.
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())
# Evaluated after the training column has been filled, as in the original order.
test_fare_fill = train['Fare'].mean()
test['Fare'] = test['Fare'].fillna(test_fare_fill)

In [149]:
# Replace missing cabin values with a placeholder string in both splits.
# (The placeholder spelling "unknow" is kept exactly as in the original.)
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].fillna("unknow")

In [150]:
# Fill missing embarkation ports with the most frequent port of the training split.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode().iloc[0])
# Recomputed after filling `train`, matching the original evaluation order.
most_common_port = train['Embarked'].mode().iloc[0]
test['Embarked'] = test['Embarked'].fillna(most_common_port)

In [151]:
# Sanity check: per-column count of remaining missing values in the training split.
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [152]:
# Sanity check: per-column count of remaining missing values in the test split.
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [153]:
# Columns not used as model features.
DROPPED_COLUMNS = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Cabin']

# Rebind to new frames instead of dropping in place, and ignore columns that
# are already gone, so re-running this cell does not raise a KeyError.
train = train.drop(columns=DROPPED_COLUMNS, errors='ignore')
test = test.drop(columns=DROPPED_COLUMNS, errors='ignore')
print(train.head())
print(test.head())

   Survived  Pclass     Sex   Age     Fare Embarked
0         0       3    male  22.0   7.2500        S
1         1       1  female  38.0  71.2833        C
2         1       3  female  26.0   7.9250        S
3         1       1  female  35.0  53.1000        S
4         0       3    male  35.0   8.0500        S
   Pclass     Sex   Age     Fare Embarked
0       3    male  34.5   7.8292        Q
1       3  female  47.0   7.0000        S
2       2    male  62.0   9.6875        Q
3       3    male  27.0   8.6625        S
4       3  female  22.0  12.2875        S


### LabelEncoding

In [154]:
# Integer codes for the two categorical features; the same mapping is applied
# to both splits so the encoded values agree.
sex_codes = {'male': 0, 'female': 1}
port_codes = {'C': 0, 'Q': 1, 'S': 2}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].replace(sex_codes)
    frame['Embarked'] = frame['Embarked'].replace(port_codes)
print(train.head())
print(test.head())

   Survived  Pclass  Sex   Age     Fare  Embarked
0         0       3    0  22.0   7.2500         2
1         1       1    1  38.0  71.2833         0
2         1       3    1  26.0   7.9250         2
3         1       1    1  35.0  53.1000         2
4         0       3    0  35.0   8.0500         2
   Pclass  Sex   Age     Fare  Embarked
0       3    0  34.5   7.8292         1
1       3    1  47.0   7.0000         2
2       2    0  62.0   9.6875         1
3       3    0  27.0   8.6625         2
4       3    1  22.0  12.2875         2


## Random Forest and SVM

In [155]:
# Separate the target column from the feature matrix.
ytrain = train['Survived']
xtrain = train.drop(columns=["Survived"])
print(xtrain.head())
print(ytrain.head())

   Pclass  Sex   Age     Fare  Embarked
0       3    0  22.0   7.2500         2
1       1    1  38.0  71.2833         0
2       3    1  26.0   7.9250         2
3       1    1  35.0  53.1000         2
4       3    0  35.0   8.0500         2
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [156]:
# Baseline random forest, scored with the out-of-bag estimate.
# (Original note credits this setup to the course instructor.)
# RandomForestClassifier is already imported in the notebook's first cell.
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
    # classifiers it meant 'sqrt', so this keeps the same behavior.
    max_features='sqrt',
    oob_score=True,
    # Fixed seed so the OOB score is reproducible across runs, matching the
    # seed used by the other estimators in this notebook.
    random_state=1,
)
rfc.fit(xtrain, ytrain)
print("oob_score(accuary):", rfc.oob_score_)

oob_score(accuary): 0.8103254769921436


In [157]:
# Nested cross-validation for the random forest (adapted from a "top 3%" example):
# the inner 2-fold grid search picks hyperparameters, the outer 5-fold loop
# estimates the accuracy of the whole tuning procedure.
RF = RandomForestClassifier(random_state=1)
PRF = [
    {
        'n_estimators': [10, 100],
        'max_depth': [3, 6],
        'criterion': ['gini', 'entropy'],
    }
]
GSRF = GridSearchCV(RF, PRF, scoring='accuracy', cv=2)
scores_rf = cross_val_score(GSRF, xtrain, ytrain, scoring='accuracy', cv=5)
scores_rf.mean()

0.8271965577541851

In [158]:
# Nested cross-validation for a standardized SVM (adapted from a "top 3%" example).
# The `svc__` prefix in the grid keys addresses the SVC step of the pipeline.
svc = make_pipeline(StandardScaler(), SVC(random_state=1))
r = [0.0001, 0.001, 0.1, 1, 10, 50, 100]
PSVM = [
    # Linear kernel: tune C only.
    {'svc__kernel': ['linear'], 'svc__C': r},
    # RBF kernel: tune C and gamma over the same value list.
    {'svc__kernel': ['rbf'], 'svc__C': r, 'svc__gamma': r},
]
GSSVM = GridSearchCV(svc, PSVM, scoring='accuracy', cv=2)
# Features are cast to float before being passed to the scaler/SVM pipeline.
scores_svm = cross_val_score(GSSVM, xtrain.astype(float), ytrain, scoring='accuracy', cv=5)
scores_svm.mean()

0.8193439433545867

In [159]:
# Run the random-forest grid search on the full training set and keep the fitted search.
model = GSRF.fit(X=xtrain, y=ytrain)

In [160]:
# Predict survival for the preprocessed test split with the fitted grid search.
pred = model.predict(X=test)

In [161]:
# Preview the raw test split, which still carries PassengerId.
test2.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [162]:
# Submission table: passenger ids from the raw test split paired with predictions.
df = test2[['PassengerId']].assign(Survived=pred)
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [163]:
# Write the submission file without the DataFrame index column.
df.to_csv(path_or_buf="Whitney_submission.csv", index=False)