# `Titanic Survival - From Top 70% to Top 7% on LB` を参考
- [リンク](https://www.kaggle.com/darshanjain29/titanic-survival-from-top-70-to-top-7-on-lb)

## 前処理

#### 訓練データとテストデータへの前処理方法
- リスト化してfor文で同一の前処理を施す

```python
# Bundle both frames so that the same preprocessing runs on train and test.
data = [train_df, test_df] 
for dataset in data:
    # Shared preprocessing steps go here.
```

#### Cabin
- Deckに変換

```python
# Ordinal code for each deck letter; "U" stands for an unknown (missing) cabin.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]

for dataset in data:
    # Fill missing cabins with "U" *before* taking the first letter.
    # The first letter must be read from 'Deck' (the filled column), not from
    # 'Cabin' again: re-reading 'Cabin' discards the fill, turns NaN into the
    # string "nan" -> "N" -> unmapped -> 0, and leaves the "U": 8 entry unused.
    dataset['Deck'] = dataset['Cabin'].fillna("U")
    dataset['Deck'] = dataset['Deck'].astype(str).str[0]
    dataset['Deck'] = dataset['Deck'].str.capitalize()
    dataset['Deck'] = dataset['Deck'].map(deck)
    # Letters that are not keys of `deck` map to NaN; encode those as 0.
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)
```

#### Age
- 欠損値は平均値で埋める
- なぜかTestデータの値を壊す
- 層に分ける
    - -11, 11-18, 18-22, 22-27, 27-33, 33-40, 40-66, 66-

```python
data = [train_df, test_df]
for dataset in data:
    # Recode Age in place into ordinal bands (upper bound inclusive):
    #   <=11 -> 0, 11-18 -> 1, 18-22 -> 2, 22-27 -> 3,
    #   27-33 -> 4, 33-40 -> 5, 40-66 -> 6, >66 -> 7
    # The rules must run in ascending order: every band code is <= 11, so once
    # the first rule has run, later rules (which all require Age > 11) can
    # never re-match a value that has already been recoded.
    # Rows whose Age is still NaN match no rule and stay NaN.
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    # The oldest band gets its own code; assigning 6 here would silently merge
    # it with the 40-66 band.
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 7
```

#### 料金
- 欠損値は0埋め
- 層に分ける
    - -7.91, 7.91-14.454, 14.454-31, 31-99, 99-250, 250-

```python
data = [train_df, test_df]

# NOTE(review): the first cut points look like quartile edges, e.g. from
# pd.qcut(train_df['Fare'], 4) -- confirm against the training data.

# Interior fare bands as (exclusive lower bound, inclusive upper bound, code).
fare_bands = [
    (7.91, 14.454, 1),
    (14.454, 31, 2),
    (31, 99, 3),
    (99, 250, 4),
]

for dataset in data:
    # Lowest band first: all codes are <= 7.91, so the rules below (which all
    # require Fare > 7.91) never re-match an already recoded row.
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    for lower, upper, code in fare_bands:
        in_band = (dataset['Fare'] > lower) & (dataset['Fare'] <= upper)
        dataset.loc[in_band, 'Fare'] = code
    # Open-ended top band.
    dataset.loc[dataset['Fare'] > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(float)
```

#### 名前
- {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}で分類

```python
# Ordinal code for each title group; 0 is reserved for "no recognised title".
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
data = [train_df, test_df]

for dataset in data:
    # The title is the word directly followed by a period in the name.
    # Raw string: "\." must reach the regex engine as an escaped dot, and is
    # an invalid escape sequence in a normal Python string literal.
    # expand=False makes extract() return a Series instead of a DataFrame.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Fold the infrequent titles into a single "Rare" group.
    dataset['Title'] = dataset['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Ms', 'Mme'],
        'Rare')

    dataset['Title'] = dataset['Title'].map(titles)

    # Anything that is neither a key of `titles` nor folded into "Rare"
    # (including names with no match at all) is NaN here; encode it as 0.
    dataset['Title'] = dataset['Title'].fillna(0)
```

#### 標準化
- 適用せず

## モデル

#### 評価方法
- Cross Validation Scoreを使っている

```python
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of `clf` on the training data.
# `scores` holds one accuracy value per fold.
scores = cross_val_score(clf, X_train, Y_train, cv = 10, scoring = "accuracy")
print(scores)
```

#### LogisticRegression
```python
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters; fit() returns the fitted
# estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train, Y_train)

# Predicted labels for the test set.
Y_pred = clf.predict(X_test)
```

#### svm.SVC()
```python
from sklearn import svm
# Support vector classifier with default hyper-parameters; fit() returns the
# fitted estimator itself.
clf_svm = svm.SVC().fit(X_train, Y_train)

# Predicted labels for the test set.
Y_pred_svm = clf_svm.predict(X_test)
```

#### tree.DecisionTreeClassifier
```python
from sklearn import tree
# Decision tree; min_samples_leaf / min_samples_split are the values tuned
# away from the library defaults.
# `presort` and `min_impurity_split` are intentionally not passed: both were
# deprecated and then removed from scikit-learn, so passing them raises
# TypeError on current versions.
clf_dt = tree.DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=5, min_samples_split=25,
                       min_weight_fraction_leaf=0.0,
                       random_state=None, splitter='best')
clf_dt.fit(X_train, Y_train)

# Stored under its own name so the SVM predictions (Y_pred_svm) are not
# overwritten.
Y_pred_dt = clf_dt.predict(X_test)
```

#### RandomForestClassifier
```python
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (depth 2) with a fixed seed; fit() returns the fitted
# estimator itself, so construction and training can be chained.
clf_rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, Y_train)
```

#### GradientBoostingClassifier: 勾配ブースト
```python
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting over 100 depth-1 trees with a fixed seed.
clf_gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

clf_gbc.fit(X_train, Y_train)

# Named after this model; the previous name Y_pred_rf suggested these were
# random-forest predictions.
Y_pred_gbc = clf_gbc.predict(X_test)
```

#### BaggingClassifier: バギング
```python
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble with default hyper-parameters.
clf_bagging = BaggingClassifier()

clf_bagging.fit(X_train, Y_train)

# Named after this model; the previous name Y_pred_rf suggested these were
# random-forest predictions.
Y_pred_bagging = clf_bagging.predict(X_test)
```

#### GaussianNB: ナイーブベイズ分類器
```python
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default hyper-parameters.
clf_gnb = GaussianNB()

clf_gnb.fit(X_train, Y_train)

# Named after this model; the previous name Y_pred_rf suggested these were
# random-forest predictions.
Y_pred_gnb = clf_gnb.predict(X_test)
```

#### XGBClassifier
```python
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

# XGBoost classifier with default hyper-parameters, trained on the full
# training set.
clf_xgb = XGBClassifier().fit(X_train, Y_train)

# NOTE(review): Y_pred is also the name used for the LogisticRegression
# predictions; rename one of them if both results are needed together.
Y_pred  = clf_xgb.predict(X_test)

# Accuracy on the training data itself. Printed explicitly, because a bare
# expression in the middle of a cell is evaluated and then discarded.
print("Train accuracy: ", clf_xgb.score(X_train, Y_train))

# Mean 10-fold cross-validated accuracy of the XGBoost model.
scores_xgb = cross_val_score(clf_xgb, X_train, Y_train, cv = 10, scoring = "accuracy")
print("Scores: ", scores_xgb.mean())
```