# データのインプット

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


入力されたデータ
# データの入力

| 変数 | 定義 | キー |
|------|------|------|
| survival | 生存 | 0 = いいえ, 1 = はい |
| pclass | チケットのクラス | 1 = 1等, 2 = 2等, 3 = 3等 |
| sex | 性別 | |
| Age | 年齢（歳） | |
| sibsp | タイタニック号に乗船していた兄弟姉妹/配偶者の数 | |
| parch | タイタニック号に乗船していた親/子供の数 | |
| ticket | チケット番号 | |
| fare | 乗客運賃 | |
| cabin | 客室番号 | |
| embarked | 出港地 | C = シェルブール, Q = クイーンズタウン, S = サウサンプトン |


# k-最近傍法

In [4]:
train1 = train.copy()
test1 = test.copy()

## 前処理

In [5]:
# trainデータの欠損値確認
train1.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
train1["Age"] = train1["Age"].fillna(train1["Age"].mode()[0])
train1["Cabin"] = train1["Cabin"].fillna(train1["Cabin"].mode()[0])
train1["Embarked"] = train1["Embarked"].fillna(train1["Embarked"].mode()[0])

In [7]:
train1.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [8]:
# testデータの欠損値
test1.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [9]:
test1["Age"] = test1["Age"].fillna(test1["Age"].mode()[0])
test1["Cabin"] = test1["Cabin"].fillna(test1["Cabin"].mode()[0])
test1["Fare"] = test1["Fare"].fillna(test1["Fare"].mode()[0])

In [10]:
test1.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [11]:
# 文字列をダミー変数に
train1 = pd.get_dummies(train1, columns=['Sex', 'Embarked'])
test1 = pd.get_dummies(test1, columns=['Sex', 'Embarked'])

In [12]:
train1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,B96 B98,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,True,False,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,B96 B98,True,False,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,True,False,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,B96 B98,False,True,False,False,True


## 生存書予測

In [13]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)

X_train1 = train1[["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex_female", "Sex_male", "Embarked_C", "Embarked_Q", "Embarked_S"]]
y_train1 = train1["Survived"]

X_test1 = test1[["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex_female", "Sex_male", "Embarked_C", "Embarked_Q", "Embarked_S"]]

clf.fit(X_train1, y_train1)

# t_test に予測結果を格納
y_test1 = clf.predict(X_test1)

In [14]:
# 回答ファイルの作成・エクスポート
# PassengerId を取得
PassengerId1 = np.array(test1["PassengerId"]).astype(int)

# y_test と PassengerId を結合
answer1 = pd.DataFrame(y_test1, PassengerId1, columns = ["Survived"])

# titanic_answer_01.csv としてエクスポート
answer1.to_csv("titanic_answer_01.csv", index_label = ["PassengerId"])

In [15]:
answer1.head()

Unnamed: 0,Survived
892,0
893,0
894,1
895,1
896,0


# GBDT

In [16]:
# データの取得
train2 = train.copy()
test2 = test.copy()

# 目的変数と特徴量に分ける
train2_x = train2.drop(['Survived'], axis=1)
train2_y = train['Survived']

# テストデータは特徴量のみ
test2_x = test2.copy()

In [17]:
train2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 特徴量の作成

In [18]:
from sklearn.preprocessing import LabelEncoder

# 変数PassegerIdを取得する
train2_x = train2_x.drop(['PassengerId'], axis=1)
test2_x = test2_x.drop(['PassengerId'], axis=1)

In [19]:
# 変数Name, Ticket, Cabinを除外する
train2_x = train2_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test2_x = test2_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [20]:
# それぞれのカテゴリ変数にlabel encordingを適用する
for c in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train2_x[c].fillna('NA'))
    
    # 学習データ、テストデータを変換する
    train2_x[c] = le.transform(train2_x[c].fillna('NA'))
    test2_x[c] = le.transform(test2_x[c].fillna('NA'))

## モデルの作成
今回は、GBDTの中でもよく使用されるxgboostを使用する

In [21]:
from xgboost import XGBClassifier

# モデルの作成および学習データを与えて学習
model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train2_x, train2_y)

In [22]:
# テストデータの予測値を確率で出力する
pred = model.predict_proba(test2_x)[:,1]

In [23]:
# テストデータの予測値を二値に変換する
pred_label = np.where(pred>0.5,1,0)

In [24]:
# 提出用のファイルの作成
submission = pd.DataFrame({'Survived':pred_label})
submission.insert(0, '', test['PassengerId'])
submission.to_csv('titanic_answer_02.csv', index=False)

# 提出用ファイルの確認

In [25]:
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [26]:
answer1.head()

Unnamed: 0,Survived
892,0
893,0
894,1
895,1
896,0


In [27]:
submission.head()

Unnamed: 0,Unnamed: 1,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
