# データのインプット

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training CSV into a DataFrame and preview its first rows.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train.head()

In [None]:
# Load the Titanic test CSV into a DataFrame and preview its first rows.
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head()

入力されたデータ
# データの入力

| 変数 | 定義 | キー |
|------|------|------|
| survival | 生存 | 0 = いいえ, 1 = はい |
| pclass | チケットのクラス | 1 = 1等, 2 = 2等, 3 = 3等 |
| sex | 性別 | |
| Age | 年齢（歳） | |
| sibsp | タイタニック号に乗船していた兄弟姉妹/配偶者の数 | |
| parch | タイタニック号に乗船していた親/子供の数 | |
| ticket | チケット番号 | |
| fare | 乗客運賃 | |
| cabin | 客室番号 | |
| embarked | 出港地 | C = シェルブール, Q = クイーンズタウン, S = サウサンプトン |


# k-最近傍法

In [None]:
# Work on copies so the preprocessing below does not modify the frames
# that were loaded from CSV.
train1 = train.copy()
test1 = test.copy()

## 前処理

In [None]:
# Count the missing values in each column of the training copy.
train1.isnull().sum()

In [None]:
# Replace the missing entries of each listed column with that column's most
# frequent value (the first entry returned by mode()).
for column in ("Age", "Cabin", "Embarked"):
    most_frequent = train1[column].mode()[0]
    train1[column] = train1[column].fillna(most_frequent)

In [None]:
# Show the per-column missing-value counts of train1 again.
train1.isnull().sum()

In [None]:
# Count the missing values in each column of the test copy.
test1.isnull().sum()

In [None]:
# Fill the gaps in the test copy with the most frequent value observed in the
# *training* data (`train`, the frame loaded from train.csv).  Using the
# training statistic means both frames are imputed with the same constant and
# nothing is estimated from the test set itself.
for column in ("Age", "Cabin", "Fare"):
    train_most_frequent = train[column].mode()[0]
    test1[column] = test1[column].fillna(train_most_frequent)

In [None]:
# Show the per-column missing-value counts of test1 again.
test1.isnull().sum()

In [None]:
# Turn the string-valued columns of both frames into dummy (indicator) columns.
dummy_columns = ['Sex', 'Embarked']
train1 = pd.get_dummies(train1, columns=dummy_columns)
test1 = pd.get_dummies(test1, columns=dummy_columns)

In [None]:
# Preview the first rows of the preprocessed training frame.
train1.head()

## 生存者予測

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier using a single neighbour.
clf = KNeighborsClassifier(n_neighbors=1)

# Numeric columns plus the Sex/Embarked dummy columns.  One shared list is
# used for both frames so train and test always select the same columns in
# the same order.
knn_features = [
    "Pclass", "Age", "SibSp", "Parch", "Fare",
    "Sex_female", "Sex_male",
    "Embarked_C", "Embarked_Q", "Embarked_S",
]

X_train1 = train1[knn_features]
y_train1 = train1["Survived"]
X_test1 = test1[knn_features]

clf.fit(X_train1, y_train1)

# Store the predictions for the test rows in y_test1.
y_test1 = clf.predict(X_test1)

In [None]:
# Build and export the answer file for the k-NN predictions.
# Passenger IDs of the test rows as an integer array.
PassengerId1 = test1["PassengerId"].to_numpy().astype(int)

# Combine the predictions with the passenger IDs: one "Survived" column,
# indexed by passenger ID.
answer1 = pd.DataFrame({"Survived": y_test1}, index=PassengerId1)

# Export as titanic_answer_01.csv; the index is written under the header
# "PassengerId".
answer1.to_csv("titanic_answer_01.csv", index_label="PassengerId")

In [None]:
# Preview the first rows of the k-NN answer frame.
answer1.head()

# GBDT

In [None]:
# Take fresh copies of the loaded frames for the GBDT section.
train2 = train.copy()
test2 = test.copy()

# Split the training copy into features and target.  Both are taken from
# `train2` so that features and labels always originate from the same frame.
train2_x = train2.drop(['Survived'], axis=1)
train2_y = train2['Survived']

# The test data consists of features only.
test2_x = test2.copy()

In [None]:
# Preview the first rows of the GBDT training copy.
train2.head()

## 特徴量の作成

In [None]:
from sklearn.preprocessing import LabelEncoder

# Remove the PassengerId column from both feature frames.
train2_x = train2_x.drop(columns=['PassengerId'])
test2_x = test2_x.drop(columns=['PassengerId'])

In [None]:
# Exclude the Name, Ticket and Cabin columns from both feature frames.
unused_columns = ['Name', 'Ticket', 'Cabin']
train2_x = train2_x.drop(columns=unused_columns)
test2_x = test2_x.drop(columns=unused_columns)

In [None]:
# Apply label encoding to each categorical column.
for c in ['Sex', 'Embarked']:
    # Missing entries are treated as their own category 'NA'.
    train_values = train2_x[c].fillna('NA')
    test_values = test2_x[c].fillna('NA')

    # Fit on the categories of both frames: transform() raises on a label the
    # encoder has not seen, which would happen if a category (including 'NA')
    # occurred only in the test data.  When every test category also occurs
    # in the training data the resulting codes are the same as fitting on the
    # training data alone.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]))

    # Convert the training and test columns to integer codes.
    train2_x[c] = le.transform(train_values)
    test2_x[c] = le.transform(test_values)

## モデルの作成
今回は、GBDTの中でもよく使用されるxgboostを使用する

In [None]:
from xgboost import XGBClassifier

# Create the classifier and train it on the training features and target.
xgb_params = {'n_estimators': 20, 'random_state': 71}
model = XGBClassifier(**xgb_params)
model.fit(train2_x, train2_y)

In [None]:
# Predict on the test features as probabilities and keep the column at
# index 1 of the result as `pred`.
test_proba = model.predict_proba(test2_x)
pred = test_proba[:, 1]

In [None]:
# Convert the predicted probabilities to binary labels: 1 when the
# probability is strictly above 0.5, otherwise 0.
pred_label = (pred > 0.5).astype(int)

In [None]:
# Create the submission file: passenger IDs from the loaded test frame
# followed by the predicted labels, written without the row index.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': pred_label,
})
submission.to_csv('titanic_answer_02.csv', index=False)

In [None]:
# Build and export the answer file for the GBDT predictions.
# Passenger IDs of the test rows as an integer array.
PassengerId2 = test2["PassengerId"].to_numpy().astype(int)

# Combine the predicted labels with the passenger IDs: one "Survived"
# column, indexed by passenger ID.
answer2 = pd.DataFrame({"Survived": pred_label}, index=PassengerId2)

# Export as titanic_answer_02.csv; the index is written under the header
# "PassengerId".
answer2.to_csv("titanic_answer_02.csv", index_label="PassengerId")

# 提出用ファイルの確認

In [None]:
# Load the gender_submission.csv file shipped with the competition data and
# preview its first rows.
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
gender_submission.head()

In [None]:
# Preview the first rows of answer1.
answer1.head()

In [None]:
# Preview the first rows of answer2.
answer2.head()