<a href="https://colab.research.google.com/github/Re14m/training/blob/master/2022-0531_recipie347.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [脅威の無料AutoML「Pycaret」でKaggleに挑む](https://axross-recipe.com/recipes/347)

## データセットの準備

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install packages (pandas-profiling is pinned to 3.1.0; pycaret is installed unpinned)
!pip install pycaret
!pip install pandas-profiling==3.1.0

In [None]:
# Install the Kaggle API client
!pip install kaggle

In [None]:
# jsonで認証
from google.colab import files
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the Titanic competition dataset with the Kaggle CLI
!kaggle competitions download -c titanic

In [None]:
# dataset 解凍
!unzip /content/titanic.zip

## 課題抽出

In [None]:
# パッケージのインポート
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training and test data from the extracted CSV files
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
# List the feature (column) names of the training data
print(train_df.columns.values)

In [None]:
# Inspect the kinds of values: first and last rows, separated by a rule
display(train_df.head())
print('_'*40)
display(train_df.tail())

In [None]:
# Check missing values and dtypes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each report.
train_df.info()
print('_'*40)
test_df.info()

In [None]:
# Summary statistics of the numeric columns
train_df.describe()

In [None]:
# Summary statistics of the categorical (object dtype) columns
train_df.describe(include=['O'])

## データの基礎集計

In [None]:
# Mean survival rate per Pclass, highest first
(
    train_df
    .groupby(['Pclass'], as_index=False)[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Number of survivors / non-survivors per Pclass.
# x, hue and data are passed by keyword: newer seaborn releases no longer accept
# the plotted vector as the first positional argument.
sns.countplot(x='Pclass', hue='Survived', data=train_df)

In [None]:
# Relative frequency of each Pclass in the training data
train_Pclass = (
    train_df['Pclass']
    .value_counts(normalize=True, sort=False)
    .rename('Train')
)

# Relative frequency of each Pclass in the test data
test_Pclass = (
    test_df['Pclass']
    .value_counts(normalize=True, sort=False)
    .rename('Test')
)

# Show both side by side
pd.concat([train_Pclass, test_Pclass], axis=1)

In [None]:
# Basic look at Name: print the first rows
print(train_df[['Name']].head())

In [None]:
# Extract the honorific: the text between ', ' and the following '. ' in Name
train_df['Title'] = [
    full_name.split(', ')[1].split('. ')[0] for full_name in train_df['Name']
]
train_df['Title'].value_counts()  # frequency of each title

In [None]:
# Mean survival rate per Title, highest first
(
    train_df
    .groupby(['Title'], as_index=False)[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Count of each Title in the training data
train_Title = train_df['Title'].value_counts(sort=False).rename('Train')

# Extract the Title for the test data the same way as for the training data
test_df['Title'] = [
    full_name.split(', ')[1].split('. ')[0] for full_name in test_df['Name']
]
# Count of each Title in the test data
test_Title = test_df['Title'].value_counts(sort=False).rename('Test')

# Show both side by side
merge_Title = pd.concat([train_Title, test_Title], axis=1)
display(merge_Title)

In [None]:
# Titles that occur only in the training data (their 'Test' count is missing)
train_only_title = merge_Title.loc[merge_Title['Test'].isna()].index.tolist()
print(train_only_title)

# Drop the training rows that carry one of those titles
keep_rows = ~train_df['Title'].isin(train_only_title)
train_df2 = train_df.loc[keep_rows].reset_index(drop=True)
train_df2['Title'].value_counts()  # frequency of each remaining title

In [None]:
# Normalize the titles.
# The result of replace() is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: the in-place form acts on an
# intermediate object and is not guaranteed to update the DataFrame
# (pandas copy-on-write).

# Replace 'Ms' with 'Miss'
train_df2['Title'] = train_df2['Title'].replace('Ms', 'Miss')

# Merge every title other than Master / Miss / Mrs / Mr into 'Rare'
train_df2['Title'] = train_df2['Title'].replace(['Col', 'Dr', 'Rev'], 'Rare')
train_df2['Title'].value_counts()  # frequency of each title

In [None]:
# Mean survival rate per (merged) Title
train_df2.groupby('Title')['Survived'].mean()

In [None]:
# Basic aggregation for Sex
# Mean survival rate per Sex, highest first
(
    train_df
    .groupby(['Sex'], as_index=False)[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Number of survivors / non-survivors per Sex.
# x, hue and data are passed by keyword: newer seaborn releases no longer accept
# the plotted vector as the first positional argument.
sns.countplot(x='Sex', hue='Survived', data=train_df)

In [None]:
# Relative frequency of each Sex in the training data
# (named after the feature it holds rather than reusing the Pclass variable names)
train_sex_freq = train_df['Sex'].value_counts(normalize=True, sort=False).rename('Train')

# Relative frequency of each Sex in the test data
test_sex_freq = test_df['Sex'].value_counts(normalize=True, sort=False).rename('Test')

# Show both side by side
pd.concat([train_sex_freq, test_sex_freq], axis=1)

In [None]:
# Difference in survival rate between rows with and without a missing Age
age_is_missing = train_df['Age'].isna()
train_df['Survived'].groupby(age_is_missing).mean()

In [None]:
# Drop the rows whose Age is missing
train_df_age = train_df.dropna(subset=['Age'])

# Age distribution: survivors (1) are drawn first, then non-survivors (0)
for survived_flag in (1, 0):
    ages = train_df_age.loc[train_df_age['Survived'] == survived_flag, 'Age']
    sns.distplot(ages, kde=True, rug=False, bins=10, label=f'Survived={survived_flag}')
plt.legend()

In [None]:
# Drop rows with a missing Age from BOTH the training and the test data.
# The test frame was previously plotted with its missing values still in it,
# although the cell comment states they are excluded.
train_df_age = train_df.dropna(subset=['Age'])
test_df_age = test_df.dropna(subset=['Age'])

# Age distribution of train vs. test
sns.distplot(train_df_age['Age'], kde=True, rug=False, bins=10, label='Train')
sns.distplot(test_df_age['Age'], kde=True, rug=False, bins=10, label='Test')
plt.legend()

In [None]:
# Mean survival rate per SibSp
(
    train_df
    .groupby(['SibSp'], as_index=False)[['Survived']]
    .mean()
)

In [None]:
# Number of survivors / non-survivors per SibSp.
# x, hue and data are passed by keyword: newer seaborn releases no longer accept
# the plotted vector as the first positional argument.
sns.countplot(x='SibSp', hue='Survived', data=train_df)

In [None]:
# Relative frequency of each SibSp value in the training data
# (named after the feature it holds rather than reusing the Pclass variable names)
train_sibsp_freq = train_df['SibSp'].value_counts(normalize=True, sort=False).rename('Train')

# Relative frequency of each SibSp value in the test data
test_sibsp_freq = test_df['SibSp'].value_counts(normalize=True, sort=False).rename('Test')

# Show both side by side
pd.concat([train_sibsp_freq, test_sibsp_freq], axis=1)

In [None]:
# Mean survival rate per Parch, highest first
(
    train_df
    .groupby(['Parch'], as_index=False)[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Number of survivors / non-survivors per Parch.
# x, hue and data are passed by keyword: newer seaborn releases no longer accept
# the plotted vector as the first positional argument.
sns.countplot(x='Parch', hue='Survived', data=train_df)

In [None]:
# Relative frequency of each Parch value in the training data
# (named after the feature it holds rather than reusing the Pclass variable names)
train_parch_freq = train_df['Parch'].value_counts(normalize=True, sort=False).rename('Train')

# Relative frequency of each Parch value in the test data
test_parch_freq = test_df['Parch'].value_counts(normalize=True, sort=False).rename('Test')

# Show both side by side
pd.concat([train_parch_freq, test_parch_freq], axis=1)

In [None]:
# Show Ticket (not adopted as a feature this time)
print(train_df[['Ticket']].head())
print(train_df[['Ticket']].tail())

In [None]:
# Fare distribution by survival.
# Rows are selected from train_df itself. Indexing the Age-filtered frame with a
# mask built from train_df mixed two differently-indexed objects and silently
# left every passenger with a missing Age out of the Fare plot.
survived_fare = train_df.loc[train_df['Survived'] == 1, 'Fare']
died_fare = train_df.loc[train_df['Survived'] == 0, 'Fare']

sns.distplot(survived_fare, kde=True, rug=False, bins=10, label='Survived=1')  # survivors
sns.distplot(died_fare, kde=True, rug=False, bins=10, label='Survived=0')  # non-survivors
plt.legend()

In [None]:
# Fare distribution by survival after a log1p transform (survivors drawn first)
for survived_flag in (1, 0):
    log_fare = np.log1p(train_df.loc[train_df['Survived'] == survived_flag, 'Fare'])
    sns.distplot(log_fare, kde=True, rug=False, bins=10, label=f'Survived={survived_flag}')
plt.legend()

In [None]:
# Fare distribution of train vs. test after a log1p transform
for split_name, frame in (('Train', train_df), ('Test', test_df)):
    sns.distplot(np.log1p(frame['Fare']), kde=True, rug=False, bins=10, label=split_name)
plt.legend()

In [None]:
# Cabin: print the first and last rows
print(train_df[['Cabin']].head())
print(train_df[['Cabin']].tail())

In [None]:
# Mean survival rate per Embarked, highest first
(
    train_df
    .groupby(['Embarked'], as_index=False)[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Number of survivors / non-survivors per Embarked.
# x, hue and data are passed by keyword: newer seaborn releases no longer accept
# the plotted vector as the first positional argument.
sns.countplot(x='Embarked', hue='Survived', data=train_df)

In [None]:
# Relative frequency of each Embarked value in the training data
# (named after the feature it holds rather than reusing the Pclass variable names)
train_embarked_freq = train_df['Embarked'].value_counts(normalize=True, sort=False).rename('Train')

# Relative frequency of each Embarked value in the test data
test_embarked_freq = test_df['Embarked'].value_counts(normalize=True, sort=False).rename('Test')

# Show both side by side
pd.concat([train_embarked_freq, test_embarked_freq], axis=1)

## 基礎集計の整理
ここまでの各カラムの基礎集計を下記のように整理する。
*   PassengerId
不採用。
*   `Survived`
採用。
*   `Pclass`
採用。上級階級が生存しやすい。
*   `Name(敬称)`
採用。Age等ではわからない乗客の属性。
*   `Sex`
採用。女性が生存しやすい。
*   `Age`
採用。0～20歳が生存しやすい。
*   `SibSp`
採用。兄弟/配偶者の構成。
*   `Parch`
採用。親/子供の構成。
*   Ticket
不採用。
*   `Fare`
採用。低い運賃の乗客が死亡しやすい。
*   Cabin
不採用。
*   `Embarked`
採用。地域特性。

## 特徴量の作成と整理

In [None]:
# Reload the datasets from the raw CSV files
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
combine = [train_df, test_df]  # both frames in one list so later cells can loop over them

In [None]:
# Remove the features that are not used (PassengerId is kept in the test frame)
train_part, test_part = combine
print('Before', train_part.shape, test_part.shape)

train_part.drop(columns=['PassengerId', 'Ticket', 'Cabin'], inplace=True)
test_part.drop(columns=['Ticket', 'Cabin'], inplace=True)

print('After', train_part.shape, test_part.shape)

In [None]:
# Impute missing values.
# The training and the test frame are processed one after the other; each is
# filled from its own statistics.
# The filled column is assigned back instead of calling fillna(..., inplace=True)
# on the selected column: the in-place form acts on an intermediate object and is
# not guaranteed to update the DataFrame (pandas copy-on-write).
for dataset in combine:
    # Age: mean, truncated to an integer
    dataset['Age'] = dataset['Age'].fillna(int(dataset['Age'].mean()))
    # Fare: median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    # Embarked: most frequent value
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

In [None]:
# Check the datasets after imputation.
# info() prints its report itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" after each report).
for dataset in combine:
    dataset.info()

In [None]:
# Tidy up data types
# Convert Pclass to a string column
# NOTE(review): the frames are later written with to_csv and re-read with
# pd.read_csv without a dtype argument, so this str dtype presumably does not
# survive that round trip — confirm.
for dataset in combine:
    dataset['Pclass'] = dataset['Pclass'].astype('str')

In [None]:
# Create the new Title feature
# Extract the title for the training data and count each value
combine[0]['Title'] = combine[0]['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])
train_Title = combine[0]['Title'].value_counts(sort=False)
train_Title = train_Title.rename('Train')

# Extract the title for the test data and count each value
combine[1]['Title'] = combine[1]['Name'].map(lambda x: x.split(', ')[1].split('. ')[0])
test_Title = combine[1]['Title'].value_counts(sort=False)
test_Title = test_Title.rename('Test')

merge_Title = pd.concat([train_Title, test_Title], axis=1)

# Titles that occur only in the training data (their 'Test' count is missing)
train_only_title = merge_Title[merge_Title['Test'].isnull()].index
train_only_title = train_only_title.tolist()
# Drop the training rows carrying those titles; combine[0] now refers to the
# filtered frame, which is a new object distinct from train_df
combine[0] = combine[0][~combine[0]['Title'].isin(train_only_title)].reset_index(drop=True)

# Normalize the titles.
# The result of replace() is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column, which is not guaranteed to
# update the DataFrame (pandas copy-on-write).
for dataset in combine:
    # Replace 'Ms' with 'Miss'
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    # Merge every title other than Master / Miss / Mrs / Mr into 'Rare'
    dataset['Title'] = dataset['Title'].replace(['Col', 'Dr', 'Rev', 'Dona'], 'Rare')
    # Check the result
    print(dataset['Title'].value_counts(sort=False))

In [None]:
# Remove Name, which is not used for modelling
print('Before', combine[0].shape, combine[1].shape)

for frame in combine:
    frame.drop(columns='Name', inplace=True)
    print(frame.columns)

print('After', combine[0].shape, combine[1].shape)

In [None]:
for dataset in combine:
    # Family = Parch + SibSp + 1 (the passenger themself), for both frames
    dataset['Family'] = dataset['Parch'] + dataset['SibSp'] + 1

# Mean survival rate per Family value
display(combine[0][['Family', 'Survived']].groupby(['Family'], as_index=False).mean())

# Number of survivors / non-survivors per Family value.
# x, hue and data are passed by keyword: newer seaborn releases no longer accept
# the plotted vector as the first positional argument.
sns.countplot(x='Family', hue='Survived', data=combine[0])

In [None]:
for frame in combine:
    # Discretize Family into FamilySize:
    # 1 -> 'alone', 2-4 -> 'small', 5-7 -> 'mediam', anything else -> 'big'
    members = frame['Family']
    frame['FamilySize'] = np.select(
        [members == 1, members.between(2, 4), members.between(5, 7)],
        ['alone', 'small', 'mediam'],
        default='big',
    )

# Survival rate per discretized FamilySize
combine[0].groupby('FamilySize')['Survived'].mean()

In [None]:
# Remove SibSp, Parch and Family, which are not used for modelling
print('Before', combine[0].shape, combine[1].shape)

for frame in combine:
    frame.drop(columns=['SibSp', 'Parch', 'Family'], inplace=True)
    print(frame.columns)

print('After', combine[0].shape, combine[1].shape)

In [None]:
# Split Fare into 4 bands.
# The quartile edges are learned from the training data only and then applied to
# both frames with fixed labels. Running qcut on each frame separately gave the
# test data different interval categories from those the model is trained on.
_, fare_edges = pd.qcut(combine[0]['Fare'], 4, retbins=True)
# Open the outer edges so a fare outside the training range still lands in a band
fare_edges[0], fare_edges[-1] = float('-inf'), float('inf')
fare_labels = ['Q1', 'Q2', 'Q3', 'Q4']  # lowest to highest fare quartile

for dataset in combine:
    dataset['FareBand'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=fare_labels)

# Mean survival rate per fare band
combine[0][['FareBand', 'Survived']].groupby(['FareBand'], as_index=False, observed=False).mean()

In [None]:
# Remove Fare, which is not used for modelling
print('Before', combine[0].shape, combine[1].shape)

for frame in combine:
    frame.drop(columns=['Fare'], inplace=True)
    print(frame.columns)

print('After', combine[0].shape, combine[1].shape)

In [None]:
# Write the processed training and test data to CSV (without the index column)
combine[0].to_csv('train_processing.csv', index=False)
combine[1].to_csv('test_processing.csv', index=False)

## pycaretによるモデル作成

In [None]:
# Import only the PyCaret functions this notebook calls, instead of a star
# import that hides where the names come from and can shadow existing ones.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    predict_model,
    setup,
    tune_model,
)

# Load the processed training data
train_df = pd.read_csv("train_processing.csv")

# Initialize the PyCaret experiment; session_id fixes the random seed
exp_1 = setup(data = train_df, target = "Survived", session_id = 123)

In [None]:
# Compare the available models
compare_models()

In [None]:
# Create the model identified by "gbc"
model = create_model("gbc")

In [None]:
# Tune the model, optimizing for Accuracy
tuned_model = tune_model(model, optimize = 'Accuracy')

In [None]:
# Show the evaluation metrics of the tuned model
evaluate_model(tuned_model)

In [None]:
# Predict on the test data
# Finalize the tuned model (the original note reads "load the created model")
final_model = finalize_model(tuned_model)

# Load the processed test data and run the prediction on it
test_df = pd.read_csv("test_processing.csv")
result = predict_model(final_model, data = test_df)

In [None]:
# Format the predictions for submission.
# PyCaret 2.x names the prediction column "Label", PyCaret 3.x names it
# "prediction_label". pycaret is installed unpinned, so accept either instead of
# failing with a KeyError on the newer name.
label_col = "Label" if "Label" in result.columns else "prediction_label"
submission = result.rename(columns={label_col: "Survived"})

# Full prediction table, then the two-column file for the submission
submission.to_csv("pycaret_result.csv", index=False)
submission[["PassengerId", "Survived"]].to_csv("submission.csv", index=False)