In [1]:
# データインポートライブラリ
import requests
import zipfile
import io
import re as re

# データ加工・処理・分析ライブラリ
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd


# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import sweetviz as sv

%matplotlib inline

%precision 3

'%.3f'

In [2]:
# Load the train / test CSV files from their local paths.
df = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

# Report the (rows, columns) of each frame, separated by a rule line.
separator = "--------------------"
print("Shape of Train Data\n[row :{},column :{}]".format(*df.shape))
print(separator)
print("Shape of Test Data\n[row :{},column :{}]".format(*df_test.shape))
print(separator)
df.head()


Shape of Train Data
[row :891,column :12]
--------------------
Shape of Test Data
[row :418,column :11]
--------------------


Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,1,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Build a Sweetviz profiling report for the training frame, with "Perished"
# passed as the target feature.
profile_df = sv.analyze(df, target_feat="Perished")
# Write the report out as HTML (the recorded run produced SWEETVIZ_REPORT.html).
profile_df.show_html()

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [4]:
# Impute missing Age / Fare values with the mean computed over the train and
# test columns combined.
age = pd.concat([df['Age'], df_test['Age']])
fare = pd.concat([df['Fare'], df_test['Fare']])
age_mean = age.mean()
fare_mean = fare.mean()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form warns on pandas 2.x and leaves
# the parent frame unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(age_mean)
df_test['Age'] = df_test['Age'].fillna(age_mean)

df['Fare'] = df['Fare'].fillna(fare_mean)
df_test['Fare'] = df_test['Fare'].fillna(fare_mean)

# Remaining missing-value count per column of the training frame.
df.isnull().sum()

PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Remove the Cabin column from both frames.
df = df.drop(columns=['Cabin'])
df_test = df_test.drop(columns=['Cabin'])

In [6]:
# Fill missing Embarked values with 'S' in both frames.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which warns on pandas 2.x and has no effect on the parent
# frame under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

# Remaining missing-value count per column of the training frame.
df.isnull().sum()

PassengerId    0
Perished       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [7]:
# Remove the Ticket column from both frames.
df = df.drop(columns=['Ticket'])
df_test = df_test.drop(columns=['Ticket'])

In [8]:
def get_title(name):
    """Extract the title token from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Mr"`` in
    ``"Braund, Mr. Owen Harris"``.

    Args:
        name: Passenger name string. Non-string values (such as NaN for a
            missing name) are treated as having no title.

    Returns:
        The matched title without the trailing period, or ``""`` when no
        title is found.
    """
    # A missing name has no title; return the same sentinel as "no match"
    # instead of letting re.search raise TypeError.
    if not isinstance(name, str):
        return ""
    # Raw string so that `\.` is a regex escape, not an invalid Python string
    # escape (which triggers a SyntaxWarning on recent Python versions).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""


# Derive a Title column from each passenger's Name, then discard Name.
df = df.assign(Title=df['Name'].map(get_title)).drop(columns=['Name'])
df_test = df_test.assign(Title=df_test['Name'].map(get_title)).drop(columns=['Name'])
df.head()


Unnamed: 0,PassengerId,Perished,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,1,3,male,22.0,1,0,7.25,S,Mr
1,2,0,1,female,38.0,1,0,71.2833,C,Mrs
2,3,0,3,female,26.0,0,0,7.925,S,Miss
3,4,0,1,female,35.0,1,0,53.1,S,Mrs
4,5,1,3,male,35.0,0,0,8.05,S,Mr


In [9]:
# Encode Sex numerically in both frames: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
df = df.replace({'Sex': sex_codes})
df_test = df_test.replace({'Sex': sex_codes})
df.head()

Unnamed: 0,PassengerId,Perished,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,1,3,0,22.0,1,0,7.25,S,Mr
1,2,0,1,1,38.0,1,0,71.2833,C,Mrs
2,3,0,3,1,26.0,0,0,7.925,S,Miss
3,4,0,1,1,35.0,1,0,53.1,S,Mrs
4,5,1,3,0,35.0,0,0,8.05,S,Mr


In [10]:
# Normalise the training-set titles and encode them as integers:
# uncommon titles become "Rare", spelling variants fold into Miss / Mrs,
# and anything not in the mapping ends up as 0.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

df['Title'] = (
    df['Title']
    .replace(rare_titles, 'Rare')
    .replace(title_aliases)
    .map(title_mapping)
    .fillna(0)  # titles missing from title_mapping map to NaN, then to 0
)
df.head()

Unnamed: 0,PassengerId,Perished,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,1,3,0,22.0,1,0,7.25,S,1
1,2,0,1,1,38.0,1,0,71.2833,C,3
2,3,0,3,1,26.0,0,0,7.925,S,2
3,4,0,1,1,35.0,1,0,53.1,S,3
4,5,1,3,0,35.0,0,0,8.05,S,1


In [11]:
# Normalise the test-set titles and encode them as integers:
# uncommon titles become "Rare", spelling variants fold into Miss / Mrs,
# and anything not in the mapping ends up as 0.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

df_test['Title'] = (
    df_test['Title']
    .replace(rare_titles, 'Rare')
    .replace(title_aliases)
    .map(title_mapping)
    .fillna(0)  # titles missing from title_mapping map to NaN, then to 0
)
df_test.head()


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,892,3,0,34.5,0,0,7.8292,Q,1
1,893,3,1,47.0,1,0,7.0,S,3
2,894,2,0,62.0,0,0,9.6875,Q,1
3,895,3,0,27.0,0,0,8.6625,S,1
4,896,3,1,22.0,1,1,12.2875,S,3


In [12]:
# One-hot encode Embarked over train and test together so that both frames
# end up with the same set of indicator columns.
embarked = pd.concat([df['Embarked'], df_test['Embarked']])

# Pin the dtype so the indicators are 0/1 integers on every pandas version
# (newer pandas defaults get_dummies to bool).
embarked_ohe = pd.get_dummies(embarked, dtype='uint8')

# Split the stacked rows back positionally, using the actual training-set
# length instead of a hard-coded row count.
n_train = len(df)
embarked_ohe_train = embarked_ohe.iloc[:n_train]
embarked_ohe_test = embarked_ohe.iloc[n_train:]

# Each slice keeps its source frame's index labels, so the column-wise concat
# aligns row for row.
df = pd.concat([df, embarked_ohe_train], axis=1)
df_test = pd.concat([df_test, embarked_ohe_test], axis=1)

# The original categorical column is now redundant.
df.drop('Embarked', axis=1, inplace=True)
df_test.drop('Embarked', axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Perished,Pclass,Sex,Age,SibSp,Parch,Fare,Title,C,Q,S
0,1,1,3,0,22.0,1,0,7.25,1,0,0,1
1,2,0,1,1,38.0,1,0,71.2833,3,1,0,0
2,3,0,3,1,26.0,0,0,7.925,2,0,0,1
3,4,0,1,1,35.0,1,0,53.1,3,0,0,1
4,5,1,3,0,35.0,0,0,8.05,1,0,0,1


In [13]:
# Combine SibSp and Parch into a single Family count, then drop the two
# source columns from both frames.
family_sources = ['SibSp', 'Parch']
df = df.assign(Family=df['SibSp'].add(df['Parch'])).drop(columns=family_sources)
df_test = df_test.assign(Family=df_test['SibSp'].add(df_test['Parch'])).drop(columns=family_sources)
df.head()

Unnamed: 0,PassengerId,Perished,Pclass,Sex,Age,Fare,Title,C,Q,S,Family
0,1,1,3,0,22.0,7.25,1,0,0,1,1
1,2,0,1,1,38.0,71.2833,3,1,0,0,1
2,3,0,3,1,26.0,7.925,2,0,0,1,0
3,4,0,1,1,35.0,53.1,3,0,0,1,1
4,5,1,3,0,35.0,8.05,1,0,0,1,0


In [14]:
# Save the processed train / test frames as CSV files.
import sys
import os
PATH_dir = "./df"
# Create the output directory if needed; to_csv does not create missing
# parent directories and would otherwise fail.
os.makedirs(PATH_dir, exist_ok=True)
# index_label=False is kept as in the original: per the pandas docs the index
# values are still written, but no header field is emitted for them.
df.to_csv(os.path.join(PATH_dir, "EDA_03_train.csv"), index_label=False)
df_test.to_csv(os.path.join(PATH_dir, "EDA_03_test.csv"), index_label=False)