In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns

# 한글 폰트 설정
font_name = font_manager.FontProperties(fname='c:/Windows/Fonts/malgun.ttf').get_name()
rc('font', family=font_name)

params = {
    'figure.figsize': [12, 8],
    "font.family": "Malgun Gothic",
    "font.size": 18,
    "font.weight": "bold",
    "axes.unicode_minus": False,
    "axes.labelsize": 13,
    "axes.labelweight": "bold",
    "xtick.labelsize": 13,
    "ytick.labelsize": 13,
    "legend.fontsize": 13
}
plt.rcParams.update(params)

%config InlineBackend.figure_format = 'retina'

In [2]:
# 데이타 로딩 - 기존 데이타셋과 다르기에 확인 필요
raw_data = pd.read_csv('./data/titanic.csv')
raw_data.info()

raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.758889,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.00257,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,30.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
raw_data.sort_values('Age')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.7500,,S
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0000,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S
164,165,0,3,"Panula, Master. Eino Viljami",male,1.00,4,1,3101295,39.6875,,S
827,828,1,2,"Mallet, Master. Andre",male,1.00,0,2,S.C./PARIS 2079,37.0042,,C
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.00,1,1,347742,11.1333,,S


##  생존자와 사망자의 시각화

In [4]:
raw_data.columns
raw_data.head()
data = raw_data.groupby('Survived').count()

plt.bar(['사망', '생존'], data['생존여부'])

KeyError: '생존여부'

# 연령분포 시각화

In [None]:
raw_data['AgeCeil'] = raw_data['Age'].apply(np.ceil).astype(int)
data2 = raw_data.sort_values('AgeCeil')
data2 = data2.groupby('AgeCeil').count()
# plt.plot(data2['PassengerId'])
plt.bar(data2.index, data2['PassengerId'])

# 객실등급별로 각 평균 통계 - 표

In [None]:
data3 = raw_data.groupby('Pclass').count()
data3

# 성별, 연령대, 객실등급별 생존자 시각화

In [None]:
sex = raw_data.groupby('Sex').count()['']
sex

In [None]:
check_columns = ['Sex', 'Age', 'Pclass']
title_list = ['\n성별 생존자수 비교\n', '\n성별 생존자수 비율 비교\n',
              '\n연령대별 생존자수 비교\n', '\n연령대별 생존자수 비율 비교\n',
              '\n객실등급별 생존자수 비교\n', '\n객실등급별 생존자수 비율 비교\n']
gridsize=((len(check_columns), 2))
plt.figure(figsize=(16, 5*gridsize[0]))

for i in range(0, gridsize[0]*gridsize[1]):
    globals()['ax{}'.format(i+1)] = plt.subplot(gridsize[0], gridsize[1], i+1)

k = 0
for i, column in enumerate(check_columns):
    i *= 2
    tab = pd.crosstab(raw_data[column], raw_data['Survived'])
    tab.plot(kind='bar', stacked=True, rot=0, ax=globals()['ax{}'.format(i+1)])
    tab.div(tab.sum(1), axis=0).plot(kind='bar', stacked=True, rot=0, ax=globals()['ax{}'.format(i+2)])
    globals()['ax{}'.format(i+1)].set_title(title_list[k], fontsize=20, fontweight='bold')
    globals()['ax{}'.format(i+2)].set_title(title_list[k+1], fontsize=20, fontweight='bold')
    if column == 'Age':
        globals()['ax{}'.format(i+1)].set_xticks([10, 20, 30, 40, 50, 60, 70])
        globals()['ax{}'.format(i+1)].set_xticklabels([10, 20, 30, 40, 50, 60, 70])
        globals()['ax{}'.format(i+2)].set_xticks([10, 20, 30, 40, 50, 60, 70])
        globals()['ax{}'.format(i+2)].set_xticklabels([10, 20, 30, 40, 50, 60, 70])
    k += 2

plt.tight_layout()
plt.show()

# 상관관계 분석하여 해석하기

In [None]:
# 남자, 여자 0, 1로 형태로 변환
raw_data['adj_Sex'] = raw_data['Sex'].apply(lambda x: 0 if x == 'male' else 1)
corr_columns = ['adj_Sex', 'Age', 'Pclass', 'Survived']
corr_data = raw_data[corr_columns]
corr_data.tail()

In [None]:
# pearson corr
from scipy.stats import pearsonr

def corrcoef(dframe):

    fmatrix = dframe.values
    rows, cols = fmatrix.shape

    r = np.ones((cols, cols), dtype=float)
    p = np.ones((cols, cols), dtype=float)

    for i in range(cols):
        for j in range(cols):
            if i == j:
                r_, p_ = 1., 1.
            else:
                r_, p_ = pearsonr(fmatrix[:,i], fmatrix[:,j])

            r[j][i] = r_
            p[j][i] = p_

    return r, p

pearson_corr = pd.DataFrame(corrcoef(corr_data)[0], index=corr_columns, columns=corr_columns)
pearson_pval = pd.DataFrame(corrcoef(corr_data)[1], index=corr_columns, columns=corr_columns)
plt.figure(figsize=(16, 7))

ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)

mask = np.zeros_like(pearson_corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(220, 10, as_cmap=True)


sns.heatmap(pearson_corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, annot=True, fmt='.2g', linewidths=0, cbar_kws={"shrink": .8}, ax=ax1)
sns.heatmap(pearson_pval, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, annot=True, fmt='.2g', linewidths=0, cbar_kws={"shrink": .8}, ax=ax2)


ax1.set_title('\nPearson Correlation\n', fontsize=20, fontweight='bold', loc='left')
ax2.set_title('\np-value \n', fontsize=20, fontweight='bold', loc='left')

sns.despine()
plt.tight_layout()
plt.show()

- Sex와 Survived 사이 양의 (선형)상관관계가 존재
- Pclass와 Survived 사이 음의 (선형)상관관계가 존재
- Feature Engineering을 통해 Age와 Plcass 사이 Multicollinearity 문제를 해결해야 함