# 0. Overview
This is a case study for getting used to Python, pandas, Matplotlib, and seaborn.

Source Data from Kaggle **"Tabular Playground Series - Apr 2021"**

# 1. Set Up Environment

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling # display summury of data
import matplotlib.pyplot as plt # Plot library
import seaborn as sns # Plot library
from sklearn.model_selection import train_test_split # Split library to help dividing data to train and validation.
from sklearn.metrics import mean_absolute_error # Error metric to measure performance 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # Imputation Library
from sklearn.ensemble import ExtraTreesRegressor # IterativeImputer estimator parameter.
from xgboost import XGBRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, fname)
    for root, _, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_file_paths:
    print(path)

# 2. Import Data

In [None]:
# Load the competition files from the Kaggle input directory:
# the training set, the test set, and the sample submission file.
train_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_data = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
submission = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv')

# 3. Exploratory Data Analysis

Let's check data overview.

In [None]:
# Column dtypes and non-null counts of the training set.
train_data.info()

In [None]:
# Number of missing values in each column.
train_data.isnull().sum()

### **Column information**
1. PassengerID - Unique number for each passenger
2. Survived - Survival : 0 = No, 1 = Yes
3. Pclass - Ticket class : 1 = 1st, 2 = 2nd, 3 = 3rd
4. Name - Name
5. Sex - Sex
6. Age - Age in years
7. SibSp - Number of siblings / spouses aboard the Titanic
8. Parch - Number of parents / children aboard the Titanic
9. Ticket - Ticket number
10. Fare - Passenger fare
11. Cabin - Cabin number
12. Embarked - Port of Embarkation : C = Cherbourg, Q = Queenstown, S = Southampton

### **Check Points**
1. There are 3 data types in the file: float64, int64, and object.
2. The total number of entries is 100,000.
3. Column names are PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked.
4. There are missing values in these columns: Age: 3,292, Ticket: 4,623, Fare: 134, Cabin: 67,866, Embarked: 250.

## Overall Survival number and Rate

In [None]:
# Count passengers per Survived value. The result is indexed by the Survived
# label itself: 1 = survived, 0 = did not survive.
survive_overall = train_data['Survived'].value_counts()
print("Survival Number : ", survive_overall.loc[1])
print("Not Survival Number: ", survive_overall.loc[0])

In [None]:
# Overall survival rate in percent: survivors divided by all passengers.
total_passengers = survive_overall.loc[0] + survive_overall.loc[1]
survive_overall_rate = survive_overall.loc[1] / total_passengers * 100
print("Overall Survival Rate is : {}%".format(survive_overall_rate))

### Check Points
1. Only 42,774 people survived out of 100,000
2. The survival rate is 42.774%.

## Survival Number and Rate by Sex

In [None]:
# Survived / not-survived counts per sex, indexed by (Sex, Survived).
survive_by_sex = train_data.groupby(by='Sex').Survived.value_counts()
for sex_key, sex_label in (('female', 'Female'), ('male', 'Male')):
    print(sex_label + " Survival Number: ", survive_by_sex.loc[sex_key, 1])
    print(sex_label + " Not Survival Number: ", survive_by_sex.loc[sex_key, 0])

In [None]:
# Inspect the index of the per-sex counts to see how individual entries are addressed with .loc.
survive_by_sex.index

In [None]:
# Survival rate in percent for each sex: survivors / (survivors + non-survivors).
female_counts = survive_by_sex.loc['female']
male_counts = survive_by_sex.loc['male']
survive_female_rate = female_counts.loc[1] / (female_counts.loc[1] + female_counts.loc[0]) * 100
survive_male_rate = male_counts.loc[1] / (male_counts.loc[1] + male_counts.loc[0]) * 100
print("Female survival rate: ", "%.2f" % survive_female_rate, "%")
print("Male survival rate: ", "%.2f" % survive_male_rate, "%")

### Check Points
1. Female has 71.15% survival rate.
2. Male has 20.58% survival rate

## Plot: Survival Number and Rate by Sex

In [None]:
labels = ['Male', 'Female']
# Counts are looked up in the same Male, Female order as `labels`.
survived = [survive_by_sex.loc[sex, 1] for sex in ('male', 'female')]
not_survived = [survive_by_sex.loc[sex, 0] for sex in ('male', 'female')]

width = 0.1  # the width of the bars
plt.figure(figsize=(10, 5), dpi=100)

# Two adjacent bars per sex: survived (blue) on the left, not survived (red) on the right.
bar1 = plt.bar([0, 0.3], height=survived, width=width, color='blue', align='edge')
bar2 = plt.bar([0.1, 0.4], height=not_survived, width=width, color='red', align='edge')
for bars, counts in ((bar1, survived), (bar2, not_survived)):
    plt.bar_label(container=bars, padding=3, labels=counts)

# Ticks sit on the boundary between the two bars of each sex.
plt.xticks([0.1, 0.4], labels)
plt.xlabel('Sex')
plt.ylabel('Counts')
plt.title('Number of Survival by Sex')
plt.legend(['Survived', 'Not Survived'])
plt.show()

In [None]:
# [survived, not survived] counts for each sex, in the order expected by the pie slices.
male_x = [survive_by_sex.loc['male', 1], survive_by_sex.loc['male', 0]]
female_x = [survive_by_sex.loc['female', 1], survive_by_sex.loc['female', 0]]
fig, ax = plt.subplots(1, 2, figsize=(10,10), dpi=100)

# One pie per sex, left = male, right = female, with identical styling.
for axis, counts, title in zip(ax, (male_x, female_x), ('Male Survival Rate', 'Female Survival Rate')):
    axis.pie(counts, colors=['Blue', 'Red'], autopct='%1.1f%%', textprops=dict(size=15, color="white"))
    axis.legend(['Survived', 'Not Survived'])
    axis.set(title=title)
plt.show()

### Check Points
1. Male survival rate is significantly lower than female survival rate.

## Survival Number by Age Group

In [None]:
# Report the largest and smallest Age values in the training set.
age_column = train_data.Age
print('Oldest passanger age is: ', age_column.max())
print('Yougest passager age is: ', age_column.min())

### Check Points
1. Age range is 0.08 - 87.0
2. How can I divide age groups?

In [None]:
# One pair of bars per distinct Age value, hence the very wide figure.
# plt.subplots returns (figure, axes); unpack both and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(80, 8))
sns.countplot(x='Age', hue='Survived', data=train_data, ax=ax)
plt.show()

### Check Points
1. Survival rates are different by age groups.
2. Group together 0-11, 11-16, 16-40, 40-62, and 62-90 since survival rate trend is overturned on those points.
### ***Intentionally dividing the age group might not be a good idea. It might introduce the bias to analysis.***

## Plot: Survival Number by Age Group

In [None]:
# Bin Age into five labelled groups using the cut points chosen from the count plot above.
age_group = pd.cut(
    train_data['Age'],
    bins=[0, 11, 16, 40, 62, 90],
    labels=['Child', 'Young', 'Young Adult', 'Adult', 'Old'],
)
# Work on a copy that carries the extra AgeGroup column; train_data itself is left untouched.
train_data_with_age_group = train_data.assign(AgeGroup=age_group)

In [None]:
plt.figure(figsize = (10, 5), dpi = 100)
# The hue levels are drawn in the order 0 (not survived), 1 (survived), so both the
# palette and the legend labels must follow that order. Red = not survived and
# blue = survived, the same colours as the Pclass and Embarked count plots.
ax = sns.countplot(x='AgeGroup', hue='Survived', palette=["r", "b"], data=train_data_with_age_group)

# Display values on the plot
for p in ax.patches:
    ax.annotate(text='{}'.format(p.get_height()), xy=(p.get_x()+0.2, p.get_height()+1000), horizontalalignment='center', verticalalignment='top', color='black', size=10)

plt.title("Number of Survival by Age Group", fontsize=20)
plt.xlabel('Age Group', fontsize=15)
plt.ylabel('Counts', fontsize=15)
plt.legend(['Not Survived', 'Survived'], fontsize=15)
plt.show()

## Survival Rate by Age Group

In [None]:
# Survived / not-survived counts per age group, indexed by (AgeGroup, Survived).
survival_rate_age_group = train_data_with_age_group.groupby(by='AgeGroup').Survived.value_counts()


def _age_group_survival_rate(counts, group):
    """Return the survival percentage of one age group from (AgeGroup, Survived) counts."""
    survived = counts.loc[group, 1]
    not_survived = counts.loc[group, 0]
    return survived / (survived + not_survived) * 100


survival_child_rate = _age_group_survival_rate(survival_rate_age_group, 'Child')
survival_young_rate = _age_group_survival_rate(survival_rate_age_group, 'Young')
survival_young_adult_rate = _age_group_survival_rate(survival_rate_age_group, 'Young Adult')
survival_adult_rate = _age_group_survival_rate(survival_rate_age_group, 'Adult')
survival_old_rate = _age_group_survival_rate(survival_rate_age_group, 'Old')

for group_label, group_rate in (
    ('Child', survival_child_rate),
    ('Young', survival_young_rate),
    ('Young Adult', survival_young_adult_rate),
    ('Adult', survival_adult_rate),
    ('Old', survival_old_rate),
):
    print("{} Survival rate is: {:.2f}%".format(group_label, group_rate))

### Check Points
1. The Young Adult age group has the worst survival rate.
2. Compared to other groups, the Child and Young Adult groups have very low survival rates.

## Survival Number by Age Group & Sex

In [None]:
# Counts indexed by (Sex, AgeGroup, Survived), then split into one series per sex.
temp = train_data_with_age_group.groupby(by=['Sex', 'AgeGroup']).Survived.value_counts()
sex_level = temp.index.get_level_values('Sex')
male_survive_by_age_group = temp.loc[sex_level == 'male']
female_survive_by_age_group = temp.loc[sex_level == 'female']

# Show only the rows whose third index level (Survived) is 0, male first, then female.
male_not_survived_mask = male_survive_by_age_group.index.get_level_values(level=2) == 0
female_not_survived_mask = female_survive_by_age_group.index.get_level_values(level=2) == 0
print(male_survive_by_age_group.loc[male_not_survived_mask])
print("\n")
print(female_survive_by_age_group.loc[female_not_survived_mask])

## Plot: Survival Number by Age Group & Sex

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,5), dpi=100)
plt.suptitle("Survival Number by Age Group And Sex", fontsize=20) # Set Overall title for figure
width = 0.1  # the width of the bars

age_group_labels = ['Child', 'Young', 'Young Adult', 'Adult', 'Old']
survived_positions = [0, 0.3, 0.6, 0.9, 1.2]      # left bar of each pair (blue)
not_survived_positions = [0.1, 0.4, 0.7, 1.0, 1.3]  # right bar of each pair (red)

# Same layout for both panels: left = male, right = female.
for axis, counts, sex_label in zip(ax, (male_survive_by_age_group, female_survive_by_age_group), ('Male', 'Female')):
    # The third index level is the Survived flag; the blue "Survived" bars must
    # use the Survived == 1 counts and the red "Not Survived" bars the == 0 counts.
    survived_flag = counts.index.get_level_values(level=2)
    survived_counts = counts.loc[survived_flag == 1]
    not_survived_counts = counts.loc[survived_flag == 0]

    survived_bars = axis.bar(survived_positions, width=width, height=survived_counts, color='blue', align='edge')
    not_survived_bars = axis.bar(not_survived_positions, width=width, height=not_survived_counts, color='red', align='edge')
    axis.bar_label(survived_bars, padding=3, labels=survived_counts)
    axis.bar_label(not_survived_bars, padding=3, labels=not_survived_counts)

    # Ticks sit on the boundary between the two bars of each age group.
    axis.set_xticks(not_survived_positions)
    axis.set_xticklabels(age_group_labels)
    axis.set_xlabel(sex_label)
    axis.set_ylabel('Counts')
    axis.legend(['Survived', 'Not Survived'])

plt.show()

## Survival Rate by Age Group & Sex

In [None]:
# Counts indexed by (AgeGroup, Sex, Survived).
survival_rate_age_group_by_sex = train_data_with_age_group.groupby(by=['AgeGroup', 'Sex']).Survived.value_counts()


def _group_sex_survival_rate(counts, group, sex):
    """Return the survival percentage for one (AgeGroup, Sex) combination."""
    survived = counts.loc[group, sex, 1]
    not_survived = counts.loc[group, sex, 0]
    return survived / (survived + not_survived) * 100


child_female_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Child', 'female')
child_male_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Child', 'male')

young_female_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Young', 'female')
young_male_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Young', 'male')

young_adult_female_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Young Adult', 'female')
young_adult_male_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Young Adult', 'male')

adult_female_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Adult', 'female')
adult_male_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Adult', 'male')

old_female_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Old', 'female')
old_male_rate = _group_sex_survival_rate(survival_rate_age_group_by_sex, 'Old', 'male')

In [None]:
# Print the female and male survival rate of every age group. The group names
# match the AgeGroup labels used elsewhere ('Young Adult', not 'Adult Young').
rates_by_age_group = [
    ('Child', child_female_rate, child_male_rate),
    ('Young', young_female_rate, young_male_rate),
    ('Young Adult', young_adult_female_rate, young_adult_male_rate),
    ('Adult', adult_female_rate, adult_male_rate),
    ('Old', old_female_rate, old_male_rate),
]
for group_label, female_rate, male_rate in rates_by_age_group:
    print("Female {} Survival rate is: {:.2f}%".format(group_label, female_rate))
    print("Male {} Survival rate is: {:.2f}%".format(group_label, male_rate))

## Plot: Survival Rate by Age Group & Sex

In [None]:
# Rates are listed in the same order as the age-group labels on the x axis.
female_survival_rate = [child_female_rate, young_female_rate, young_adult_female_rate, adult_female_rate, old_female_rate]
male_survival_rate = [child_male_rate, young_male_rate, young_adult_male_rate, adult_male_rate, old_male_rate]
age_group = ['Child', 'Young', 'Young Adult', 'Adult', 'Old']

plt.figure(figsize = (10, 5), dpi = 100)
plt.plot(age_group, male_survival_rate, label="Male")
plt.plot(age_group, female_survival_rate, label="Female")
plt.title("Survival Rate by Age Group", fontsize=20)
plt.xlabel('Age Group', fontsize=20)
plt.ylabel('Survival Rate(%)', fontsize=20)
plt.yticks([10, 20, 30, 40, 50, 60, 70, 80, 90])

# Display the rate just above each point. One decimal place is shown so the
# label is not silently truncated towards zero, as "%d" would do.
for rates in (male_survival_rate, female_survival_rate):
    for i, v in enumerate(rates):
        plt.text(i, v+1, "{:.1f}".format(v), ha="center", fontsize=15)
plt.grid()
plt.legend(fontsize=15)
plt.show()

### Check Points
1. Survival rates across age groups vary by sex.
2. In every age group, the female survival rate is higher than the male survival rate.

### ***Stereotype thinking***
1. Males helped people escape from the sinking Titanic.
2. Male young adults were the most active group in helping people.

### ***Questions***
   1. *Why do the Child and Old age groups in the male group also have significantly lower survival rates than in the female group?*
   2. *Why does the Child age group have the lowest survival rate among the female age groups?*

## Plot: Passenger Ratio by Pclass

In [None]:
# Survived / not-survived counts indexed by (Pclass, Survived).
train_data_group_by_pclass = train_data.groupby(by='Pclass').Survived.value_counts()

# Number of passengers per class (non-null Survived entries), computed once.
passenger_count_by_class = train_data.groupby(by='Pclass').Survived.count()
pclass = ['First', 'Second', 'Third']
passengers = [passenger_count_by_class.loc[class_id] for class_id in (1, 2, 3)]

plt.figure(figsize = (8, 5), dpi = 100)
plt.axes([0,0,1,1])
plt.axis('equal')
plt.pie(passengers, labels = pclass, autopct='%.2f%%', textprops={'fontsize':15})
plt.title("Population Ratio by Class", fontsize=20)
plt.legend(fontsize=15)
plt.show()

## Plot: Survival Number by Pclass

In [None]:
plt.figure(figsize=(10, 8), dpi=100)
# Red = not survived (0), blue = survived (1); the legend labels follow the same order.
ax = sns.countplot(data=train_data, x='Pclass', hue='Survived', palette=["r", "b"])

# Write each bar's count just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate('{}'.format(bar_height), (bar.get_x()+0.2, bar_height+1000), ha='center', va='top', color='black', size=15)

plt.title("Number of Survival by Class", fontsize=20)
plt.xlabel('Class', fontsize=20)
plt.ylabel('Counts', fontsize=20)
plt.legend(('Not Survived', 'Survived'), fontsize=15)
plt.show()

## Survival Rate by Pclass

In [None]:
def _class_survival_rate(counts, class_id):
    """Return the survival percentage of one ticket class from (Pclass, Survived) counts."""
    survived = counts.loc[class_id, 1]
    not_survived = counts.loc[class_id, 0]
    return survived / (survived + not_survived) * 100


survival_first_rate = _class_survival_rate(train_data_group_by_pclass, 1)
survival_second_rate = _class_survival_rate(train_data_group_by_pclass, 2)
survival_third_rate = _class_survival_rate(train_data_group_by_pclass, 3)
survival_rate_by_class = [survival_first_rate, survival_second_rate, survival_third_rate]

for class_label, class_rate in zip(('First', 'Second', 'Third'), survival_rate_by_class):
    print('{} Class survival rate is : {:.2f}%'.format(class_label, class_rate))

## Plot: Survival Rate by Pclass

In [None]:
class_labels = ['First', 'Second', 'Third']
plt.figure(figsize=(10, 5), dpi=100)
plt.plot(class_labels, survival_rate_by_class, color='blue', marker='o', linestyle='dashed', linewidth=2, markersize=12)

# Write each rate slightly to the right of its marker.
for position, rate in enumerate(survival_rate_by_class):
    plt.text(position+0.12, rate, "{:.2f}%".format(rate), ha="center", fontsize=12)

plt.title("Survival Rate by Class", fontsize=20)
plt.ylabel('Survival Rate(%)', fontsize=20)
plt.grid()
plt.show()

### Check Points
1. Third class passengers are the largest population.
2. Survival rates rank in the order First, Second, Third.

## Number of Passenger by Pclass & Sex

In [None]:
# Survived / not-survived counts for every (Sex, Pclass) combination.
train_data.groupby(by=['Sex', 'Pclass']).Survived.value_counts()

## Plot: Number of Passenger by Pclass & Sex

In [None]:
# Passenger counts indexed by (Sex, Pclass).
temp = train_data.groupby(by=['Sex', 'Pclass']).Survived.count()
pclass = ['First', 'Second', 'Third']
male_passengers = [temp.loc['male', class_id] for class_id in (1, 2, 3)]
female_passengers = [temp.loc['female', class_id] for class_id in (1, 2, 3)]

plt.figure(figsize = (10, 5), dpi = 100)

# Male line first, then female, so the legend order below ('Male', 'Female') matches.
for passenger_counts, line_color, line_marker in (
    (male_passengers, 'green', '*'),
    (female_passengers, 'pink', 'o'),
):
    plt.plot(['First', 'Second', 'Third'], passenger_counts, color=line_color, marker=line_marker, linestyle='-', linewidth=2, markersize=12)
    for i, v in enumerate(passenger_counts):
        plt.text(i+0.12, v, "{:,d}".format(v), ha="center", fontsize=12)

plt.title("Class", fontsize=20)
plt.ylabel('Number of Passengers', fontsize=20)
plt.xticks(fontsize=15)


def format_change(x):
    """Format an integer tick value with thousands separators."""
    return "{:,d}".format(x)


yticks = [12000, 14000, 16000, 18000, 20000, 22000, 24000, 26000, 28000]
plt.yticks(yticks, [format_change(tick) for tick in yticks], fontsize=15)
plt.legend(['Male', 'Female'])
plt.grid()
plt.show()

## Plot: Number of Passenger by Age Group & Pclass

In [None]:
# Passenger counts indexed by (Sex, AgeGroup, Pclass), split into one series per sex.
temp = train_data_with_age_group.groupby(by=['Sex', 'AgeGroup', 'Pclass']).Survived.count()
male_temp = temp.loc[temp.index.get_level_values(level=0) == 'male']
female_temp = temp.loc[temp.index.get_level_values(level=0) == 'female']

fig, ax = plt.subplots(1, 2, figsize=(15,8), dpi=100)
plt.suptitle("Number of Passenger by Age Group & Class", fontsize=20) # Set Overall title for figure

class_labels = ['First', 'Second', 'Third']
# (age group, marker, linestyle, linewidth, markersize) for each plotted line;
# the 'Young Adult' line is deliberately drawn heavier than the others.
age_group_styles = [
    ('Child', '8', 'dotted', 2, 12),
    ('Young', 's', 'dotted', 2, 12),
    ('Young Adult', 'p', 'dashdot', 3, 15),
    ('Adult', 'P', 'dotted', 2, 12),
    ('Old', '*', 'dotted', 2, 12),
]

# Same drawing logic for both panels: left = male, right = female.
for axis, counts, sex_label in zip(ax, (male_temp, female_temp), ('Male', 'Female')):
    # Always build the mask from the series being plotted (the original masked
    # male_temp with the female_temp index for the male 'Child' labels).
    age_level = counts.index.get_level_values(level=1)
    for group, marker, linestyle, linewidth, markersize in age_group_styles:
        group_counts = counts.loc[age_level == group]
        axis.plot(class_labels, group_counts, marker=marker, linestyle=linestyle, linewidth=linewidth, markersize=markersize)
        # Write each count slightly to the left of its marker.
        for i, v in enumerate(group_counts):
            axis.text(i-0.11, v, "{:,d}".format(v), ha="center", fontsize=8)

    axis.set_title('Number of {} Passenger by Class'.format(sex_label))
    axis.set_ylabel('Counts')
    axis.set_xlabel(sex_label)
    axis.legend([style[0] for style in age_group_styles])

plt.show()

### Check Points
1. The Third class survival rate is the lowest because only Third class has more male passengers than female passengers, and the difference in numbers is huge.
2. The Third class survival rate is also the lowest because Third class has the most 'Young Adult' age group passengers.

## Plot: Survival Number by Embarked

In [None]:
plt.figure(figsize = (10, 8), dpi = 100)
# Red = not survived (0), blue = survived (1); the legend labels follow the same order.
ax = sns.countplot(x='Embarked', hue='Survived', palette= ["r", "b"], data=train_data)
# Display values on the plot
for p in ax.patches:
    ax.annotate('{}'.format(p.get_height()), (p.get_x()+0.2, p.get_height()+1500), ha='center', va='top', color='black', size=15)

plt.title("Number of Survival by Embarked", fontsize=20)
plt.xlabel('Embarked', fontsize=20)  # was misspelled 'Embarkred' on the rendered figure
plt.ylabel('Counts', fontsize=20)
plt.legend(('Not Survived', 'Survived'), fontsize=15)
plt.show()

## Survival Rate by Embarked

In [None]:
# Survived / not-survived counts indexed by (Embarked, Survived).
temp = train_data.groupby(by='Embarked').Survived.value_counts()

# Survival percentage per port, in the order C, S, Q.
S_rate_C, S_rate_S, S_rate_Q = [
    temp.loc[port, 1] / (temp.loc[port, 1] + temp.loc[port, 0]) * 100
    for port in ('C', 'S', 'Q')
]
S_rate_by_Embarked = [S_rate_C, S_rate_S, S_rate_Q]

for port_label, port_rate in zip(('C', 'S', 'Q'), S_rate_by_Embarked):
    print('{} survival rate is : {:.2f}%'.format(port_label, port_rate))

## Plot: Survival Rate by Embarked

In [None]:
embarked_ports = ['C', 'S', 'Q']
plt.figure(figsize=(10, 5), dpi=100)
plt.plot(embarked_ports, S_rate_by_Embarked, color='blue', marker='o', linestyle='dashed', linewidth=2, markersize=12)

# Write each rate slightly to the right of its marker.
for position, rate in enumerate(S_rate_by_Embarked):
    plt.text(position+0.12, rate, "{:.2f}%".format(rate), ha="center", fontsize=12)

plt.title("Survival Rate by Embarked", fontsize=20)
plt.ylabel('Survival Rate(%)', fontsize=20)
plt.grid()

## Number of Passenger by Sex & Embarked

In [None]:
# Number of female / male passengers per port of embarkation, indexed by (Embarked, Sex).
# `temp` is read again by the pie-chart cell that follows.
temp = train_data.groupby(by='Embarked').Sex.value_counts()
print(temp)

## Plot: Number of Passenger Ratio by Embarked

In [None]:
gender = ['Female', 'Male']
# [female, male] passenger counts per port, in the same order as `gender`.
C_passengers = [temp.loc['C', 'female'], temp.loc['C', 'male']]
S_passengers = [temp.loc['S', 'female'], temp.loc['S', 'male']]
Q_passengers = [temp.loc['Q', 'female'], temp.loc['Q', 'male']]

fig, ax = plt.subplots(1, 3, figsize=(20,5), dpi=100)
plt.suptitle("Sex Ratio by Embarked", fontsize=15) # Set Overall title for figure

# One pie per port, left to right: C, S, Q.
for axis, port, port_counts in zip(ax, ('C', 'S', 'Q'), (C_passengers, S_passengers, Q_passengers)):
    axis.pie(port_counts, labels = gender, autopct='%.2f%%', textprops={'fontsize':15})
    axis.set_title("Sex Ratio on Embarked at {}".format(port), fontsize=12)
    axis.legend(fontsize=15)

plt.show()

## Plot: Survival Rate by Embark & Sex

In [None]:
# Counts indexed by (Embarked, Sex, Survived).
temp = train_data.groupby(by=['Embarked', 'Sex']).Survived.value_counts()


def _embarked_sex_survival_rate(counts, port, sex):
    """Return the survival percentage for one (Embarked, Sex) combination."""
    survived = counts.loc[port, sex, 1]
    not_survived = counts.loc[port, sex, 0]
    return survived / (survived + not_survived) * 100


Male_S_rate_C = _embarked_sex_survival_rate(temp, 'C', 'male')
Male_S_rate_S = _embarked_sex_survival_rate(temp, 'S', 'male')
Male_S_rate_Q = _embarked_sex_survival_rate(temp, 'Q', 'male')
Male_S_rate_by_Embarked = [Male_S_rate_C, Male_S_rate_S, Male_S_rate_Q]

Female_S_rate_C = _embarked_sex_survival_rate(temp, 'C', 'female')
Female_S_rate_S = _embarked_sex_survival_rate(temp, 'S', 'female')
Female_S_rate_Q = _embarked_sex_survival_rate(temp, 'Q', 'female')
Female_S_rate_by_Embarked = [Female_S_rate_C, Female_S_rate_S, Female_S_rate_Q]

In [None]:
plt.figure(figsize=(10, 5), dpi=100)

embarked_ports = ['C', 'S', 'Q']
# Male line (blue circles) is drawn first, then the female line (pink stars).
for sex_rates, line_color, line_marker in (
    (Male_S_rate_by_Embarked, 'blue', 'o'),
    (Female_S_rate_by_Embarked, 'pink', '*'),
):
    plt.plot(embarked_ports, sex_rates, color=line_color, marker=line_marker, linestyle='dashed', linewidth=2, markersize=12)
    # Write each rate slightly to the right of its marker.
    for i, v in enumerate(sex_rates):
        plt.text(i+0.12, v, "{:.2f}%".format(v), ha="center", fontsize=12)

plt.title("Survival Rate by Embarked And Sex", fontsize=20)
plt.ylabel('Survival Rate(%)', fontsize=20)
plt.grid()

### Check Points
1. Somehow Embarked column has large variation for survival rate.
2. Logically, Survival rate can be affected by Class, Sex, and Age.
3. It is reasonable to check the Sex and Class and Age distribution by Embarked.
4. **The sex ratios for passengers who embarked at C and Q are almost the same, but the survival rates differ by more than 10%.**

## Correlation Analysis (experimental)

In [None]:
# Age histograms, one panel per (Embarked, Sex) combination, arranged in a 2 x 4 grid.
train_data.hist(column='Age', by=['Embarked', 'Sex'], layout=(2,4), figsize=(20,10))
plt.show()

In [None]:
# Keep only the columns used by the Age-distribution checks in the following cells.
temp = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

# Transform Sex categorical columns to numeric columns.
# I will use survival rate from by Sex and AgeGroup to transform data.
# temp.loc[(temp.Sex == 'male') & (temp.AgeGroup  == 'Child'), 'AgeGroupRate'] = 0.2103
# temp.loc[(temp.Sex == 'male') & (temp.AgeGroup  == 'Young'), 'AgeGroupRate'] = 0.2021
# temp.loc[(temp.Sex == 'male') & (temp.AgeGroup  == 'Young Adult'), 'AgeGroupRate'] = 0.1766
# temp.loc[(temp.Sex == 'male') & (temp.AgeGroup  == 'Adult'), 'AgeGroupRate'] = 0.2416
# temp.loc[(temp.Sex == 'male') & (temp.AgeGroup  == 'Old'), 'AgeGroupRate'] = 0.2179
# temp.loc[(temp.Sex == 'female') & (temp.AgeGroup  == 'Child'), 'AgeGroupRate'] = 0.5910 
# temp.loc[(temp.Sex == 'female') & (temp.AgeGroup  == 'Young'), 'AgeGroupRate'] = 0.7680
# temp.loc[(temp.Sex == 'female') & (temp.AgeGroup  == 'Young Adult'), 'AgeGroupRate'] = 0.6980
# temp.loc[(temp.Sex == 'female') & (temp.AgeGroup  == 'Adult'), 'AgeGroupRate'] =  0.7475
# temp.loc[(temp.Sex == 'female') & (temp.AgeGroup  == 'Old'), 'AgeGroupRate'] = 0.7415
# transform_numerics = {'Sex' : {'male': 0.2058, 'female': 0.7115}}
# temp_numerics = temp.replace(transform_numerics)
# temp_numerics.drop(['AgeGroup'], axis=1, inplace=True)
# temp_numerics.rename(columns={'AgeGroupRate': 'AgeGroup'}, inplace=True)

In [None]:
# Age summary statistics (min, max, mean, median) for every (SibSp, Sex) group.
temp.groupby(by=['SibSp', 'Sex']).Age.agg(['min', 'max', 'mean', 'median'])

In [None]:
# Age histograms, one panel per (SibSp, Sex) combination, arranged in a 7 x 2 grid.
temp.hist(column='Age', by=['SibSp', 'Sex'], layout=(7,2), figsize=(20,15))
plt.show()

In [None]:
# Box plot of Age for every SibSp value, split by Sex, with the mean marked on each box.
plt.figure(figsize=(15,8))
box_plot = sns.boxplot(x='SibSp', y='Age', hue='Sex', showmeans=True, data=temp)
# The axes' Line2D objects are fetched below so individual box-plot elements can be restyled.
# Which list entry corresponds to which element was found by trial and error:
# the linestyle of a hand-picked index is changed to see which segment it is.
# NOTE(review): the hard-coded indices 4 and 11 depend on how seaborn/matplotlib
# order the lines and may differ between library versions — verify before relying on them.
# The median itself can be calculated more reliably with pandas (see the groupby/agg cell above).
lines = box_plot.get_lines()
lines[4].set_linestyle('-.')
lines[11].set_linestyle('--')
# lines[18].set_linestyle('==')
# lines[5].set_marker('*')
# lines[5].set_markersize(15)
# Enlarge tick labels and re-apply the existing axis labels with a larger font.
box_plot.tick_params(axis='both', labelsize=15)
box_plot.set_xlabel(box_plot.get_xlabel(), fontsize=15)
box_plot.set_ylabel(box_plot.get_ylabel(), fontsize=15)
plt.title('Boxplot by SibSp and Sex', fontsize=15)

plt.show()

In [None]:
lines[4].get_ydata() # NOTE(review): believed to be the male median line for SibSp == 0; the index was picked by hand — verify.

In [None]:
# Age summary statistics (min, max, mean, median) for every (Parch, Sex) group.
temp.groupby(by=['Parch', 'Sex']).Age.agg(['min', 'max', 'mean', 'median'])

In [None]:
# Age histograms, one panel per (Parch, Sex) combination, arranged in a 4 x 4 grid.
temp.hist(column='Age', by=['Parch', 'Sex'], layout=(4,4), figsize=(15,10))
plt.show()

In [None]:
# Box plot of Age for every Parch value, split by Sex.
plt.figure(figsize=(15, 8))
box_plot = sns.boxplot(data=temp, x='Parch', y='Age', hue='Sex')

# Re-apply the automatically chosen axis labels with a larger font.
box_plot.set_xlabel(box_plot.get_xlabel(), fontsize=15)
box_plot.set_ylabel(box_plot.get_ylabel(), fontsize=15)
box_plot.tick_params(axis='both', labelsize=15)
box_plot.set_title('Boxplot by Parch and Sex', fontsize=15)

plt.show()

In [None]:
# Age summary statistics (min, max, mean, median) for every (Pclass, Sex) group.
temp.groupby(by=['Pclass', 'Sex']).Age.agg(['min', 'max', 'mean', 'median'])

In [None]:
# Age histograms, one panel per (Pclass, Sex) combination, arranged in a 3 x 2 grid.
temp.hist(column='Age', by=['Pclass', 'Sex'], layout=(3,2), figsize=(15,8))
plt.show()

In [None]:
# Box plot of Age for every ticket class, split by Sex.
plt.figure(figsize=(15, 8))
box_plot = sns.boxplot(data=temp, x='Pclass', y='Age', hue='Sex')

# Re-apply the automatically chosen axis labels with a larger font.
box_plot.set_xlabel(box_plot.get_xlabel(), fontsize=15)
box_plot.set_ylabel(box_plot.get_ylabel(), fontsize=15)
box_plot.tick_params(axis='both', labelsize=15)
box_plot.set_title('Boxplot by Class and Sex', fontsize=15)
plt.legend(fontsize=13)

plt.show()

### Check Points
   1. Age distributions differ by Pclass, Parch, SibSp, Embarked, and Sex.
   2. Mean imputation for Age is not a good choice in this case, since each plot shows skewness and a different distribution.
   3. Median imputation might be better choice for skewed data.
   4. I think the best imputation method would be **IterativeImputer with ExtraTreeRegressor estimator**.

# 4. Imputation
1. IterativeImputer ran out of memory.
2. I tried to reduce the memory usage by changing the dtype of the columns.
3. There was still a memory problem, so I broke the imputation into separate steps.
4. This is not a practical or correct example.
5. I wanted to wrap up the processing.


In [None]:
# Re-check dtypes and non-null counts of the raw training data before building the imputation input.
train_data.info()

### Remove PassengerId, Survived, Name, Ticket, Cabin columns

In [None]:
# Columns kept as imputation / model features; every other column is left out.
imputation_feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
imp_train_input = train_data[imputation_feature_columns]

In [None]:
# Integer codes for the two text columns so every feature becomes numeric.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'Q': 1, 'C': 2}
transform_numerics = {'Sex' : sex_codes, 'Embarked': embarked_codes}
imp_train_input = imp_train_input.replace(to_replace=transform_numerics)

In [None]:
# Preview the first rows after encoding Sex and Embarked as numbers.
imp_train_input.head()

In [None]:
# Dtypes and non-null counts after the categorical encoding.
imp_train_input.info()

In [None]:
# Let pandas infer the best available dtype for each column.
# NOTE(review): convert_dtypes presumably yields pandas nullable extension dtypes here — confirm with the .info() output below.
imp_train_input = imp_train_input.convert_dtypes()

In [None]:
# Dtypes and non-null counts after convert_dtypes().
imp_train_input.info()

In [None]:
# Request smaller integer / float dtypes for every column to reduce memory usage.
# NOTE(review): errors='ignore' suppresses conversion failures, so a column that cannot be cast
# (presumably one that still holds missing values) may silently keep its previous dtype —
# check the .info() output below.
imp_train_input = imp_train_input.astype({'Pclass': np.int8, 'Sex': np.int8, 'Age': np.float32, 'SibSp': np.int8, 'Parch': np.int8, 'Fare': np.float32, 'Embarked': np.int8}, errors='ignore')

In [None]:
# imp_train_input.Embarked = pd.to_numeric(imp_train_input.Embarked, downcast='integer')
# imp_train_input.Fare = pd.to_numeric(imp_train_input.Fare, downcast='float')
# imp_train_input.Age = pd.to_numeric(imp_train_input.Age, downcast='float')

In [None]:
# Dtypes and memory usage after the downcast attempt.
imp_train_input.info()

In [None]:
# Step 1: impute Age using only these columns (Fare and Embarked are left out of this step).
age_imputation_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
imp_age = IterativeImputer(
    estimator=ExtraTreesRegressor(random_state=0),
    random_state=0,
)
imp_age_train_output = imp_age.fit_transform(imp_train_input[age_imputation_columns])

In [None]:
# Wrap the imputer output in a DataFrame, restoring the column names of the imputer input.
imp_train_output_df = pd.DataFrame(imp_age_train_output, columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'])

In [None]:
# Overwrite Age with the imputed column.
# NOTE(review): the assignment aligns on index; this assumes imp_train_input still has the default 0..n-1 index — confirm.
imp_train_input.Age = imp_train_output_df.Age

In [None]:
# Dtypes and non-null counts after Age was replaced by the imputed values.
imp_train_input.info()

In [None]:
# Step 2: impute Fare, now also using the already-imputed Age (Embarked is still left out).
fare_imputation_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
imp_fare = IterativeImputer(
    estimator=ExtraTreesRegressor(random_state=0),
    random_state=0,
)
imp_fare_train_output = imp_fare.fit_transform(imp_train_input[fare_imputation_columns])

In [None]:
# Wrap the imputer output in a DataFrame, restoring the column names of the imputer input.
imp_train_output_df = pd.DataFrame(imp_fare_train_output, columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
# Overwrite Fare with the imputed column (index-aligned assignment).
imp_train_input.Fare = imp_train_output_df.Fare

In [None]:
# Dtypes and non-null counts after Fare was replaced by the imputed values.
imp_train_input.info()

In [None]:
# Step 3: run the imputer on all feature columns; only the Embarked column of the result is used afterwards.
# The estimator is a regressor, so imputed Embarked codes are continuous and are snapped to 0/1/2 in a later cell.
imp_embarked = IterativeImputer(estimator=ExtraTreesRegressor(random_state=0), random_state=0)
imp_embarked_train_output = imp_embarked.fit_transform(imp_train_input)

In [None]:
# Wrap the imputer output in a DataFrame with the full list of feature column names.
imp_train_output_df = pd.DataFrame(imp_embarked_train_output, columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [None]:
# Overwrite Embarked with the imputed column (index-aligned assignment).
imp_train_input.Embarked = imp_train_output_df.Embarked

In [None]:
# Snap the continuous imputed Embarked values to the nearest category code
# (0 = S, 1 = Q, 2 = C). A single .loc[row_mask, column] call on the DataFrame
# is used instead of chained `df.col.loc[mask] = value`, which may write to a
# temporary copy and leave imp_train_input unchanged.
imp_train_input.loc[imp_train_input.Embarked >= 1.5, 'Embarked'] = 2.0
imp_train_input.loc[(imp_train_input.Embarked < 1.5) & (imp_train_input.Embarked >= 0.5), 'Embarked'] = 1.0
imp_train_input.loc[imp_train_input.Embarked < 0.5, 'Embarked'] = 0.0

In [None]:
# Request the compact dtypes again now that the imputed columns have been written back.
# NOTE(review): errors='ignore' still hides any failed conversion — confirm the result with the .info() output below.
imp_train_input = imp_train_input.astype({'Pclass': np.int8, 'Sex': np.int8, 'Age': np.float32, 'SibSp': np.int8, 'Parch': np.int8, 'Fare': np.float32, 'Embarked': np.int8}, errors='ignore')

In [None]:
# Final dtypes and non-null counts of the imputed feature table.
imp_train_input.info()

In [None]:
# Add the target column from the original training data next to the imputed features.
imp_train_input['Survived'] = train_data.Survived

In [None]:
# Save the imputed features plus target to CSV, without the index column.
imp_train_input.to_csv('final_input.csv', index=False)

### Check Points
   1. **The Cabin column in test_data has 70,831 missing values (70.8% of the information is missing), and in the training data 67.8% is missing. It is better to ignore this column.**

# 5. Prediction using XGBoost

In [None]:
# Separate the features (everything except Survived) from the target.
X = imp_train_input.drop(columns=['Survived'])
y = imp_train_input.Survived

# Hold out 20% of the rows for validation; the seed is fixed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Preview the feature table that is fed to the model.
X.head()

In [None]:
# XGBRegressor is already imported in the notebook's set-up cell, so it is not
# re-imported here; keeping imports in one place makes dependencies visible.
# NOTE(review): Survived is a 0/1 label but this is a regressor, so predictions
# are continuous scores - confirm that is the intended formulation.

# Define the model (seeded for reproducibility)
my_model_1 = XGBRegressor(random_state=0)

# Fit the model on the training split
my_model_1.fit(X_train, y_train)

In [None]:
# Score the hold-out rows with the fitted model.
predictions_1 = my_model_1.predict(X_valid)

# Mean absolute error between the true labels and the predicted scores.
mae_1 = mean_absolute_error(y_true=y_valid, y_pred=predictions_1)

print("Mean Absolute Error:", mae_1)

# 6. Appendix

### Using Last name to predict values

In [None]:
# Split each Name on the comma into two labelled columns and trim whitespace.
# NOTE(review): the text before the comma is labelled 'F_name'; a later cell
# looks up 'Mcmullen' through F_name under the heading "Using Last name", so
# the two labels may be swapped - confirm against the data.
temp = (
    test_data['Name']
    .str.split(pat=',', expand=True)
    .rename(columns={0: 'F_name', 1: 'L_name'})
)
temp['F_name'] = temp['F_name'].str.strip()
temp['L_name'] = temp['L_name'].str.strip()

In [None]:
# Build the combined list of column labels: test_data's columns followed by
# the columns of the split-name frame, then display it.
b = temp.columns.to_list()
name = [*test_data.columns.to_list(), *b]
name

In [None]:
# Append the split-name columns to the test data side by side.
# Concatenating along axis=1 without ignore_index keeps every original column
# label, so no hard-coded position-to-name mapping is needed to restore them.
test_data_with_name = pd.concat([test_data, temp], axis=1)
test_data_with_name

In [None]:
# Non-null counts of every column for each distinct L_name value.
test_data_with_name.groupby('L_name').count()

In [None]:
# Distinct L_name values among the rows whose Age is missing.
test_data_with_name.loc[test_data_with_name['Age'].isna(), 'L_name'].unique()

In [None]:
# All rows whose F_name is 'Mcmullen' (raises KeyError if there is none).
test_data_with_name.groupby('F_name').get_group('Mcmullen')

In [None]:
# Print dtypes and non-null counts of the training frame with the age-group column.
train_data_with_age_group.info()

### Check Points
   1. 'Pclass' and 'Fare' might have correlation. Normally, higher class seat is more expensive than lower class seat.
   2.  Same ticket number might have same fare.
   3. Cabin number is also correlated with fare, but there are too many missing values.

### ***This link explains cabins on cruise ships: [Explain Cabin](https://www.cruisecritic.com/articles.cfm?ID=77)***

In [None]:
# Keep only the training rows whose Age is missing.
temp_age_null = train_data_with_age_group[train_data_with_age_group['Age'].isna()]

In [None]:
# Print dtypes and non-null counts for the rows that lack an Age value.
temp_age_null.info()

In [None]:
# Per-column mapping from category labels to integer codes, applied with
# DataFrame.replace so that the columns can be used in a correlation matrix.
transform_numerics = {
    'Sex': {'male': 0, 'female': 1},
    'Age_group': {
        'Child': 0,
        'Young': 1,
        'Young Adult': 2,
        'Adult': 3,
        'Old': 4,
    },
}
temp_numerics = train_data_with_age_group.replace(to_replace=transform_numerics)

In [None]:
# Pairwise correlation matrix of the numerically encoded feature columns.
temp_numerics.loc[:, ['Pclass', 'Sex', 'Age_group', 'SibSp', 'Parch', 'Fare']].corr()

### Check Points
   1. Fare and Pclass have a weak correlation, but the other columns show no correlation with Fare.

### ***Questions***
1. ***Is it true that Sex and Age Group don't correlate with Fare?***

In [None]:
# The ten most frequent Ticket values and how often each occurs.
temp = (
    train_data_with_age_group['Ticket']
    .value_counts()
    .nlargest(n=10)
)
temp

In [None]:
# Trim leading/trailing whitespace so the exact-match ticket filters work.
train_data_with_age_group['Ticket'] = train_data_with_age_group['Ticket'].str.strip()

In [None]:
# Fare statistics (count/min/max/mean) for passengers whose ticket is exactly
# 'A/5', broken down by class, age group and sex.
# The selected column must be 'Age_group' (underscore), matching the groupby
# key below; selecting 'Age group' with a space raised a KeyError.
temp_index = (train_data_with_age_group['Ticket'] == 'A/5')
temp = train_data_with_age_group.loc[temp_index, ['Pclass', 'Age_group', 'Sex', 'Fare']]
temp2 = temp.groupby(by=['Pclass', 'Age_group', 'Sex']).Fare.agg(['count', 'min', 'max', 'mean'])
temp2

In [None]:
# Fare statistics (count/min/max/mean) for passengers whose ticket is exactly
# 'C.A.', broken down by class, age group and sex.
# The selected column must be 'Age_group' (underscore), matching the groupby
# key below; selecting 'Age group' with a space raised a KeyError.
temp_index = (train_data_with_age_group['Ticket'] == 'C.A.')
temp = train_data_with_age_group.loc[temp_index, ['Pclass', 'Age_group', 'Sex', 'Fare']]
temp2 = temp.groupby(by=['Pclass', 'Age_group', 'Sex']).Fare.agg(['count', 'min', 'max', 'mean'])
temp2

In [None]:
# Fare statistics (count/min/max/mean) for passengers whose ticket is exactly
# 'SOTON/O.Q.', broken down by class, age group and sex.
# The selected column must be 'Age_group' (underscore), matching the groupby
# key below; selecting 'Age group' with a space raised a KeyError.
temp_index = (train_data_with_age_group['Ticket'] == 'SOTON/O.Q.')
temp = train_data_with_age_group.loc[temp_index, ['Pclass', 'Age_group', 'Sex', 'Fare']]
temp2 = temp.groupby(by=['Pclass', 'Age_group', 'Sex']).Fare.agg(['count', 'min', 'max', 'mean'])
temp2

In [None]:
# Overall minimum, maximum and mean Fare across the whole training frame.
train_data_with_age_group['Fare'].agg(['min', 'max', 'mean'])

In [None]:
# Horizontal box plot of Fare, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
sns.boxplot(x='Fare', data=train_data_with_age_group, ax=ax)
ax.set_title('Fare Diversity')
ax.grid()
plt.show()

### Check Points
 1. Univariate feature imputation such as 'SimpleImputer' is not a good choice on this data set since 'Pclass', 'Age Group', and 'Sex' columns are somewhat correlated.
 2. Nearest Neighbors imputation (KNN) is not a good choice since KNN is sensitive to outliers. The 'Fare Diversity' plot shows that the Fare column has a wide spread of values.
 3. Multivariate feature imputation such as 'IterativeImputer' with 'ExtraTreesRegressor' will be a good choice.

### Don’t use a sample size less than 30.
### It has been statistically proven that 30 is the smallest sample size where an average result of a sample starts to represent the average result of a population.
### **If your machine learning model doesn't require numeric input, you don't need to transform categorical variables into numeric variables.**

In [None]:
# pandas_profiling.ProfileReport(data)