In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from util_plot import barplot_survived_rate
import re

%matplotlib inline
sns.set(color_codes = True)
plt.figure.rcParams = 13,8

%load_ext autoreload
%autoreload 2

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC



In [2]:
df = pd.read_csv('test.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
# Feature Engineering on Pclass
temp01 = df[['Pclass','PassengerId']]
total = temp01.groupby('Pclass').PassengerId.count()
Proportion = total/total.sum()
print(total,Proportion)

Pclass
1    107
2     93
3    218
Name: PassengerId, dtype: int64 Pclass
1    0.255981
2    0.222488
3    0.521531
Name: PassengerId, dtype: float64


In [33]:
# Feature Engineering on Name
temp02 = df[['Name','PassengerId']]

titles = ['Mr','Ms',"Mrs",'Dr','Miss','Master','Major']
for name in temp02.Name:
    check = False
    for i in titles:
        if i in name:
            check = True
    if not check:
        continue

# Extract title from name
titles_dict = {
    'Ms' : 'Ms',
    'Mrs': 'Mrs',
    'Mr' : 'Mr',
    'Dr' : 'Dr',
    'Miss': 'Ms',
    'Master': 'Master'
}
def extract_title(name):
    check = False
    for title in titles_dict.keys():
        if title in name:
            check = True
            return titles_dict[title]
    if not check:
        return 'Unclassified'
    
temp02['title'] = temp02['Name'].apply(extract_title)

total = temp02.groupby('title').PassengerId.count()

Proportion = total/total.sum()

print(total,Proportion)


title
Dr                3
Master           20
Mr              240
Mrs              72
Ms               78
Unclassified      5
Name: PassengerId, dtype: int64 title
Dr              0.007177
Master          0.047847
Mr              0.574163
Mrs             0.172249
Ms              0.186603
Unclassified    0.011962
Name: PassengerId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [42]:
# Feature Engineering on Sex
temp03 = df[['Sex','PassengerId']]
def numeric_sex(sex):
    if sex == 'male':
        return 1
    else:
        return 0

temp03['Sex'] = temp03['Sex'].apply(numeric_sex)
total = temp03.groupby('Sex').PassengerId.count()
proportion = total/total.sum()
print(total,proportion)


Sex
0    152
1    266
Name: PassengerId, dtype: int64 Sex
0    0.363636
1    0.636364
Name: PassengerId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [37]:
# Feature Engineering on Age
temp04 = df[['Age','PassengerId']]

def divide_age(age):
    """
    Definition of age group
        teenage: 0 <= age < 20
        young adult: 20 <= age <40
        adult: 40 <= age < 60
        elder: 60 <= age < 81
    """
    age_group = {'teenage':(0,20),'adult':(20,60),'elder':(60,81)}
    for i,j in age_group.items():
        if age >= j[0] and age < j[1]:
            return i
    else:
        return 'Unclassified'
    
temp04['Age'] = temp04['Age'].apply(divide_age)
total = temp04.groupby('Age').PassengerId.count()
proportion = total/total.sum()
print(total,proportion)

Age
Unclassified     86
adult           257
elder            14
teenage          61
Name: PassengerId, dtype: int64 Age
Unclassified    0.205742
adult           0.614833
elder           0.033493
teenage         0.145933
Name: PassengerId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
# Feature Engineering on Sibling and parch

temp05 = df[['PassengerId','SibSp','Parch']]
temp05['family_size'] = temp05['SibSp'] + temp05['Parch']
total = temp05.groupby('family_size').PassengerId.count()
proportion = total/total.sum()
print(total,proportion)

def group_large(x):
    if x > 6:
        return '>6'
    else:
        return x
    
temp05['family_size'] = temp05['family_size'].apply(group_large)

family_size
0     253
1      74
2      57
3      14
4       7
5       3
6       4
7       2
10      4
Name: PassengerId, dtype: int64 family_size
0     0.605263
1     0.177033
2     0.136364
3     0.033493
4     0.016746
5     0.007177
6     0.009569
7     0.004785
10    0.009569
Name: PassengerId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [27]:
# Feature Engineering on Ticket
temp06 = df[['PassengerId','Ticket']]
def isnumeric(x):
    return x.isnumeric()

def notnumeric(x):
    return not x.isnumeric()

def grouping(x):
    
    def group_num(x):
        """
        Group by first three numbers
        """
        if x[:3].isnumeric():
            num = f"Num {x[0]}"
            return num, True
        else:
            return None,False
        
    def group_A(x):
        """
        Group by A initial
        """
        d = re.search(r'^A.*',x)
        if d:
            return 'initial A', True
        else:
            return None,False
    
    def group_C(x):
        """
        Group by C initial
        """
        d = re.search(r'^C.*',x)
        if d:
            return 'initial C', True
        else:
            return None,False
    
    def group_F(x):
        """
        Group by F initial
        """
        d = re.search(r'^F.*',x)
        if d:
            return 'initial F', True
        else:
            return None,False
    
    def group_P(x):
        """
        Group by P initial
        """
        d = re.search(r'^P.*',x)
        if d:
            return 'initial P', True
        else:
            return None,False
    
    def group_S(x):
        """
        Group by S initial
        """
        d = re.search(r'^S.*',x)
        if d:
            return 'initial S', True
        else:
            return None,False
    
    def group_W(x):
        """
        Group by W initial
        """
        d = re.search(r'^W.*',x)
        if d:
            return 'initial W', True
        else:
            return None,False
    
        
    num,check = group_num(x)
    if check:
        return num
    result,check = group_A(x)
    if check:
        return result
    result,check = group_C(x)
    if check:
        return result
    result,check = group_F(x)
    if check:
        return result
    result,check = group_P(x)
    if check:
        return result
    result,check = group_S(x)
    if check:
        return result
    result,check = group_W(x)
    if check:
        return result

temp06['Grouped_ticket'] = temp06.Ticket.apply(grouping)

# Handling missing value, replacing it by mode value
temp06.Grouped_ticket.isnull().sum()
temp06.Grouped_ticket.replace(np.NaN,'Num 3',inplace=True)

total = temp06.groupby("Grouped_ticket").PassengerId.count()
proportion = total/total.sum()
print(total, proportion)
missing_groups_ticket = ['Num 5','Num 8']



Grouped_ticket
Num 1         64
Num 2         95
Num 3        129
Num 4          1
Num 6          3
Num 7          4
Num 9          1
initial A     13
initial C     30
initial F      6
initial P     33
initial S     33
initial W      6
Name: PassengerId, dtype: int64 Grouped_ticket
Num 1        0.153110
Num 2        0.227273
Num 3        0.308612
Num 4        0.002392
Num 6        0.007177
Num 7        0.009569
Num 9        0.002392
initial A    0.031100
initial C    0.071770
initial F    0.014354
initial P    0.078947
initial S    0.078947
initial W    0.014354
Name: PassengerId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [28]:
# Feature Engineering on Fare
temp07 = df[["PassengerId",'Fare']]
sorted_Fare = sorted(temp07.Fare)
length_Fare = len(temp07.Fare)
L = 9
k = int(length_Fare/L)
grouping_q = {}
for i in range(1,L+1):
    if k*i <= length_Fare-1:
        grouping_q[f'group{i}'] = sorted_Fare[k*i-1]
    else:
        grouping_q[f'group{i}'] = sorted_Fare[-1]

def grouping_quantile(x):
    for key in grouping_q.keys():
        if x > grouping_q[key]:
            continue
        else:
            return key

temp07['Grouped_Fare_quantile'] = temp07.Fare.apply(grouping_quantile)

# handling missing value, replacing it by median group
temp07.Grouped_Fare_quantile.replace(np.NaN,'group4',inplace=True)

total = temp07.groupby('Grouped_Fare_quantile').PassengerId.count()
proportion = total/total.sum()
print(total, proportion)
missing_groups_fare = ['group5','group6','group7','group8']



Grouped_Fare_quantile
group1    97
group2    97
group3    93
group4    76
group9    55
Name: PassengerId, dtype: int64 Grouped_Fare_quantile
group1    0.232057
group2    0.232057
group3    0.222488
group4    0.181818
group9    0.131579
Name: PassengerId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [29]:
# Feature Engineering on Embarked
temp08 = df[["PassengerId",'Embarked']]
total = temp08.groupby('Embarked').PassengerId.count()
proportion = total/total.sum()
print(total,proportion)

Embarked
C    102
Q     46
S    270
Name: PassengerId, dtype: int64 Embarked
C    0.244019
Q    0.110048
S    0.645933
Name: PassengerId, dtype: float64


In [47]:
# One Hot features

one_hot_pclass = pd.get_dummies(temp01['Pclass'])
one_hot_title = pd.get_dummies(temp02['title'])
one_hot_age = pd.get_dummies(temp04['Age'])
one_hot_family_size = pd.get_dummies(temp05['family_size'])
one_hot_grouped_ticket = pd.get_dummies(temp06['Grouped_ticket'])
one_hot_missing_ticket = pd.DataFrame(np.zeros((df.shape[0],len(missing_groups_ticket))),columns=missing_groups_ticket)
one_hot_grouped_fare_quantile = pd.get_dummies(temp07['Grouped_Fare_quantile'])
one_hot_missing_fare = pd.DataFrame(np.zeros((df.shape[0],len(missing_groups_fare))),columns=missing_groups_fare)
one_hot_embarked = pd.get_dummies(temp08['Embarked'])


In [48]:
# Concatenate all results

results = pd.concat([one_hot_pclass,
                     one_hot_title,
                     temp03['Sex'],
                     one_hot_age,
                     one_hot_family_size,
                     one_hot_grouped_ticket,
                     one_hot_missing_ticket,
                     one_hot_grouped_fare_quantile,
                     one_hot_missing_fare,
                     one_hot_embarked
                    ],axis=1)

In [46]:
len(results.columns)

50

results = pd.concat([temp01,
                     #temp02['title'],
                     temp03,
                     temp05['Age'],
                     temp06['family_size'],
                     #temp06['Parch'],
                     #temp06['SibSp'],
                     #temp10,
                     #temp11
                    ],axis=1)

Unnamed: 0,Pclass,Sex,Age,family_size
0,3,male,adult,0
1,3,female,adult,1
2,2,male,elder,0
3,3,male,adult,0
4,3,female,adult,2


def numeric_sex(sex):
    if sex == 'male':
        return 1
    else:
        return 0

results['Sex'] = results['Sex'].apply(numeric_sex)

result = pd.get_dummies(results,columns=[#'Name',
                                         'Age',
                                         #'Embarked',
                                         'Pclass',
                                         #'Fare',
                                          'family_size'
                                        ])

result.shape

additional = np.zeros((418,4))
temp_add = pd.DataFrame(additional,columns=[f"Fare_group{i}" for i in range(6,10)])
result = pd.concat([result,temp_add],axis=1)

In [50]:
results.shape

(418, 50)

In [51]:
results.to_csv('cleaneddata_test.csv',index=False)