In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [186]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score

# Feature Construction

In [8]:
df = pd.read_csv('train.csv', usecols=['Age','Pclass', 'SibSp', 'Parch', 'Survived'])

In [9]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [11]:
df.dropna(inplace=True)

In [14]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [17]:
x = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [48]:
print(np.mean(cross_val_score(estimator=LogisticRegression(), X=x, y=y, cv=20)))
print(np.mean(cross_val_score(estimator=DecisionTreeClassifier(), X=x, y=y, cv=20)))

0.7031746031746031
0.6582142857142856


### Combining features

In [22]:
x['Family'] = x['SibSp'] + x['Parch'] + 1

In [28]:
new_x = x.drop(columns=['SibSp', 'Parch'])

In [49]:
print(np.mean(cross_val_score(estimator=LogisticRegression(), X=new_x, y=y, cv=20)))
print(np.mean(cross_val_score(estimator=DecisionTreeClassifier(), X=new_x, y=y, cv=20)))

0.7003174603174602
0.6501190476190476


In [43]:
def func(val):
    if val == 1:
        # travelling alone
        return 0
    elif val>1 and val<=4:
        # small family
        return 1
    else:
        # large family
        return 2

In [44]:
x['Family_Type'] = x['Family'].apply(func)

In [45]:
new_x = x.drop(columns=['SibSp', 'Parch', 'Family'])

In [50]:
print(np.mean(cross_val_score(estimator=LogisticRegression(), X=new_x, y=y, cv=20)))
print(np.mean(cross_val_score(estimator=DecisionTreeClassifier(), X=new_x, y=y, cv=20)))

0.7003174603174602
0.651547619047619


In [190]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(new_x, y, test_size=0.15, random_state=42)

In [191]:
# predicting the vals

clf = LogisticRegression()
clf2 = DecisionTreeClassifier()

clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)

pred = clf.predict(x_test)
pred2 = clf2.predict(x_test)

print(accuracy_score(pred, y_test))
print(accuracy_score(pred2, y_test))


0.6666666666666666
0.6481481481481481


In [192]:
print(np.mean(cross_val_score(estimator=clf, X=new_x, y=y, cv=20)))
print(np.mean(cross_val_score(estimator=clf2, X=new_x, y=y, cv=20)))

0.7003174603174602
0.6501190476190476


# Feature Splitting

In [135]:
df = pd.read_csv('train.csv')

In [136]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [137]:
df['Title'] = df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0]

In [164]:
    df['Title2'] = df['Name'].str.extract('([A-Za-z]+)\.',expand=False)


  df['Title2'] = df['Name'].str.extract('([A-Za-z]+)\.',expand=False)


In [165]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Title2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Mr


In [140]:
df['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Mlle              2
Major             2
Ms                1
Mme               1
Don               1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64

In [141]:
(df.groupby('Title')['Survived'].mean() * 100).sort_values(ascending=False)

Title
Lady            100.000000
Ms              100.000000
Sir             100.000000
Mme             100.000000
the Countess    100.000000
Mlle            100.000000
Mrs              79.200000
Miss             69.780220
Master           57.500000
Major            50.000000
Col              50.000000
Dr               42.857143
Mr               15.667311
Capt              0.000000
Jonkheer          0.000000
Don               0.000000
Rev               0.000000
Name: Survived, dtype: float64

In [161]:
mask = df['Title'] == "Mrs"
df[mask]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Title2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Mrs
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,Mrs
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs,Mrs
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs,Mrs
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S,Mrs,Mrs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,Mrs,Mrs
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C,Mrs,Mrs
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,Mrs,Mrs
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S,Mrs,Mrs


In [158]:
df['Title'] = df['Title'].str.strip(' ')

In [159]:
df['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [182]:
df['new'] =  df['Ticket'].str.split(' ([0-9]+) ', expand=False)

In [183]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Title2,new
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Mr,[A/5 21171]
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Mrs,[PC 17599]
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Miss,[STON/O2. 3101282]
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Mrs,[113803]
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Mr,[373450]
