In [36]:
 import pandas as pd
 import numpy as np
 import seaborn as sns

 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import cross_val_score

In [37]:
# Load only the target and the numeric columns used by the baseline model.
df = pd.read_csv(
    'train.csv',
    usecols=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch'],
)

In [38]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [39]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Age,177
SibSp,0
Parch,0


In [40]:
# Remove rows with missing values (per the null counts above, only Age has any).
# Rebind the result instead of using inplace=True: it gives the same frame,
# keeps the call chainable, and avoids mutating an object in place.
df = df.dropna()

In [41]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Age,0
SibSp,0
Parch,0


In [42]:
# Separate the target column from the predictor columns.
y = df['Survived']
x = df.drop('Survived', axis=1)

In [43]:
x.head()

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,22.0,1,0
1,1,38.0,1,0
2,3,26.0,0,0
3,1,35.0,1,0
4,3,35.0,0,0


In [44]:
# Baseline: mean 20-fold cross-validated accuracy, expressed as a percentage.
cross_val_score(
    estimator=LogisticRegression(),
    X=x,
    y=y,
    scoring='accuracy',
    cv=20,
).mean() * 100

69.33333333333333

# **Feature Construction**

In [45]:
# Family size = siblings/spouses + parents/children + the passenger themself.
x['Family_size'] = x['SibSp'].add(x['Parch']).add(1)

In [46]:
x.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [47]:
def myfamily(num):
  """Encode a family size as a family-type code.

  Returns:
    0 -- alone (num == 1)
    1 -- small family (1 < num <= 4)
    2 -- large family (any other value)
  """
  if num == 1:
    return 0
  return 1 if 1 < num <= 4 else 2

In [48]:
myfamily(4)

1

In [49]:
# Encode each family size as 0 (alone), 1 (small) or 2 (large) via myfamily.
x['Family_type'] = x['Family_size'].map(myfamily)

In [50]:
x.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,2,1
1,1,38.0,1,0,2,1
2,3,26.0,0,0,1,0
3,1,35.0,1,0,2,1
4,3,35.0,0,0,1,0


In [51]:
# Family_type was derived from these columns, so remove them from the
# feature set. Rebind the result instead of using inplace=True so the frame
# is not mutated in place.
x = x.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [52]:
x.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,1
1,1,38.0,1
2,3,26.0,0
3,1,35.0,1
4,3,35.0,0


In [53]:
# Mean 20-fold cross-validated accuracy (percent) with the current feature set.
cross_val_score(
    estimator=LogisticRegression(),
    X=x,
    y=y,
    scoring='accuracy',
    cv=20,
).mean() * 100

70.03174603174602

# **Feature Splitting**

In [54]:
# Reload the CSV with all columns. This rebinds `df`, replacing the
# column-filtered frame (with missing rows dropped) used in the cells above.
df = pd.read_csv('train.csv')

In [55]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
df['Name'].str.split(',',expand=True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


In [57]:
df['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)

Unnamed: 0,0,1,2
0,Mr,Owen Harris,
1,Mrs,John Bradley (Florence Briggs Thayer),
2,Miss,Laina,
3,Mrs,Jacques Heath (Lily May Peel),
4,Mr,William Henry,
...,...,...,...
886,Rev,Juozas,
887,Miss,Margaret Edith,
888,Miss,"Catherine Helen ""Carrie""",
889,Mr,Karl Howell,


In [58]:
df['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0]

Unnamed: 0,0
0,Mr
1,Mrs
2,Miss
3,Mrs
4,Mr
...,...
886,Rev
887,Miss
888,Miss
889,Mr


In [59]:
# Extract the honorific from names shaped like "Braund, Mr. Owen Harris":
# take the text after the comma, then the text before the first period.
# The piece after the comma begins with a space, so strip it; otherwise the
# titles are stored as ' Mr', ' Mrs', ... and a comparison such as
# df['Title'] == 'Mr' never matches.
# .str[i] picks one piece without building a full expand=True frame.
df['Title'] = (
    df['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)

In [60]:
df[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [65]:
df['Title'].describe()

Unnamed: 0,Title
count,891
unique,17
top,Mr
freq,517


In [68]:
# Survival rate (in percent) for each title, highest first.
(
    df.groupby('Title')['Survived']
    .mean()
    .sort_values(ascending=False)
    .mul(100)
)

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
the Countess,100.0
Mlle,100.0
Sir,100.0
Ms,100.0
Lady,100.0
Mme,100.0
Mrs,79.2
Miss,69.78022
Master,57.5
Col,50.0


We can see that a few titles have a survival rate of 0%, in contrast to the other titles.