In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the session, which also hides
# pandas copy/chained-assignment warnings and scikit-learn convergence warnings.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [44]:
# Keep only the four predictors and the target, in exactly this column order
# (later cells slice X and y out of df by position).
df = pd.read_csv('/content/train.csv')[
    ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
]

In [45]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [46]:
# Drop every row that has a missing value in any of the selected columns.
# Rebinding (instead of inplace=True) avoids mutating a frame that was itself
# produced by a column selection, and keeps the step chainable.
df = df.dropna()

In [47]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [48]:
# Split predictors and target by column name rather than by position, and take an
# explicit copy of the predictors: later cells add and drop columns on X, which must
# not operate on a slice of df.
X = df[['Age', 'Pclass', 'SibSp', 'Parch']].copy()
y = df['Survived']

In [49]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [50]:
# Baseline: mean accuracy of a default LogisticRegression over 20 cross-validation folds
# on the features currently held in X.
cross_val_score(LogisticRegression(), X, y, cv=20, scoring='accuracy').mean()

0.6933333333333332

# Apply Feature Construction

In [51]:
# Construct a single family-size feature: the SibSp and Parch counts plus one
# for the passenger themself.
X['Family_size'] = X['SibSp'] + X['Parch'] + 1 # Adding that passenger to the family

In [52]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [53]:
def myfunc(num):
  """Map a family size to a coarse family-type code.

  Returns 0 when ``num`` is exactly 1, 1 when ``num`` is greater than 1 and at
  most 4, and 2 for every other value.
  """
  if num == 1:
    return 0
  return 1 if 1 < num <= 4 else 2

In [54]:
myfunc(4)

1

In [55]:
# Bucket each family size into the 0/1/2 code returned by myfunc.
X['Family_type'] = X['Family_size'].apply(myfunc)

In [56]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [57]:
# Keep only Age, Pclass and the constructed Family_type; the raw counts and the
# intermediate Family_size are now redundant. Rebind rather than drop in place so
# the original object is never mutated.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [58]:
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [59]:
# Same 20-fold evaluation as the baseline, now on Age, Pclass and Family_type.
cross_val_score(LogisticRegression(), X, y, cv=20, scoring='accuracy').mean()

0.7003174603174602

# Feature Splitting

In [60]:
# Reload the file with all of its columns for the feature-splitting section.
# This rebinds df: the reduced, NaN-dropped frame used above is no longer
# reachable under this name.
df = pd.read_csv('/content/train.csv')

In [61]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [62]:
df['Name']

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
...,...
886,"Montvila, Rev. Juozas"
887,"Graham, Miss. Margaret Edith"
888,"Johnston, Miss. Catherine Helen ""Carrie"""
889,"Behr, Mr. Karl Howell"


In [63]:
# Exploration: split each name on ', ' and keep the second piece,
# i.e. everything after the surname (e.g. "Mr. Owen Harris").
df['Name'].str.split(', ', expand = True)[1]

Unnamed: 0,1
0,Mr. Owen Harris
1,Mrs. John Bradley (Florence Briggs Thayer)
2,Miss. Laina
3,Mrs. Jacques Heath (Lily May Peel)
4,Mr. William Henry
...,...
886,Rev. Juozas
887,Miss. Margaret Edith
888,"Miss. Catherine Helen ""Carrie"""
889,Mr. Karl Howell


In [65]:
# Exploration: split each name on '.' and keep the first piece,
# i.e. surname plus title (e.g. "Braund, Mr").
df['Name'].str.split('.', expand = True)[0]

Unnamed: 0,0
0,"Braund, Mr"
1,"Cumings, Mrs"
2,"Heikkinen, Miss"
3,"Futrelle, Mrs"
4,"Allen, Mr"
...,...
886,"Montvila, Rev"
887,"Graham, Miss"
888,"Johnston, Miss"
889,"Behr, Mr"


In [67]:
# Combine both splits: the title is the text between ', ' and the first '.'.
df['Title'] = (
    df['Name']
    .str.split(', ', expand=True)[1]   # drop the surname
    .str.split('.', expand=True)[0]    # keep what precedes the first period
)

In [68]:
df[['Title', 'Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [72]:
# Make sure the target is numeric before averaging it. With the default settings
# pd.to_numeric raises if a value cannot be parsed (nothing is coerced to NaN here).
df['Survived'] = pd.to_numeric(df['Survived'])

# Title = the text between ', ' and the first '.' in Name (e.g. "Mr", "Mrs", "Miss").
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

# Survival rate per title, highest first. Selecting 'Survived' before aggregating
# averages only that column instead of every numeric column in the frame.
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
the Countess,1.0
Mlle,1.0
Sir,1.0
Ms,1.0
Lady,1.0
Mme,1.0
Mrs,0.792
Miss,0.697802
Master,0.575
Col,0.5


In [73]:
# Flag passengers whose title is exactly 'Mrs'.
# The assignment goes through a single df.loc[rows, column] call: the chained form
# df['Is_Married'].loc[mask] = 1 writes to a temporary Series and does not update
# df when pandas Copy-on-Write is active.
df['Is_Married'] = 0
df.loc[df['Title'] == 'Mrs', 'Is_Married'] = 1

In [74]:
df['Is_Married']

Unnamed: 0,Is_Married
0,0
1,1
2,0
3,1
4,0
...,...
886,0
887,0
888,0
889,0
