In [21]:
import numpy as np
import pandas as pd
import sklearn.preprocessing

In [22]:
#Data reading
train_data = pd.read_csv("data/cleaned_train.csv")
test_data = pd.read_csv("data/cleaned_test.csv")
train_data.head()
#test_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [23]:
#Generating a new feature: Title
train_data['Title'] = train_data.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
train_data.Title.value_counts()

Title
Mr              515
Miss            181
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [24]:
test_data['Title'] = test_data.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
test_data.Title.value_counts()

Title
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64

In [25]:
#title normalization:

normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royal",
    "Don":        "Royal",
    "Sir" :       "Royal",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royal",
    "Dona":       "Royal",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royal"
}

train_data.Title = train_data.Title.map(normalized_titles)
test_data.Title = test_data.Title.map(normalized_titles)


In [26]:
train_data.Title.value_counts()

Title
Mr         515
Miss       183
Mrs        127
Master      40
Officer     18
Royal        5
Name: count, dtype: int64

In [27]:
test_data.Title.value_counts()

Title
Mr         240
Miss        78
Mrs         73
Master      21
Officer      5
Royal        1
Name: count, dtype: int64

In [28]:
#Encoding categorical variables

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#initialize label encoce as first step.
le = LabelEncoder()

train_data["Sex"] = le.fit_transform(train_data["Sex"].values)
test_data["Sex"] = le.transform(test_data["Sex"].values)

In [29]:
#Tickets aren't meaningful, so we can drop the column

train_data = train_data.drop(["Ticket"], axis = 1)
test_data = test_data.drop(["Ticket"], axis = 1)


In [30]:
train_data["Embarked"].value_counts()

Embarked
S    644
C    165
Q     79
Name: count, dtype: int64

In [31]:
#Encoding embarkment
train_data["Embarked"] = le.fit_transform(train_data["Embarked"].values)
test_data["Embarked"] = le.transform(test_data["Embarked"].values)

In [32]:
#Title encoding
train_data["Title"] = le.fit_transform(train_data["Title"].values)
test_data["Title"] = le.transform(test_data["Title"].values)

In [33]:
#Names aren't meaningful, so we can drop the column
train_data = train_data.drop(["Name"], axis = 1)
test_data = test_data.drop(["Name"], axis = 1)

In [34]:
train_data.Title.value_counts
#est_data.Title.value_counts

<bound method IndexOpsMixin.value_counts of 0      2
1      3
2      1
3      3
4      2
      ..
883    4
884    1
885    1
886    2
887    2
Name: Title, Length: 888, dtype: int64>

In [35]:
train_data["Title"].unique()
#test_data["Title"].unique()

array([2, 3, 1, 0, 5, 4])

In [36]:
train_data.info()
#test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  888 non-null    int64  
 1   Survived     888 non-null    int64  
 2   Pclass       888 non-null    int64  
 3   Sex          888 non-null    int64  
 4   Age          888 non-null    float64
 5   SibSp        888 non-null    int64  
 6   Parch        888 non-null    int64  
 7   Fare         888 non-null    float64
 8   Embarked     888 non-null    int64  
 9   Title        888 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 69.5 KB


In [37]:
train_data.head()
#test_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,1,22.0,1,0,7.25,2,2
1,2,1,1,0,38.0,1,0,71.2833,0,3
2,3,1,3,0,26.0,0,0,7.925,2,1
3,4,1,1,0,35.0,1,0,53.1,2,3
4,5,0,3,1,35.0,0,0,8.05,2,2


In [38]:
#Save my preprocessed data:
#Saving my clean data
train_data.to_csv("data/preprocessed_train.csv", index=False)
test_data.to_csv("data/preprocessed_test.csv", index=False)