In [94]:
import pandas as pd

initial_training_data = pd.read_csv("data/train.csv")
init_size = initial_training_data.shape
print(f"Inital shape: {init_size}")

initial_training_data

Inital shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Data Cleaning

I decided to drop the Cabin coloumn as it contained as the majority of entries where NaN. The remaining entries did not add much information to the dataset.

## Splitting the name column into first name, last name, and, honourific
I thought that honourifics may play a role in the survival of passengers they will then be keyed, in the event of a lack of an honourific "none" will be used

## Finding the rareity of given names
Using the us [Social Security Administraions (SSA) records](https://www.ssa.gov/oact/babynames/limits.html) I will find the birthyear of each passanger and check the receords for the frequency of babies born during that year. My idea is that rarer names may be indicidive of upper class passengers and as such it could influence survival rates.
As the records only go back to 1880 if they where born before that date I took the records from 1880.

In [95]:
# Removing Cabin Coloumn
del initial_training_data['Cabin']

#Removing NaN row entries
initial_training_data.dropna(inplace=True)
new_size = initial_training_data.shape

print(f"Rows that countained NaN: {init_size[0]- new_size[0]}")
print(f"Updated Shape: {new_size}")

Rows that countained NaN: 179
Updated Shape: (712, 11)


In [96]:
# casting age as an int value
initial_training_data["Age"] = initial_training_data["Age"].astype(int)

# formating Sex data to either be M or F
x = lambda str : "M" if str == "male" else "F"
initial_training_data["Sex"] = initial_training_data["Sex"].apply(x)

In [97]:
first_name = []
last_name = []
honourific = []

common_honourifics = ["Master.", "Mr.", "Miss.", "Mrs.", "Ms."]

for index, row in initial_training_data.iterrows():
    name = row["Name"].split()
    
    last_name.append(name[0].strip(","))

    if name[1] in common_honourifics:
        honourific.append(name[1])
        first_name.append(name[2])

    else:
        honourific.append("none")
        first_name.append(name[1])

initial_training_data['First_Name'] = first_name
initial_training_data['Last_Name'] = last_name
initial_training_data['Honourific'] = honourific

initial_training_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,First_Name,Last_Name,Honourific
0,1,0,3,"Braund, Mr. Owen Harris",M,22,1,0,A/5 21171,7.2500,S,Owen,Braund,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",F,38,1,0,PC 17599,71.2833,C,John,Cumings,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",F,26,0,0,STON/O2. 3101282,7.9250,S,Laina,Heikkinen,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",F,35,1,0,113803,53.1000,S,Jacques,Futrelle,Mrs.
4,5,0,3,"Allen, Mr. William Henry",M,35,0,0,373450,8.0500,S,William,Allen,Mr.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",F,39,0,5,382652,29.1250,Q,William,Rice,Mrs.
886,887,0,2,"Montvila, Rev. Juozas",M,27,0,0,211536,13.0000,S,Rev.,Montvila,none
887,888,1,1,"Graham, Miss. Margaret Edith",F,19,0,0,112053,30.0000,S,Margaret,Graham,Miss.
889,890,1,1,"Behr, Mr. Karl Howell",M,26,0,0,111369,30.0000,C,Karl,Behr,Mr.


In [107]:
#Name frequency by birth year
name_freq = []

year_data = {}

for index, row in initial_training_data.iterrows():
    birthyear = 1912 - row["Age"]
    
    # historical records go back to 1880
    if birthyear < 1880:
        birthyear = 1880

    if birthyear in year_data.keys():
        pass

    else:
        filename = f"yob{birthyear}.txt"
        year_data[birthyear] = pd.read_csv("data/namedata/"+filename, header= None, names=["Name", "Sex", "Freq"])

    curr_year = year_data[birthyear]
    curr_row = curr_year.loc[(curr_year["Name"] == row["First_Name"]) & (curr_year["Sex"] == row["Sex"])]

    print(curr_row["Freq"])
    #name_freq.append(curr_row["Freq"])

#initial_training_data["Name_Freq"] = name_freq

#initial_training_data  
    

1730    68
Name: Freq, dtype: int64
228    46
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
Series([], Name: Freq, dtype: int64)
943    9532
Name: Freq, dtype: int64
1124    70
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
Series([], Name: Freq, dtype: int64)
Series([], Name: Freq, dtype: int64)
87    913
Name: Freq, dtype: int64
3    1939
Name: Freq, dtype: int64
1662    7782
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
253    138
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
2828    888
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
948    2632
Name: Freq, dtype: int64
1006    257
Name: Freq, dtype: int64
1    5429
Name: Freq, dtype: int64
1173    8897
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
Series([], Name: Freq, dtype: int64)
1656    3723
Name: Freq, dtype: int64
Series([], Name: Freq, dtype: int64)
952    2364
Name: Freq, dtype: int64
1235    290
Name: Freq, dtype: int64
1014    2