# Start

In [1]:
import pandas as pd

df = pd.read_csv("raw_train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
df.shape

(891, 12)

# Dropping unnecessary columns

In [4]:
df.drop(["Name", "Ticket", "PassengerId"], axis=1, inplace=True)

# Check for NaN

In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

Since most of the valuse for ``Cabin`` is missing, we can drop that column

In [6]:
df.drop("Cabin", axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Converting columns to numbers

## Sex

In [8]:
df["Sex_encoded"] = df["Sex"].apply(lambda x: 1 if x == "male" else 0)

## Embarked

In [9]:
# s c q
df["Embarked"] = df["Embarked"].astype("category")
df["Embarked_encoded"] = df["Embarked"].cat.codes

In [10]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_encoded,Embarked_encoded
0,0,3,male,22.0,1,0,7.25,S,1,2
1,1,1,female,38.0,1,0,71.2833,C,0,0
2,1,3,female,26.0,0,0,7.925,S,0,2
3,1,1,female,35.0,1,0,53.1,S,0,2
4,0,3,male,35.0,0,0,8.05,S,1,2


# Fixing NaN values in ``Age``

We can groupby ``Sex`` and ``Pclass`` and fill NaN values based on the mean values in the respectice groups

In [11]:
sum(df["Age"].isnull())

177

In [12]:
df["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [13]:
male_df = df[df["Sex_encoded"] == 1]
female_df = df[df["Sex_encoded"] == 0]

## Male

In [14]:
male_df["Age"].describe()

count    453.000000
mean      30.726645
std       14.678201
min        0.420000
25%       21.000000
50%       29.000000
75%       39.000000
max       80.000000
Name: Age, dtype: float64

In [15]:
male_df["Age"] = male_df.groupby("Pclass")["Age"].transform(lambda x: x.fillna(round(x.mean(), 0)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  male_df["Age"] = male_df.groupby("Pclass")["Age"].transform(lambda x: x.fillna(round(x.mean(), 0)))


## Female

In [16]:
female_df["Age"].describe()

count    261.000000
mean      27.915709
std       14.110146
min        0.750000
25%       18.000000
50%       27.000000
75%       37.000000
max       63.000000
Name: Age, dtype: float64

In [17]:
female_df["Age"] = female_df.groupby("Pclass")["Age"].transform(lambda x: x.fillna(round(x.mean(), 0)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  female_df["Age"] = female_df.groupby("Pclass")["Age"].transform(lambda x: x.fillna(round(x.mean(), 0)))


## Creating a new df after filling NaN values

In [18]:
df_1 = pd.concat([male_df, female_df])
df_1.sort_index(axis=0, inplace=True)

In [19]:
df_1

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_encoded,Embarked_encoded
0,0,3,male,22.0,1,0,7.2500,S,1,2
1,1,1,female,38.0,1,0,71.2833,C,0,0
2,1,3,female,26.0,0,0,7.9250,S,0,2
3,1,1,female,35.0,1,0,53.1000,S,0,2
4,0,3,male,35.0,0,0,8.0500,S,1,2
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,1,2
887,1,1,female,19.0,0,0,30.0000,S,0,2
888,0,3,female,22.0,1,2,23.4500,S,0,2
889,1,1,male,26.0,0,0,30.0000,C,1,0


# Categorise ``Age``

In [20]:
age_categories_df = pd.DataFrame.from_dict(
    {
        1: ['0 - 18', 1],
        2: ['19 - 40', 2],
        3: ['41 - 60', 3],
        4: ['60+', 4]
    },
    orient="index",
    columns=["Age group", "Category"]
)

age_categories_df

Unnamed: 0,Age group,Category
1,0 - 18,1
2,19 - 40,2
3,41 - 60,3
4,60+,4


In [21]:
def categorize_age(x):
    if x <= 18:
        return 1
    elif x > 18 and x <= 40:
        return 2
    elif x > 40 and x <= 60:
        return 3
    else:
        return 4

df_1["Age_categorized"] = df_1["Age"].apply(categorize_age)

In [22]:
df_1

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_encoded,Embarked_encoded,Age_categorized
0,0,3,male,22.0,1,0,7.2500,S,1,2,2
1,1,1,female,38.0,1,0,71.2833,C,0,0,2
2,1,3,female,26.0,0,0,7.9250,S,0,2,2
3,1,1,female,35.0,1,0,53.1000,S,0,2,2
4,0,3,male,35.0,0,0,8.0500,S,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,1,2,2
887,1,1,female,19.0,0,0,30.0000,S,0,2,2
888,0,3,female,22.0,1,2,23.4500,S,0,2,2
889,1,1,male,26.0,0,0,30.0000,C,1,0,2


# Exporting the new file

In [23]:
df_1.to_csv("clean_train.csv")