In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

import os

In [None]:
# Limit floating-point output to three decimals, for pandas and NumPy alike.
pd.options.display.precision = 3
np.set_printoptions(precision=3)

## Read the dataset

In [None]:
# Location of the CSV file, relative to the notebook's working directory.
filename = 'titanic.csv'
data_dir = os.path.join('..', '..', 'datasets')
path = os.path.join(data_dir, filename)

# Load the complete table into a DataFrame.
data = pd.read_csv(path)

In [None]:
# Summary of the frame: column names, non-null counts, dtypes and memory usage.
data.info()

## A first inspection of the dataset

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# NOTE(review): a negative n makes head() return every row except the last 10
# (the notebook display then truncates the middle of the table); use head(10)
# if only the first rows were intended.
data.head(-10)

### Delete useless data columns (variables, features)

In [None]:
# Remove the identifier-like columns 'Name', 'Ticket' and 'PassengerId'.
# Re-binding `data` (instead of inplace=True) together with errors='ignore'
# keeps the cell safe to re-run on a frame where they are already gone.
data = data.drop(columns=['Name', 'Ticket', 'PassengerId'], errors='ignore')

# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
data.info()

## Missing values

In [None]:
# Display the single row at position 5 as a one-row DataFrame.
data.iloc[5:6]

Simply check for NaN (not a number).

In [None]:
# Number of missing entries per column.
data.isna().sum()

In [None]:
# Collect the names of all columns holding at least one missing entry ...
columns_with_gaps = [name for name in data.columns if data[name].isna().any()]

# ... and report them one by one.
for name in columns_with_gaps:
    print('Column %s contains missing values'%(name))

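Alternatively, compute the minimum (or maximum) of each numerical variable on the underlying NumPy array. Unlike the pandas methods, which skip missing values by default, NumPy's `min`/`max` return `nan` as soon as a column contains a missing value, so a `nan` result reveals the affected columns.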

In [None]:
# Print the minimum of every numeric column, computed on the underlying NumPy
# array (NumPy's min does not skip NaN the way the pandas reduction does).
numeric_columns = [c for c in data.columns if np.issubdtype(data[c].dtype, np.number)]

for c in numeric_columns:
    c_min = data[c].values.min()
    print('Column %s: %f'%(c, c_min))

### Delete rows or columns

In [None]:
# Keep only the rows without any missing value (axis and how are spelled out;
# both are the dropna defaults).
data2 = data.dropna(axis=0, how='any')
data2.info()

In [None]:
# Keep only the columns without any missing value.
data3 = data.dropna(axis='columns')
data3.info()

In [None]:
# Remove just the 'Cabin' column and keep every row.
data4 = data.drop(columns='Cabin')
data4.info()

### Impute missing values

In [None]:
# Work on an independent copy of four columns so that `data` stays untouched.
selected = ['Pclass', 'Sex', 'Age', 'Cabin']
data5 = data.loc[:, selected].copy()

print(data5.head(6), '\n')
print(data5['Age'].describe())

In [None]:
# Fill the missing ages with the mean of the observed ages.
# Assigning the result back (instead of calling an inplace method on the
# column selection) reliably updates data5, also under pandas copy-on-write,
# and avoids np.NaN, an alias that NumPy 2.0 removed.
data5['Age'] = data5['Age'].fillna(data5['Age'].mean())

print(data5.head(6), '\n')
print(data5['Age'].describe())

In [None]:
# Restore the ages (including their NaN entries) from the source frame
data5['Age'] = data['Age']

from sklearn.impute import SimpleImputer

# The imputer is fed a 2-D, single-column array
age_2d = data5['Age'].to_numpy().reshape(-1, 1)

# Learn the column mean first, then substitute it for every NaN
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(age_2d)
data5['Age'] = imp.transform(age_2d).ravel()

data5['Age'].describe()

In [None]:
# Restore the ages (including their NaN entries) from the source frame
data5['Age'] = data['Age']

# Median imputation, fitting and transforming in a single call
imp = SimpleImputer(strategy='median', missing_values=np.nan)
age_2d = data5['Age'].to_numpy().reshape(-1, 1)
data5['Age'] = imp.fit_transform(age_2d).ravel()

data5['Age'].describe()

In [None]:
# Restore the ages (including their NaN entries) from the source frame
data5['Age'] = data['Age']

# Imputation with the most frequent value (not really meaningful here)
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
age_2d = data5['Age'].to_numpy().reshape(-1, 1)
data5['Age'] = imp.fit_transform(age_2d).ravel()

data5['Age'].describe()

### Replace with a unique category

In [None]:
print(data5.head(6), '\n')

# Mark missing cabins with the placeholder category 'U'.
# The result is assigned back to the column: an inplace fillna on the
# attribute selection `data5.Cabin` acts on an intermediate object, which is
# deprecated and leaves data5 unchanged under pandas copy-on-write.
data5['Cabin'] = data5['Cabin'].fillna('U')

print(data5.head(6))

In [None]:
# Restore the cabin entries (including the missing ones) from the source frame
data5['Cabin'] = data['Cabin']

# Same replacement via scikit-learn: every NaN becomes the constant 'U'
imp = SimpleImputer(strategy='constant', fill_value='U', missing_values=np.nan)
cabin_2d = data5['Cabin'].to_numpy().reshape(-1, 1)
data5['Cabin'] = imp.fit_transform(cabin_2d).ravel()

print(data5.head(6))

## Transforming data

### Introducing a new variable/feature

In [None]:
# First six rows of data5 as the starting point for the transformations below.
print(data5.head(6))

In [None]:
# 'Gender' is a binary indicator derived from 'Sex': 1 for female, 0 for male
gender_codes = {'female': 1, 'male': 0}
data5['Gender'] = data5['Sex'].map(gender_codes).astype(int)

# 'Deck' holds the first character of each 'Cabin' string
data5['Deck'] = data5['Cabin'].str.get(0)

print(data5.head(6))

In [None]:
# Distinct values of the two categorical variables 'Pclass' and 'Sex'.
print(data5['Pclass'].unique(), data5['Sex'].unique())

### Encoding categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Fit a label encoder on the passenger classes and list the classes it found.
le = LabelEncoder()
le.fit(data5['Pclass'])

print(le.classes_)

In [None]:
# Show the integer code the encoder assigns to each passenger class.
pclass_values = [1, 2, 3]
for v, code in zip(pclass_values, le.transform(pclass_values)):
    print(f'{v:3d}  ->  {code}')

In [None]:
# Label-encode the 'Sex' strings and show the class -> code mapping.
le = LabelEncoder()
le.fit(data5['Sex'])

for v, code in zip(le.classes_, le.transform(le.classes_)):
    print(f'{v:6}  ->  {code}')

In [None]:
# One-hot encode 'Sex'; the encoder is fitted on a 2-D, single-column array.
ohe = OneHotEncoder()
ohe.fit(data5['Sex'].to_numpy().reshape(-1, 1))

# Encode every category once; transform() yields a sparse matrix, so densify it.
categories = ohe.categories_[0]
codes = ohe.transform(categories.reshape(-1, 1)).toarray()

for v, c in zip(categories, codes):
    print(f'{v:6}  ->  {c}')

In [None]:
# One-hot encode 'Pclass'; the encoder is fitted on a 2-D, single-column array.
ohe = OneHotEncoder()
ohe.fit(data5['Pclass'].to_numpy().reshape(-1, 1))

# Encode every category once; transform() yields a sparse matrix, so densify it.
categories = ohe.categories_[0]
codes = ohe.transform(categories.reshape(-1, 1)).toarray()

for v, c in zip(categories, codes):
    print(f'{v:6}  ->  {c}')

In [None]:
# drop='first' omits the indicator of the first category; use it when the
# dummy columns must not be collinear.
ohe = OneHotEncoder(drop='first')
ohe.fit(data5['Pclass'].to_numpy().reshape(-1, 1))

# Encode every category once; transform() yields a sparse matrix, so densify it.
categories = ohe.categories_[0]
codes = ohe.transform(categories.reshape(-1, 1)).toarray()

for v, c in zip(categories, codes):
    print(f'{v:6}  ->  {c}')

In [None]:
# Add the label-encoded version of 'Sex' as the new column 'SexCode'.
le = LabelEncoder()
le.fit(data5['Sex'])
data5['SexCode'] = le.transform(data5['Sex'])

print(data5.head())

Dummy variables in Pandas

In [None]:
# get_dummies replaces 'Sex' by one indicator column per category,
# so the original 'Sex' column is no longer available in data6.
data6 = pd.get_dummies(data5, columns=['Sex'])

print(data6.head())

In [None]:
# State of data6 before 'Pclass' is encoded.
print(data6.head(10), '\n')

# Replace 'Pclass' by indicator columns; drop_first=True leaves out the
# indicator of the first category.
# NOTE(review): data6 is overwritten from itself, so a second run of this cell
# no longer finds a 'Pclass' column -- re-run the previous cell first.
data6 = pd.get_dummies(data6, columns=['Pclass'], drop_first=True)

print(data6.head(10))

## Possible Outliers

In [None]:
# Box plot of the 'Fare' variable. whis=5 lets the whiskers reach the farthest
# point within 5 x IQR of the quartiles (the default is 1.5), so only very
# extreme values are drawn as individual outlier points.
fig, ax = plt.subplots(figsize=(8, 2))
sns.boxplot(x=data['Fare'], whis=5, ax=ax)

# The figure is a box plot; "Box-Cox" names an unrelated power transformation,
# so the former title was misleading.
ax.set_title('Box plot of variable Fare');