# import libraries

In [112]:
import numpy as np
import pandas as pd
import math 
from sklearn.preprocessing import MinMaxScaler

# load dataset

In [113]:
df=pd.read_csv('dataset/titanic.csv')


# exploring data


In [114]:
df.shape


(891, 12)

In [115]:
df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
322,323,1,2,"Slayter, Miss. Hilda Mary",female,30.0,0,0,234818,12.35,,Q
20,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S
774,775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S
582,583,0,2,"Downton, Mr. William James",male,54.0,0,0,28403,26.0,,S
461,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.05,,S
331,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
768,769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q
323,324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harb...",female,22.0,1,1,248738,29.0,,S
618,619,1,2,"Becker, Miss. Marion Louise",female,4.0,2,1,230136,39.0,F4,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S


In [116]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# cleaning data

In [117]:
#drop some columns
drop_col = ['PassengerId','Cabin','Embarked']

df=df.drop(drop_col, axis=1) 
df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare'],
      dtype='object')

In [118]:
#remove columns spaces
df.columns=df.columns.str.replace(' ','')

In [119]:
# check no. fo nulls in columns
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
dtype: int64

In [120]:
#calculate the mean of the Age column
df['Age'].mean()


29.69911764705882

In [121]:
#fill all the empties/nulls in Age column with mean
df['Age'] = df['Age'].fillna(df['Age'].mean())


In [122]:
#drop duplicated rows - if any
df=df.drop_duplicates(keep='first')

In [123]:
df.shape

(891, 9)

In [124]:
#check if there’s any empties existing in the whole dataset or not.
df.isnull().values.any()

False

In [125]:
#combining both of Parch and SibSp columns
df['Alone'] = df.Parch + df.SibSp


In [126]:
df['Alone'].loc[df['Alone']>0] = 'With Family'
df['Alone'].loc[df['Alone'] == 0] = 'Without Family'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [127]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Alone
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,With Family
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,With Family
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Without Family
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,With Family
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Without Family


In [128]:
#create a new column called person which is similar to sex column, contains if the passenger is child
def children(passenger):    
    age , sex = passenger
    if age <16:
        return 'child'
    else:
        return sex
df['person'] = df[['Age','Sex']].apply(children,axis=1)

In [129]:
df.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Alone,person
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,With Family,male
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,With Family,female
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Without Family,female
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,With Family,female
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Without Family,male
5,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,Without Family,male
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,Without Family,male
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,With Family,child
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,With Family,female
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,With Family,child


In [62]:
 df.columns.get_loc("Sex")

3

## Handling Categorical Data 
### Label Encoder

In [130]:
encoder = LabelEncoder()
df['Sex'] = encoder.fit_transform(df['Sex'])
df['Alone'] = encoder.fit_transform(df['Alone'])
df['person'] = encoder.fit_transform(df['person'])
df.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Alone,person
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0,1
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,1,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,0,1
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,1,2
5,0,3,"Moran, Mr. James",1,29.699118,0,0,330877,8.4583,1,2
6,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,1,2
7,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,0,0
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,0,1
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,0,0


### one-hot-encoder

In [135]:

person_dummies = pd.get_dummies(df['person'])
person_dummies.columns=['Male','Female','Child']

alone_dummies = pd.get_dummies(df['Alone'])
#person_dummies.columns=['With_Family','without']
pclass_dummies = pd.get_dummies(df['Pclass'])
pclass_dummies.columns=['class_1','class_2','class_3']


In [136]:
#add all the new columns to our dataset
df = pd.concat([df,pclass_dummies,person_dummies,alone_dummies],axis=1)


In [137]:
df.columns

Index(['Survived',   'Pclass',     'Name',      'Sex',      'Age',    'SibSp',
          'Parch',   'Ticket',     'Fare',    'Alone',   'person',  'class_1',
        'class_2',  'class_3',     'Male',   'Female',    'Child',          0,
                1],
      dtype='object')

In [133]:
df.sample(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Alone,person
659,0,1,"Newell, Mr. Arthur Webster",1,58.0,0,2,35273,113.275,0,2
557,0,1,"Robbins, Mr. Victor",1,29.699118,0,0,PC 17757,227.525,1,2
827,1,2,"Mallet, Master. Andre",1,1.0,0,2,S.C./PARIS 2079,37.0042,0,0
640,0,3,"Jensen, Mr. Hans Peder",1,20.0,0,0,350050,7.8542,1,2
668,0,3,"Cook, Mr. Jacob",1,43.0,0,0,A/5 3536,8.05,1,2
275,1,1,"Andrews, Miss. Kornelia Theodosia",0,63.0,1,0,13502,77.9583,0,1
315,1,3,"Nilsson, Miss. Helmina Josefina",0,26.0,0,0,347470,7.8542,1,1
865,1,2,"Bystrom, Mrs. (Karolina)",0,42.0,0,0,236852,13.0,1,1
533,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",0,29.699118,0,2,2668,22.3583,0,1
113,0,3,"Jussila, Miss. Katriina",0,20.0,1,0,4136,9.825,0,1


In [139]:
#drop all the useless columns from the dataframe
drop_col = ['Name','SibSp','Parch','Ticket']
drop_col = ['Sex','person','Alone','Pclass']


df=df.drop(drop_col, axis=1) 
df.columns

Index(['Survived',      'Age',     'Fare',  'class_1',  'class_2',  'class_3',
           'Male',   'Female',    'Child',          0,          1],
      dtype='object')

In [142]:
#remove the percent from Age & Fare
df['Age'] = df['Age'].apply(math.ceil)
df['Fare'] = df['Fare'].apply(math.ceil)

In [143]:
df.head(10)

Unnamed: 0,Survived,Age,Fare,class_1,class_2,class_3,Male,Female,Child,0,1
0,0,22,8,0,0,1,0,0,1,1,0
1,1,38,72,1,0,0,0,1,0,1,0
2,1,26,8,0,0,1,0,1,0,0,1
3,1,35,54,1,0,0,0,1,0,1,0
4,0,35,9,0,0,1,0,0,1,0,1
5,0,30,9,0,0,1,0,0,1,0,1
6,0,54,52,1,0,0,0,0,1,0,1
7,0,2,22,0,0,1,1,0,0,1,0
8,1,27,12,0,0,1,0,1,0,1,0
9,1,14,31,0,1,0,1,0,0,1,0
