In [2]:
import os
import pickle

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the raw training data from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Drop the columns that are not used as model features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it once the columns are gone is a no-op rather
# than a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [6]:
# Random preview of the remaining columns; seeded so the same five rows are
# shown on every Restart & Run All.
df.sample(5, random_state=2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
50,0,3,male,7.0,4,1,39.6875,S
264,0,3,female,,0,0,7.75,Q
472,1,2,female,33.0,1,2,27.75,S
857,1,1,male,51.0,0,0,26.55,S
580,1,2,female,25.0,1,1,30.0,S


In [7]:
# Separate the predictors from the label, then hold out 20% of the rows for
# evaluation (fixed seed so the split is repeatable).
features = df.drop(columns=['Survived'])
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2,
)

In [8]:
# First two training rows (predictors only).
X_train.head(n=2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
30,1,male,40.0,0,0,27.7208,C
10,3,female,4.0,1,1,16.7,S


In [9]:
# Labels for the same two training rows.
y_train.head(n=2)

30    0
10    1
Name: Survived, dtype: int64

In [10]:
# Count missing values per column to see which features need imputation.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

### Applying Imputation to Columns with Missing Values

In [11]:
# Each imputer is fitted on the training split only and then reused, unchanged,
# on the test split so no information from the test rows leaks into the fit.

# Age: numeric, filled with the training mean (SimpleImputer's default strategy).
si_age = SimpleImputer(strategy='mean')
X_train_age = si_age.fit_transform(X_train[['Age']])
X_test_age = si_age.transform(X_test[['Age']])

# Embarked: categorical, filled with the most frequent training value.
si_embarked = SimpleImputer(strategy='most_frequent')
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

In [12]:
#X_train_age

In [13]:
#X_test_embarked

### Applying One-Hot Encoding to Sex and Embarked Columns

In [14]:
"""
"handle_unknown" must be 'error' when the drop parameter is specified, as both would create categories that are all zero.
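If handle_unknown is set to 'ignore', do not use the drop parameter. (Newer scikit-learn releases may relax this restriction; check the OneHotEncoder documentation for the installed version.)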
"""

def _make_dense_encoder():
    """Build a OneHotEncoder that returns dense arrays and ignores unseen categories.

    scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
    removed the old name. Try the current keyword first and fall back to the
    legacy one so the notebook runs on both old and new releases.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn (< 1.2) only knows the `sparse` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


ohe_sex = _make_dense_encoder()
ohe_embarked = _make_dense_encoder()

# Encoders are fitted on the training split only and reused on the test split.
X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

# Embarked must be imputed before encoding. The imputed values are re-derived
# from the already-fitted imputer instead of reading X_train_embarked /
# X_test_embarked, because this cell overwrites those names: reading them would
# make a second run of the cell one-hot encode already-encoded data.
X_train_embarked = ohe_embarked.fit_transform(si_embarked.transform(X_train[['Embarked']]))
X_test_embarked = ohe_embarked.transform(si_embarked.transform(X_test[['Embarked']]))

In [15]:
#X_train_sex

In [16]:
#X_train_embarked

In [17]:
# Sex, Embarked and Age are handled by the imputers/encoders above; the other
# columns are carried over untouched.
handled_separately = ['Sex', 'Embarked', 'Age']
X_train_remain = X_train.drop(columns=handled_separately)
X_test_remain = X_test.drop(columns=handled_separately)

In [18]:
# Display the training columns left after removing Sex, Embarked and Age.
X_train_remain

Unnamed: 0,Pclass,SibSp,Parch,Fare
30,1,0,0,27.7208
10,3,1,1,16.7000
873,3,0,0,9.0000
182,3,4,2,31.3875
876,3,0,0,9.8458
...,...,...,...,...
534,3,0,0,8.6625
584,3,0,0,8.7125
493,1,0,0,49.5042
527,1,0,0,221.7792


In [19]:
# Stitch the pieces back together column-wise. The order must be identical for
# train and test: untouched columns, imputed Age, encoded Sex, encoded Embarked.
train_parts = (X_train_remain, X_train_age, X_train_sex, X_train_embarked)
test_parts = (X_test_remain, X_test_age, X_test_sex, X_test_embarked)

X_train_transformed = np.concatenate(train_parts, axis=1)
X_test_transformed = np.concatenate(test_parts, axis=1)

In [20]:
# (rows, columns) of the assembled training matrix.
np.shape(X_train_transformed)

(712, 10)

In [21]:
# Display the assembled training matrix produced by the concatenation step.
X_train_transformed

array([[1., 0., 0., ..., 1., 0., 0.],
       [3., 1., 1., ..., 0., 0., 1.],
       [3., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.]])

In [22]:
# Fit a decision tree on the transformed training matrix.
# random_state is fixed (same seed as the train/test split) because the tree
# builder is randomized; without a seed the fitted tree and the accuracy
# reported later can change from one run to the next.
clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier()

In [23]:
# Predict survival for the held-out rows and show the predicted labels.
y_pred = clf.predict(X=X_test_transformed)
y_pred

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1], dtype=int64)

In [24]:
# Fraction of held-out passengers whose label was predicted correctly.
# accuracy_score is imported in the notebook's top import cell.
accuracy_score(y_test, y_pred)

0.776536312849162

### Python Pickle 

Most Python objects can be pickled so that they can be saved to disk. The <b style = 'color:red'>pickle.dump()</b> function writes an object's data to a file. It is usually called with up to three arguments. The <b style = 'color:orange'>first argument</b> is the <b style = 'color:green'>object that you want to store</b>. The <b style = 'color:orange'>second argument</b> is the <b style = 'color:green'>file object you get by opening the desired file in write-binary (wb) mode</b>. The <b style = 'color:orange'>third argument</b> is an optional <b style = 'color:green'>keyword argument</b>, <code>protocol</code>, which selects the pickle protocol version. The module provides two constants for it – pickle.HIGHEST_PROTOCOL and pickle.DEFAULT_PROTOCOL.

In [25]:
import pickle

In [26]:
"""
These objects are stored as binary files, so their contents are not human-readable. They can still be
loaded back and reused later, which is the ultimate goal. pickle.dump() serializes each object into its .pkl file.
"""
# Directory that receives the pickled artifacts; created on demand so the cell
# also works on a fresh checkout.
MODEL_DIR = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Every fitted object needed to repeat the preprocessing and the prediction
# later. The imputers are saved alongside the encoders and the classifier
# because the classifier was trained on imputed Age/Embarked values.
fitted_objects = {
    'si_age': si_age,
    'si_embarked': si_embarked,
    'ohe_sex': ohe_sex,
    'ohe_embarked': ohe_embarked,
    'clf': clf,
}

for artifact_name, fitted in fitted_objects.items():
    # `with` closes (and therefore flushes) the handle even if pickling fails.
    with open(f'{MODEL_DIR}/{artifact_name}.pkl', 'wb') as fh:   # write-binary (wb)
        pickle.dump(fitted, fh)