# **Titanic without using pipelines**

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [6]:
# Load the Titanic training data from the current working directory.
df = pd.read_csv('train.csv')

In [7]:
# Peek at the first rows to see which columns are available.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Unwanted columns are excluded

In [8]:
# Remove the columns that the rest of the notebook does not use as features.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second run finds the columns already gone and is a
# no-op rather than raising KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [9]:
# Confirm that only the feature columns and the Survived target remain.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Split the DataFrame `df` into train and test sets

In [10]:
# Separate the predictors from the Survived target, then hold out 20% of the
# rows for testing. The fixed random_state keeps the split reproducible.
features = df.drop(columns=['Survived'])
target = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Preview the training features (the Survived column has been split off).
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.25,Q
309,1,female,30.0,0,0,56.9292,C
516,2,female,34.0,0,0,10.5,S
120,2,male,21.0,2,0,73.5,S
570,2,male,62.0,0,0,10.5,S


In [12]:
# Preview the training labels; the row labels line up with x_train.
y_train.head()

Unnamed: 0,Survived
301,1
309,1
516,1
120,0
570,1


In [13]:
# Size of the held-out test split as (rows, columns).
x_test.shape

(179, 7)

In [14]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


# Applying SimpleImputer to fill the missing values in the Age and Embarked columns

In [15]:
# Age: fill gaps with the mean learned from the training split
# ('mean' is SimpleImputer's default strategy, spelled out here for clarity).
si_age = SimpleImputer(strategy='mean')
x_train_age = si_age.fit_transform(x_train[['Age']])
x_test_age = si_age.transform(x_test[['Age']])

# Embarked: fill gaps with the most frequent value seen in the training split.
si_embarked = SimpleImputer(strategy='most_frequent')
x_train_embarked = si_embarked.fit_transform(x_train[['Embarked']])
x_test_embarked = si_embarked.transform(x_test[['Embarked']])

In [16]:
# Inspect the imputed Age values; previously missing entries now hold the training mean.
# NOTE(review): this displays the entire array - a slice such as x_train_age[:5]
# would keep the saved output short.
x_train_age

array([[30.16623239],
       [30.        ],
       [34.        ],
       [21.        ],
       [62.        ],
       [29.        ],
       [18.        ],
       [30.16623239],
       [18.        ],
       [30.16623239],
       [44.        ],
       [30.16623239],
       [45.        ],
       [30.16623239],
       [15.        ],
       [ 9.        ],
       [38.        ],
       [30.16623239],
       [19.        ],
       [24.5       ],
       [71.        ],
       [17.        ],
       [53.        ],
       [30.16623239],
       [23.        ],
       [37.        ],
       [24.        ],
       [51.        ],
       [25.        ],
       [30.16623239],
       [37.        ],
       [32.        ],
       [ 4.        ],
       [28.5       ],
       [11.        ],
       [32.        ],
       [35.        ],
       [30.16623239],
       [17.        ],
       [33.        ],
       [56.        ],
       [30.16623239],
       [30.16623239],
       [52.        ],
       [42.        ],
       [30

In [17]:
# Inspect the imputed Embarked values (still strings at this point).
# NOTE(review): this displays the entire array - a slice such as
# x_train_embarked[:5] would keep the saved output short.
x_train_embarked

array([['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
      

# Applying one-hot encoding to the two categorical columns: Sex and Embarked

In [18]:
def _make_dense_encoder():
    """Build a OneHotEncoder that returns dense int32 arrays.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new spelling is tried first and the old
    one is used only when the installed version does not know the new one.
    Categories unseen during fit are encoded as all zeros
    (``handle_unknown='ignore'``).
    """
    options = dict(handle_unknown='ignore', dtype=np.int32)
    try:
        return OneHotEncoder(sparse_output=False, **options)
    except TypeError:
        # Older scikit-learn (< 1.2) only understands ``sparse``.
        return OneHotEncoder(sparse=False, **options)


ohe_sex = _make_dense_encoder()
ohe_embarked = _make_dense_encoder()

# Sex has no missing values, so it is encoded straight from the frames.
x_train_sex = ohe_sex.fit_transform(x_train[['Sex']])
x_test_sex = ohe_sex.transform(x_test[['Sex']])

# Embarked is encoded from the already-fitted imputer's output rather than from
# the previous x_*_embarked arrays. The values are the same, but re-running
# this cell can no longer one-hot encode its own earlier result.
x_train_embarked = ohe_embarked.fit_transform(si_embarked.transform(x_train[['Embarked']]))
x_test_embarked = ohe_embarked.transform(si_embarked.transform(x_test[['Embarked']]))



In [19]:
# Inspect the one-hot encoded Sex column (one indicator column per category).
x_train_sex

array([[0, 1],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]], dtype=int32)

In [20]:
# Inspect the one-hot encoded Embarked column (one indicator column per category).
x_train_embarked

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1]], dtype=int32)

In [21]:
# Check the shape of the encoded test-set Embarked array.
x_test_embarked.shape

(179, 3)

# We now have three separate arrays: sex, age and embarked

# Next, we have to concatenate them with the remaining columns

In [22]:
# Look at the original training frame again to see which columns still have to be carried over.
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.25,Q
309,1,female,30.0,0,0,56.9292,C
516,2,female,34.0,0,0,10.5,S
120,2,male,21.0,2,0,73.5,S
570,2,male,62.0,0,0,10.5,S


In [23]:
# Keep the training columns that were not imputed or encoded separately.
x_train_rem = x_train.drop(columns=['Sex','Age','Embarked'])

In [24]:
# Same column selection for the test split, so both splits keep identical columns.
x_test_rem = x_test.drop(columns=['Sex','Age','Embarked'])

In [25]:
# Shape of the encoded training Embarked array as (rows, columns).
print(x_train_embarked.shape)

(712, 3)


In [26]:
# Print the shape of every test-set piece; all of them must share the same row
# count before they can be joined column-wise.
for part in (x_test_rem, x_test_age, x_test_sex, x_test_embarked):
    print(part.shape)

(179, 4)
(179, 1)
(179, 2)
(179, 3)


In [27]:
# Join the pieces column-wise, in the same order for both splits:
# untouched columns, imputed Age, one-hot Sex, one-hot Embarked.
train_parts = (x_train_rem, x_train_age, x_train_sex, x_train_embarked)
test_parts = (x_test_rem, x_test_age, x_test_sex, x_test_embarked)

x_train_transformed = np.concatenate(train_parts, axis=1)
x_test_transformed = np.concatenate(test_parts, axis=1)

In [28]:
# Shape of the final training matrix as (rows, features).
x_train_transformed.shape

(712, 10)

In [29]:
# Shape of the final test matrix; the feature count must match the training matrix.
x_test_transformed.shape

(179, 10)

# **Modeling**

In [30]:
# Fix the seed: DecisionTreeClassifier permutes the features at every split, so
# ties between equally good splits can be broken differently from run to run.
# Without random_state the fitted tree (and the accuracy reported below) is not
# reproducible.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(x_train_transformed, y_train)

In [31]:
# Predict on the held-out rows; the shape shows how many predictions were produced.
y_pred = clf.predict(x_test_transformed)
y_pred.shape

(179,)

In [32]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, so the number is unchanged, but keeping the documented order
# avoids wrong results if this is later swapped for an asymmetric metric such
# as precision or recall.
accuracy_score(y_test, y_pred) * 100

76.53631284916202

# Exporting the model so that a website or an app can take user input and return a prediction

In [33]:
import os
import pickle

# Create the output folder if it is missing, so the cell also works on a fresh
# checkout without any manual setup.
os.makedirs('models', exist_ok=True)

# Sex and Embarked arrive from the user as strings, so the fitted one-hot
# encoders have to ship together with the classifier.
# NOTE(review): the fitted imputers (si_age, si_embarked) are not exported;
# confirm that the consuming app never receives missing Age/Embarked values.
artifacts = (
    (ohe_sex, 'models/ohe__sex.pkl'),
    (ohe_embarked, 'models/ohe__embarked.pkl'),
    (clf, 'models/clf.pkl'),
)

for artifact, path in artifacts:
    # The with-block flushes and closes each file as soon as the dump finishes;
    # the previous bare open(...) calls left the handles open.
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)