In [1]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("tested.csv")

In [5]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
26,918,1,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C
137,1029,0,2,"Schmidt, Mr. August",male,26.0,0,0,248659,13.0,,S
258,1150,1,2,"Bentham, Miss. Lilian W",female,19.0,0,0,28404,13.0,,S
203,1095,1,2,"Quick, Miss. Winifred Vera",female,8.0,1,1,26360,26.0,,S
197,1089,1,3,"Nilsson, Miss. Berta Olivia",female,18.0,0,0,347066,7.775,,S


In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
df.drop(columns = ['PassengerId','Ticket','Name','Cabin'],inplace=True)

In [10]:
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
66,1,3,female,18.0,0,0,7.8792,Q
339,0,3,male,,0,0,7.2292,C
41,0,1,male,,0,0,26.55,S
347,1,3,female,38.0,0,0,7.2292,C
171,0,3,male,27.0,0,0,7.225,C


In [11]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Survived',axis=1),df['Survived'],test_size = 0.2)

In [33]:
X_train.isnull().sum()

Pclass       0
Sex          0
Age         67
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [34]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 334 entries, 384 to 191
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    334 non-null    int64  
 1   Sex       334 non-null    object 
 2   Age       267 non-null    float64
 3   SibSp     334 non-null    int64  
 4   Parch     334 non-null    int64  
 5   Fare      333 non-null    float64
 6   Embarked  334 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 20.9+ KB


In [35]:
#imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_fare',SimpleImputer(),[5])
],remainder='passthrough')

In [36]:
#onehot encoder
trf2 = ColumnTransformer([
    ('ohe_sex',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [37]:
#minmax scaler
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [38]:
#model
trf4 = DecisionTreeClassifier()

In [39]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
])

In [40]:
pipe.fit(X_train,y_train)



In [41]:
y_pred = pipe.predict(X_test)

In [42]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
accuracy_score(y_test,y_pred)

0.6547619047619048

In [45]:
import pickle
pickle.dump(pipe,open('first.pkl','wb'))

In [46]:
pipe = pickle.load(open('first.pkl','rb'))

In [47]:
test_input = np.array([3,'female',18.0,0,0,7.8792,'Q'],dtype=object).reshape(1,7)

In [48]:
pipe.predict(test_input)



array([0], dtype=int64)