In [2]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [4]:
# Load the training and test splits from CSV files in the working directory.
training_dataset, testing_dataset = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [6]:
# Preview the first rows of the training frame to check column names and value formats.
training_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Preview the first rows of the test frame; unlike the training frame it has no "Survived" column.
testing_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
# Count missing values per column in the training frame.
training_dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
# Count missing values per column in the test frame.
testing_dataset.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [14]:
# (rows, columns) of the raw training frame.
training_dataset.shape

(891, 12)

In [16]:
# Remove the same five columns from both splits so the model only sees
# Pclass, Sex, Age, SibSp, Parch and Embarked (plus the Survived label in training).
updated_training_dataset, updated_testing_dataset = (
    frame.drop(columns=["PassengerId", "Name", "Ticket", "Fare", "Cabin"])
    for frame in (training_dataset, testing_dataset)
)


In [18]:
# Show a single row to confirm which columns remain after the drop.
updated_training_dataset.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S


In [20]:
# Store the two string-valued features as pandas categoricals in both splits.
updated_training_dataset = updated_training_dataset.astype(
    {"Sex": "category", "Embarked": "category"}
)
updated_testing_dataset = updated_testing_dataset.astype(
    {"Sex": "category", "Embarked": "category"}
)

# Feature matrix and label vector for training. The column order of X_train
# matters: the preprocessing steps defined later address columns by position.
X_train = updated_training_dataset.loc[
    :, ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
]
Y_train = updated_training_dataset.loc[:, "Survived"]


In [22]:
# Confirm the categorical dtype carried over into the feature frame.
X_train["Sex"].dtype

CategoricalDtype(categories=['female', 'male'], ordered=False, categories_dtype=object)

In [24]:
# Stage 1 - missing-value imputation. Columns are addressed by position in
# X_train: index 2 is "Age" and index 5 is "Embarked". All other columns are
# passed through unchanged.
trf1 = ColumnTransformer(
    transformers=[
        # SimpleImputer with its default strategy for the numeric Age column.
        ("simpleimput", SimpleImputer(), [2]),
        # Fill missing Embarked values with the most common category.
        ("freq_impu", SimpleImputer(strategy="most_frequent"), [5]),
    ],
    remainder="passthrough",
)

In [158]:
# Stage 2 - one-hot encoding. Positions refer to trf1's output, where the
# imputed columns come first and the passthrough columns follow:
# 0=Age, 1=Embarked, 2=Pclass, 3=Sex, 4=SibSp, 5=Parch.
trf2 = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore"), [1, 3]),
    ],
    remainder="passthrough",
)

In [166]:
# Stage 3 - standardise the columns at positions 0..7 of trf2's output.
# NOTE(review): trf2 presumably emits fewer than 8 columns, in which case this
# slice simply covers all of them and the passthrough remainder is empty - confirm.
trf3 = ColumnTransformer(
    transformers=[("scalling", StandardScaler(), slice(0, 8))],
    remainder="passthrough",
)

In [168]:
# Final estimator of the pipeline. A fixed random_state makes the fitted tree,
# and therefore the saved predictions, reproducible across kernel restarts;
# without it the tree may differ from run to run because features are randomly
# permuted at each split and ties between equally good splits are broken by
# that order.
trf4 = DecisionTreeClassifier(random_state=42)

In [170]:
# Chain imputation -> encoding -> scaling -> classifier into a single estimator
# so the same preprocessing is applied at both fit and predict time.
pipe = Pipeline(
    steps=[
        ("trf1", trf1),
        ("trf2", trf2),
        ("trf3", trf3),
        ("trf4", trf4),
    ]
)

In [172]:
# Plain-text summary of the pipeline's steps and their parameters.
print(pipe)

Pipeline(steps=[('trf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimput',
                                                  SimpleImputer(), [2]),
                                                 ('freq_impu',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [5])])),
                ('trf2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  [1, 3])])),
                ('trf3',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scalling', StandardScaler(),
                           

In [176]:
# Render estimators as HTML diagrams when they are displayed in the notebook.
set_config(display="diagram")

(891, 7)

In [174]:
# Fit every preprocessing stage and the classifier on the training features and labels.
pipe.fit(X_train, y=Y_train)

In [186]:
# Select the training feature columns explicitly so the test frame reaches the
# pipeline with exactly the same columns in the same order. The pipeline's
# ColumnTransformer steps address columns by position, so relying on the test
# frame happening to share the training layout is fragile.
predictions = pipe.predict(updated_testing_dataset[X_train.columns])
predictions

array([0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [188]:
# Pair each test passenger's id with its predicted label.
results = testing_dataset[["PassengerId"]].assign(Survived=predictions)

# Write the two-column table to disk without the index column.
results.to_csv("titanic_predictions.csv", index=False)


In [192]:
# Read the saved predictions back from disk as a sanity check on the written file.
predi=pd.read_csv("titanic_predictions.csv")

In [196]:
# (rows, columns) of the saved predictions: one row per test passenger, two columns.
predi.shape

(418, 2)