<a href="https://colab.research.google.com/github/Poushali-02/notebooks/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using sklearn pipeline in this case

In [108]:
#  importing necesasry libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split # for spilitting training and testing data
from sklearn.compose import ColumnTransformer # for featuring data
from sklearn.impute import SimpleImputer #for handling missing values -> numerical data
from sklearn.preprocessing import OneHotEncoder # for encoding nominal categorical data into numerical data
from sklearn.preprocessing import MinMaxScaler # for encoding ordinal categorical data into numerical data
from sklearn.pipeline import Pipeline, make_pipeline # for pipeline make_pipeline is a function
from sklearn.tree import DecisionTreeClassifier  #algorithm

In [111]:
import zipfile
zip_path = "titanic.zip"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    file_list = zip_ref.namelist()
    csv_file = next((f for f in file_list if f.endswith('.csv')), None)
    with zip_ref.open(csv_file) as file:
        df = pd.read_csv(file)

In [109]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [112]:
df.drop(columns = ['Cabin', 'PassengerId', 'Name', 'Ticket'], inplace = True)

In [113]:
#train-test-split
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns = ['Survived']), df['Survived'], test_size = 0.2, random_state=42)

In [114]:
X_train.isnull().sum()

Unnamed: 0,0
Pclass,0
Sex,0
Age,140
SibSp,0
Parch,0
Fare,0
Embarked,2


In [115]:
#Age and Embarked has missing values
#Age -> numerical value
#Embarked -> Nominal Categorical Data

#imputation transformer
transform1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy = 'most_frequent'), [6])
], remainder = 'passthrough')

In [116]:
#Sex and Embarked -> nominal ordinary data -> encoding to numerical data ->
#OneHotEncoding
transform2 = ColumnTransformer([
        ('ohe_sex_embarked', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), [1, 6])
    ], remainder = 'passthrough'
)


In [117]:
#Scaling
transform3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0, 10))
])
#-> on all columns in slice(0,10) range

In [119]:
#training the model
transform4 = DecisionTreeClassifier()

# All changes to the data is now done and the model is ready to be trained, before that, connecting the pipeline

In [120]:
#Pipeline object
pipe = make_pipeline(transform1, transform2, transform3, transform4)

In [121]:
pipe.fit(X_train, Y_train)

In [122]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [123]:
Y_pred = pipe.predict(X_test)

In [126]:
Y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [125]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred) * 100

62.56983240223464