<a href="https://colab.research.google.com/github/NiravKakadiya29/ML/blob/main/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

In [44]:
url = 'https://drive.google.com/uc?id=1b3BZe-vr8Y4XZIqiAugxL1FZIekTfh9C'
df = pd.read_csv(url)

In [45]:
df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)

In [46]:
df['Family'] = df['SibSp'] + df['Parch']
df.drop(['SibSp','Parch'], axis=1, inplace=True)

In [47]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Family
0,0,3,male,22.0,7.25,,S,1
1,1,1,female,38.0,71.2833,C85,C,1
2,1,3,female,26.0,7.925,,S,0
3,1,1,female,35.0,53.1,C123,S,1
4,0,3,male,35.0,8.05,,S,0


In [48]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
Fare,0
Cabin,687
Embarked,2
Family,0


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   Fare      891 non-null    float64
 5   Cabin     204 non-null    object 
 6   Embarked  889 non-null    object 
 7   Family    891 non-null    int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [51]:
# 2️⃣ Select features and target
X = df.drop('Survived', axis=1)
y = df['Survived']

In [53]:
# Identify numerical and categorical columns
num_features = ["Age", "Fare", "Family"]
cat_features = ["Pclass", "Sex", "Embarked"]

In [56]:
# 3️⃣ Define ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(transformers=[
    # Numerical features: Impute missing values & scale
    ("num", SimpleImputer(strategy="median"), num_features),

    # Categorical features: Impute missing values & One-Hot Encode
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

model = RandomForestClassifier(n_estimators=100, random_state=42)

In [57]:
# 4️⃣ Define the full pipeline (ColumnTransformer + Model)
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),  # Preprocessing step
    ("scaler", StandardScaler()),    # Scaling after preprocessing
    ("classifier", model)  # Model
])

In [58]:
# 5️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [59]:
# 6️⃣ Train the pipeline
pipeline.fit(X_train, y_train)

In [60]:
# 7️⃣ Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 0.8212


In [63]:
from sklearn.model_selection import cross_val_score
cross_val_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Accuracy: {cross_val_scores.mean():.4f}")

Cross-Validation Scores: [0.75418994 0.80337079 0.83707865 0.78651685 0.82022472]
Mean Accuracy: 0.8003


In [64]:
# GridSearch using Pipeline
Params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
}

In [65]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(pipeline, Params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [66]:
grid_search.best_score_

0.810400866738895

In [67]:
grid_search.best_params_

{'classifier__max_depth': 10, 'classifier__n_estimators': 100}

In [68]:
grid_search.best_index_

3

In [69]:
grid_search.best_estimator_

In [71]:
# exporting the pipeline
import joblib
joblib.dump(pipeline, open('model.pkl', 'wb'))