<a href="https://colab.research.google.com/github/Sayandeep27/ML-DL-Codes/blob/main/ML_with_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
# 1. Build a reproducible synthetic dataset.
# The legacy global NumPy seed is fixed once, then the four columns are drawn
# in a fixed order (Age, Salary, Gender, Purchased). That order determines the
# generated values, so it must not be rearranged.
N_ROWS = 100
np.random.seed(42)

age_col = np.random.randint(18, 70, size=N_ROWS)             # integers in [18, 70)
salary_col = np.random.randint(30000, 100000, size=N_ROWS)   # integers in [30000, 100000)
gender_col = np.random.choice(['Male', 'Female'], size=N_ROWS)
# The label is drawn independently of every feature column.
purchased_col = np.random.choice(['Yes', 'No'], size=N_ROWS)

data = dict(
    Age=age_col,
    Salary=salary_col,
    Gender=gender_col,
    Purchased=purchased_col,
)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Age,Salary,Gender,Purchased
0,56,91228,Male,No
1,69,78984,Female,Yes
2,46,70774,Female,No
3,32,32568,Male,Yes
4,60,92592,Male,No


In [3]:
# Introduce some missing values.
# 'Age' and 'Salary' are created as integer columns, which cannot hold NaN.
# Writing NaN into them directly relies on pandas silently upcasting the column,
# which is deprecated (FutureWarning since pandas 2.1, slated to become an error).
# Cast to float explicitly first; the resulting frame is the same as before.
df = df.astype({'Age': 'float64', 'Salary': 'float64'})

# .loc slices are label-based and inclusive at both ends: 6 rows per column.
df.loc[5:10, 'Age'] = np.nan
df.loc[15:20, 'Salary'] = np.nan

In [4]:
# Separate the label column from the predictor columns.
y = df['Purchased']                   # target: 'Yes' / 'No' strings
X = df.drop(columns=['Purchased'])    # everything else is a feature

In [5]:
# Encode the target variable as integers.
# The encoder is fitted on the original string column rather than on `y`
# itself, so this cell is idempotent: re-running it will not refit the encoder
# on already-encoded integers, which would replace the original class names
# stored in `label_encoder.classes_`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Purchased'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# 2. Define the preprocessing steps

# Numeric columns: fill missing entries with the column mean, then standardize.
numeric_features = ['Age', 'Salary']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)


In [8]:
# Categorical columns: fill missing entries with the most frequent value, then
# one-hot encode. handle_unknown='ignore' is passed so that categories not seen
# during fitting do not raise at transform time.
categorical_features = ['Gender']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [9]:
# Combine preprocessing steps: each column group is routed through its own
# sub-pipeline, as (name, transformer, columns) triples.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [10]:
# 3. Chain preprocessing and the classifier into a single estimator, so the
# preprocessing is fitted together with the model when `fit` is called.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [11]:
# 4. Fit the preprocessing steps and the classifier on the training split.
# Left as the last expression so the fitted pipeline is displayed as cell output.
pipeline.fit(X=X_train, y=y_train)

In [12]:
# 5. Score the model on the held-out split.
# NOTE: the 'Purchased' labels in this notebook are generated at random,
# independently of the features, so this number measures chance-level behaviour
# on a small test set rather than any learned signal.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.25
