## Ensemble Techniques And Its Types Assignments-4

In [2]:
# Q1. You are working on a machine learning project where you have a dataset containing numerical and
# categorical features. You have identified that some of the features are highly correlated and there are
# missing values in some of the columns. You want to build a pipeline that automates the feature
# engineering process and handles the missing values.

# Design a pipeline that includes the following steps:
# Use an automated feature selection method to identify the important features in the datasetC
# Create a numerical pipeline that includes the following steps"
# Impute the missing values in the numerical columns using the mean of the column valuesC
# Scale the numerical columns using standardisationC
# Create a categorical pipeline that includes the following steps"
# Impute the missing values in the categorical columns using the most frequent value of the columnC
# One-hot encode the categorical columnsC
# Combine the numerical and categorical pipelines using a ColumnTransformerC
# Use a Random Forest Classifier to build the final modelC
# Evaluate the accuracy of the model on the test dataset.

# Ans:


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score


# Load the dataset
df = pd.read_csv(r"DataSource\diabetes.csv")


# Define target column 
target = 'Outcome'
X = df.drop(columns=[target])
y = df[target]

# Identify numerical and categorical columns
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()


# Feature Selection
# Train a RandomForest model to identify important features
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X.fillna(0), y)  # Filling NaN with 0 temporarily

# Select important features
selector = SelectFromModel(rf_selector, threshold="mean", prefit=True)
X_selected = selector.transform(X.fillna(0))

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("Selected Features: ", selected_features.tolist())

# Update feature lists
numerical_features = [col for col in selected_features if col in numerical_features]
categorical_features = [col for col in selected_features if col in categorical_features]

# Create pipeline for numerical features
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
    ('scaler', StandardScaler())  # Standardize numerical features
])


# Create pipeline for categorical features
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with mode
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine Pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
])

# Build the Final Pipeline with Random Forest
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")


Selected Features:  ['Glucose', 'BMI', 'Age']
Model Accuracy: 0.7403




In [5]:
# Interpretation of Results:

# Feature Selection: The model automatically selects the most relevant features, reducing noise and 
# improving performance.
# Missing Values Handling: The numerical columns use mean imputation, while categorical columns use mode imputation.
# Preprocessing: Standardization ensures numerical features have a mean of 0 and variance of 1, and 
# one-hot encoding makes categorical data usable for Random Forest.
# Performance: Accuracy provides insight into model effectiveness. If the accuracy is low, consider 
# hyperparameter tuning or using different feature engineering techniques.

# Possible Improvements:
# Different Feature Selection Methods:
# We can use mutual_info_classif for feature selection instead of Random Forest.
# We also can apply Recursive Feature Elimination (RFE).

# Experiment with Hyperparameter Tuning:
# We can use GridSearchCV or RandomizedSearchCV to optimize n_estimators, max_depth, and other hyperparameters.
# Handle Class Imbalance:
# If the dataset is imbalanced, then we can use SMOTE (Synthetic Minority Over-sampling Technique).
# Try Different Models:
# We can Gradient Boosting, XGBoost, or CatBoost for better accuracy.

In [6]:
# Import the necessary packages only
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression


# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Pipeline for Logistic Regression
logistic_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardizing features
    ('logistic', LogisticRegression(max_iter=200))
])


# Pipeline for Random Forest
random_forest_pipeline = Pipeline([
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Create a Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('logistic', logistic_pipeline),
        ('random_forest', random_forest_pipeline)
    ],
    voting='hard'  # Hard voting: Majority class prediction
)

# Train the voting classifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Classifier Accuracy: {accuracy:.4f}")


Voting Classifier Accuracy: 1.0000
