In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Fetch the dataset (a CSV served from a Google Drive link) into a DataFrame.
url = 'https://drive.google.com/uc?id=1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ'
data = pd.read_csv(url)

# The label column is assumed to be named 'target'; every other column is
# treated as a feature.
y = data['target']
X = data.drop(columns='target')

# Partition the feature columns by dtype so each group can get its own
# preprocessing: 64-bit ints/floats are numerical, object columns categorical.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Q1. Build the pipeline

print("Q1. Building the pipeline")

# Numerical branch: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform without raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. Columns that appear in neither list
# are discarded, because ColumnTransformer's default is remainder='drop'.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Univariate ANOVA F-test scoring. NOTE: k='all' keeps every feature, so this
# step currently filters nothing; lower k to actually prune features.
feature_selector = SelectKBest(f_classif, k='all')

# End-to-end model: preprocessing -> feature selection -> random forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    (
        'classifier',
        RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    ),
])

# Fit on the training split only, so the imputers, scaler and encoder learn
# their statistics without seeing the test rows.
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the Random Forest model: {accuracy:.2f}")

# Interpretation and suggestions
print("\nInterpretation and Suggestions:")
print("1. The pipeline includes automated feature selection and handles missing values and feature scaling for numerical features, as well as one-hot encoding for categorical features.")
print("2. Possible improvements could include tuning hyperparameters, trying different feature selection methods, or using other classification algorithms.")

# Q2. Build a pipeline with a Random Forest Classifier and Logistic Regression, then use a Voting Classifier

print("\nQ2. Building the pipeline with Voting Classifier")

# Base classifiers that take part in the vote.
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
# max_iter is raised from the library default of 100 so the solver has room to
# converge on wide one-hot encoded inputs instead of stopping early with a
# ConvergenceWarning.
lr_classifier = LogisticRegression(max_iter=1000)

# Soft voting: the predicted class is the argmax of the summed class
# probabilities from both base models.
voting_clf = VotingClassifier(estimators=[
    ('rf', rf_classifier),
    ('lr', lr_classifier)
], voting='soft')

# Full pipeline with voting classifier.
# Pipeline.fit fits its steps in place, so passing the very same `preprocessor`
# and `feature_selector` objects used by the Q1 pipeline would silently refit
# the transformers that pipeline still holds. clone() yields unfitted copies
# with identical parameters, keeping the two pipelines independent.
voting_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('feature_selector', clone(feature_selector)),
    ('voting_clf', voting_clf)
])

# Train the voting classifier on the training split only.
voting_pipeline.fit(X_train, y_train)

# Evaluate the voting classifier on the held-out split.
y_pred_voting = voting_pipeline.predict(X_test)
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy of the Voting Classifier: {accuracy_voting:.2f}")

# Interpretation and suggestions
print("\nInterpretation and Suggestions:")
print("1. The Voting Classifier combines predictions from both Random Forest and Logistic Regression models.")
print("2. It can provide better performance by leveraging the strengths of different classifiers.")
print("3. Evaluate whether the soft voting improves accuracy compared to individual models.")


Q1. Building the pipeline
Accuracy of the Random Forest model: 0.82

Interpretation and Suggestions:
1. The pipeline includes automated feature selection and handles missing values and feature scaling for numerical features, as well as one-hot encoding for categorical features.
2. Possible improvements could include tuning hyperparameters, trying different feature selection methods, or using other classification algorithms.

Q2. Building the pipeline with Voting Classifier
Accuracy of the Voting Classifier: 0.84

Interpretation and Suggestions:
1. The Voting Classifier combines predictions from both Random Forest and Logistic Regression models.
2. It can provide better performance by leveraging the strengths of different classifiers.
3. Evaluate whether the soft voting improves accuracy compared to individual models.
