In [None]:
# Answer 1

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel

# Read the raw dataset into a DataFrame
# Replace 'your_data.csv' with the actual dataset file path
data = pd.read_csv('your_data.csv')

# Separate the target (y) from the remaining feature columns (X)
# Replace 'target_column' with the actual target column name
y = data['target_column']
X = data.drop(columns='target_column')

# Step 1: Automated Feature Selection
# SelectFromModel fits the wrapped random forest and keeps only the features
# whose importance reaches the selector's threshold. The selector is plugged
# into the final `model` pipeline (Step 5) so that it actually takes effect.
feature_selection = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Standalone selection pipeline for all-numeric inputs (mean imputation and
# scaling cannot handle string columns). It is not part of `model` and has its
# own selector instance, so fitting it never refits the selector inside `model`.
feature_selection_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selector', SelectFromModel(
        RandomForestClassifier(n_estimators=100, random_state=42)
    ))
])

# Step 2: Numerical Pipeline
# 'number' matches every numeric dtype (int32, float32, ...), not only the
# 64-bit ones. Columns matched by neither the numerical nor the categorical
# selection are dropped by the ColumnTransformer in Step 4.
numerical_features = X.select_dtypes(include='number').columns
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Step 3: Categorical Pipeline
# Include pandas 'category' columns alongside plain object (string) columns.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories that were not seen during fit are encoded as all zeros instead
    # of raising an error when the test split is transformed.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Step 4: Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Step 5: Final Model Pipeline
# preprocess -> select important features -> classify.
# random_state is fixed (matching the seed used for the train/test split) so
# that repeated runs give the same selected features and the same score.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selection),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Train the full pipeline on the training split only
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split and report it
accuracy = model.score(X_test, y_test)
print(f"Accuracy on the test dataset: {accuracy:.2f}")

# You can use other evaluation metrics as needed, like precision, recall, F1-score, etc.

# Possible Improvements:
# 1. Hyperparameter tuning for the Random Forest Classifier for better performance.
# 2. Experiment with different imputation strategies (e.g., median for numerical, a custom value for categorical).
# 3. Try different feature selection methods or thresholds to improve feature selection.
# 4. Explore other preprocessing techniques, such as feature scaling methods, like Min-Max scaling.
# 5. Consider feature engineering techniques if relevant to your problem.

In [None]:
# Answer 2

In [2]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset and pull out its features and class labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Step 1: Base learners that will cast the votes
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
logistic_classifier = LogisticRegression(random_state=42)

# Step 2: Majority-vote ensemble over the two base learners
voting_classifier = VotingClassifier(
    voting='hard',  # 'soft' would combine predicted class probabilities instead
    estimators=[
        ('random_forest', rf_classifier),
        ('logistic_regression', logistic_classifier),
    ],
)

# Step 3: Main pipeline - standardize the features, then pass them to the ensemble
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('voting', voting_classifier),
])

# Step 4: Fit the scaler and the voting ensemble on the training split
pipeline.fit(X=X_train, y=y_train)

# Step 5: Measure accuracy on the held-out split and report it
accuracy = pipeline.score(X=X_test, y=y_test)
print(f"Accuracy on the test dataset: {accuracy:.2f}")


Accuracy on the test dataset: 1.00
