In [None]:
#Q1):-
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# Load your dataset
data = pd.read_csv('your_dataset.csv')

# Split the dataset into features (X) and target variable (y)
X = data.drop('target_column', axis=1)
y = data['target_column']

# Split the data into training and testing sets.
# This is done on the raw DataFrame, BEFORE anything is fitted, so that:
#   * the ColumnTransformer below still receives named columns, and
#   * no statistic (imputation values, scaling, feature importances) is
#     learned from the held-out test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Feature Selection
# The selector is only *defined* here; it is fitted as a pipeline step after
# preprocessing, because a random forest cannot be trained on raw string
# columns or on missing values.
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Step 2: Numerical Pipeline (mean imputation, then standardization)
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Step 3: Categorical Pipeline (most-frequent imputation, then one-hot encoding)
categorical_cols = X.select_dtypes(include=['object']).columns
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' keeps predict() from failing on categories
    # that appear only in the test split.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Step 4: Column Transformer to combine numerical and categorical pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

# Step 5: Final Model Pipeline
# Order matters: preprocess -> select features -> classify. Every step is
# fitted on the training split only when model.fit() is called.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Fit the model
model.fit(X_train, y_train)

# Preprocessed and feature-selected matrix for all rows, produced by the
# steps fitted on the training split (every pipeline step except the classifier).
X_selected = model[:-1].transform(X)

# Step 6: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Step 7: Interpretation and Possible Improvements
# - You can further fine-tune hyperparameters of the RandomForestClassifier.
# - Explore other feature selection methods and preprocessing techniques.
# - Consider handling class imbalance if it exists in your target variable.
# - Use cross-validation for a more robust evaluation.
In this pipeline:

Step 1: Feature selection is performed using a RandomForestClassifier to select important features.
Step 2: For numerical columns, missing values are imputed with the mean and then scaled using standardization.
Step 3: For categorical columns, missing values are imputed with the most frequent value, and one-hot encoding is applied.
Step 4: The ColumnTransformer combines both numerical and categorical pipelines.
Step 5: A RandomForestClassifier is used as the final model.
Step 6: The model is trained on the training set and then evaluated on the held-out test set, using accuracy as the evaluation metric.
Step 7: Suggestions for improvements and further steps are provided.

In [None]:
#Q2):-
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Iris serves as a small, built-in example dataset
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the samples for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 1: Base learners that will take part in the vote
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

# Step 2: Combine the base learners; 'hard' voting predicts the majority class label
voting_classifier = VotingClassifier(
    estimators=[
        ('random_forest', rf_classifier),
        ('logistic_regression', lr_classifier),
    ],
    voting='hard',
)

# Step 3: Fit the ensemble on the training split
voting_classifier.fit(X_train, y_train)

# Step 4: Predict on the held-out split and report accuracy
y_pred = voting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")


In this code:

Step 1: Two individual classifiers, a Random Forest Classifier (rf_classifier) and a Logistic Regression Classifier (lr_classifier), are created.
Step 2: A Voting Classifier is created, specifying both classifiers to be used and setting voting='hard', which means the final prediction is based on a majority vote.
Step 3: The Voting Classifier is trained on the training data.
Step 4: Predictions are made on the test data, and the accuracy is evaluated.