# Q1.

In [11]:
# Import necessary libraries
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load the dataset.
# The file location can be overridden with the DATASET_PATH environment variable so
# the notebook is not tied to one machine's directory layout; when the variable is
# not set, the original local path is used unchanged.
DEFAULT_DATASET_PATH = "C:\\Programming\\coding\\Pwskills\\Excel files\\dataset.csv"
dataset_path = os.environ.get("DATASET_PATH", DEFAULT_DATASET_PATH)
data = pd.read_csv(dataset_path)

# Separate the feature columns from the 'target' label column.
X = data.drop(columns='target')
y = data['target']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Numerical features: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' tolerates categories unseen during fit).
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [None]:
# Automated feature selection driven by a Random Forest (SelectFromModel defaults).
feature_selection = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Choose columns by dtype *family* instead of exact dtype names.
# ColumnTransformer drops every column that is not listed (its default is
# remainder='drop'), so matching only 'int64'/'float64'/'object' would silently
# discard e.g. int32, float32 or 'category' columns.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Route each column group through its own preprocessing sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Full model: preprocessing -> feature selection -> Random Forest classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selection),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Fit on the training split only, then score on the held-out test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Interpretation of results:
# The accuracy score on the test dataset gives an indication of how well the model performs. 
# You can further analyze other metrics (precision, recall, etc.) based on the specific requirements of your problem.

# Possible improvements:
# 1. Fine-tune hyperparameters of the RandomForestClassifier for better model performance.
# 2. Experiment with other feature selection methods.
# 3. Explore different imputation strategies for missing values.
# 4. Consider cross-validation for a more robust evaluation.
# 5. Handle class imbalance if present in the target variable.
# 6. Automate the hyperparameter search (e.g. grid or randomized search with cross-validation) instead of tuning by hand.


In [4]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression

# Re-derive features and labels from `data`, which must already be loaded by an earlier cell.
X = data.drop(columns=['target'])
y = data['target']

Accuracy: 0.8360655737704918


In [None]:
# Split the dataset into training and testing sets (same 80/20 split and seed as before).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numerical features: mean imputation followed by standardization.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: most-frequent imputation followed by one-hot encoding.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Choose columns by dtype *family* instead of exact dtype names.
# ColumnTransformer drops every column that is not listed (its default is
# remainder='drop'), so matching only 'int64'/'float64'/'object' would silently
# discard e.g. int32, float32 or 'category' columns.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Combine numerical and categorical pipelines using ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Ensemble of Random Forest and Logistic Regression.
# Soft voting is used on purpose: with only two estimators, hard (majority) voting
# ends in a tie whenever the models disagree, and scikit-learn breaks ties in favour
# of the class label that sorts first, which biases predictions toward that class.
# Soft voting averages the predicted class probabilities of both models instead.
ensemble = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('lr', LogisticRegression(random_state=42)),
    ],
    voting='soft',
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ensemble),
])

# Fit the pipeline on the training data.
pipeline.fit(X_train, y_train)

# Make predictions on the test set.
y_pred = pipeline.predict(X_test)

# Evaluate the accuracy of the model.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
