In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

# Generate a sample dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=3, n_redundant=0, n_classes=2, random_state=42)

# Convert X to a DataFrame
columns = ['numerical_feature_1', 'numerical_feature_2', 'numerical_feature_3', 'categorical_feature_1', 'categorical_feature_2']
X_df = pd.DataFrame(X, columns=columns)

# Display the first few rows of the dataset
print("Sample dataset:")
print(X_df.head())

# Define numerical and categorical features
numerical_features = ['numerical_feature_1', 'numerical_feature_2', 'numerical_feature_3']
categorical_features = ['categorical_feature_1', 'categorical_feature_2']



Sample dataset:
   numerical_feature_1  numerical_feature_2  numerical_feature_3  \
0            -0.529332            -0.093387            -1.526572   
1            -0.978500            -1.690672             1.229308   
2            -2.171571             0.545787             1.253433   
3            -0.151299            -0.365506             1.335714   
4            -0.777371             1.146030            -2.479343   

   categorical_feature_1  categorical_feature_2  
0               0.406847              -0.619699  
1              -0.703071               0.202055  
2               1.527726               1.780785  
3               0.038355              -0.005317  
4               0.297014               1.518522  


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Define the numerical and categorical features
numerical_features = ['numerical_feature_1', 'numerical_feature_2', 'numerical_feature_3']
categorical_features = ['categorical_feature_1', 'categorical_feature_2']

# Step 2: Define the preprocessing pipelines for numerical and categorical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler())  # Standardize the numerical features
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode the categorical features and ignore unknown categories
])


# Step 3: Combine numerical and categorical pipelines
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features)
])

# Step 4: Define the final pipeline with Random Forest Classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Step 6: Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Step 7: Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Step 8: Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.92


# Q2