In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset 1 (IRIS)
iris = pd.read_csv("Iris.csv")

# Drop the row-identifier column if the file has one.
# NOTE(review): the widely used Kaggle copy of Iris.csv carries an `Id` column
# that simply counts rows, and its rows are grouped by species, so keeping it
# would leak the label into the features -- confirm against the actual file.
# errors="ignore" makes this a no-op when no such column exists.
iris = iris.drop(columns=["Id"], errors="ignore")

# Encode target labels if necessary.
# Assign through the column label, not `iris.iloc[:, -1] = ...`: since
# pandas 2.0 a positional assignment writes the integer codes *into* the
# existing object column, so the dtype stays `object`. scikit-learn then
# cannot infer the target type and raises
# "ValueError: Unknown label type: unknown" when a classifier is fitted.
# Label-based assignment replaces the column, so it takes the integer dtype.
iris_target = iris.columns[-1]
if iris[iris_target].dtype == 'object':
    iris[iris_target] = LabelEncoder().fit_transform(iris[iris_target])

# Split data: every column except the last is a feature, the last is the label.
X_iris = iris.iloc[:, :-1]
y_iris = iris.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.2, random_state=42)

# Decision Tree: fit on the training split, then predict the held-out rows.
# `fit` returns the estimator itself, so construction and fitting are chained.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Random Forest: same split, 100 trees, same seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Compare results: accuracy for both models, per-class report for the forest.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf)
print("\nClassification Report for Random Forest:\n", rf_report)

# Load Dataset 2 (Behavioral Risk Factor Surveillance System)
brfss = pd.read_csv("BehaviouralRskFactorSurvillanceSystem.csv")

# The last column is used as the label below.
# NOTE(review): this assumes the last column holds a discrete class label;
# a continuous target would need a regressor instead -- confirm.
brfss_target = brfss.columns[-1]

# Handle missing values and encoding
# Rows with no label cannot be learned from. Imputing the label with a column
# mean would invent values that are not real classes, so drop those rows.
brfss = brfss.dropna(subset=[brfss_target]).copy()

# Fill gaps in numeric feature columns with the column mean.
# numeric_only=True is required: since pandas 2.0, DataFrame.mean() raises
# TypeError when the frame contains text columns.
numeric_means = brfss.drop(columns=[brfss_target]).mean(numeric_only=True)
brfss = brfss.fillna(numeric_means)

# Integer-encode every non-numeric column (text features and, if it is text,
# the label). Missing entries become their own "missing" category and all
# values are cast to str first, because LabelEncoder cannot order a column
# that mixes strings with float NaN.
for col in brfss.select_dtypes(exclude=['number', 'bool']).columns:
    brfss[col] = LabelEncoder().fit_transform(brfss[col].fillna("missing").astype(str))

X_brfss = brfss.iloc[:, :-1]
y_brfss = brfss.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X_brfss, y_brfss, test_size=0.2, random_state=42)

# Baseline on every column: refit the existing forest on the current training
# split and score it on the matching test split.
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)
all_features_accuracy = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Accuracy (All Features):", all_features_accuracy)

# Rank columns by the fitted forest's importances. argsort is ascending, so
# the last 10 positions are the 10 highest-scoring columns (or all of them
# when there are fewer than 10).
importances = rf.feature_importances_
ascending_rank = np.argsort(importances)
important_features = ascending_rank[-10:]

# Keep only those column positions in both splits.
X_train_imp, X_test_imp = (
    split.iloc[:, important_features] for split in (X_train, X_test)
)

# Fresh forest with the same settings, trained on the reduced column set.
rf_imp = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_imp, y_train)
y_pred_rf_imp = rf_imp.predict(X_test_imp)

# Accuracy on the reduced column set, for comparison with the all-column run.
selected_accuracy = accuracy_score(y_test, y_pred_rf_imp)
print("\nRandom Forest Accuracy (Selected Features):", selected_accuracy)

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.