In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Read the CSV file into a DataFrame that the following steps work on
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

# Step 1: Harmonize features
def harmonize_features(dataset):
    """Return a copy of ``dataset`` with source column names mapped to canonical ones.

    The caller's DataFrame is not modified; a renamed frame is returned instead,
    so the function can be called repeatedly on the same raw input.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns should be renamed. Columns listed in the mapping
        that are absent from ``dataset`` are ignored by ``DataFrame.rename``.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns; all other columns are unchanged.
    """
    column_names = {
        "Age Group": "Age",
        "Mental Health": "MentalDistress",
        "Data_Value": "MentalDistressPercentage",
        "Outcome_Column_Name": "Outcome",  # Replace with the actual target column name
    }
    # No inplace=True: rename() hands back a new frame and leaves the input alone.
    return dataset.rename(columns=column_names)

# Apply the column renaming to the loaded frame
dataset = dataset.pipe(harmonize_features)


In [None]:
# Step 2: Data standardization
def preprocess_data(dataset):
    """Impute missing values, standardize the numeric feature and one-hot encode ``Race``.

    The caller's DataFrame is left untouched; every step operates on a new frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``Age``, ``MentalDistressPercentage`` and ``Race``.

    Returns
    -------
    pandas.DataFrame
        New frame in which

        * missing ``Age`` values are replaced by the column median,
        * missing ``MentalDistressPercentage`` values are replaced by the column
          mean and the column is then rescaled with ``StandardScaler``,
        * ``Race`` (missing values filled with ``"Unknown"``) is replaced by one
          ``Race_<category>`` indicator column per category.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # NOTE(review): median() requires a numeric "Age" column - confirm the dtype
    # of the loaded data.
    # NOTE(review): the fill values and the scaler are computed from every row
    # passed in; if this runs before a train/test split, test rows influence them.
    fill_values = {
        "Age": dataset["Age"].median(),
        "MentalDistressPercentage": dataset["MentalDistressPercentage"].mean(),
        "Race": "Unknown",
    }
    # Handle missing values. fillna() without inplace=True returns a new frame,
    # so the assignments below never touch the caller's object.
    frame = dataset.fillna(fill_values)

    # Normalize numerical features
    scaler = StandardScaler()
    frame["MentalDistressPercentage"] = scaler.fit_transform(frame[["MentalDistressPercentage"]])

    # Encode categorical features
    encoder = OneHotEncoder()
    encoded_race = encoder.fit_transform(frame[["Race"]]).toarray()
    race_columns = [f"Race_{cat}" for cat in encoder.categories_[0]]
    # Reuse the frame's index so the indicator columns line up row by row in concat.
    encoded_df = pd.DataFrame(encoded_race, columns=race_columns, index=frame.index)
    frame = pd.concat([frame, encoded_df], axis=1).drop("Race", axis=1)

    return frame

# Run imputation, scaling and encoding on the harmonized frame
dataset = dataset.pipe(preprocess_data)

In [None]:
# Step 3: Separate the target from the features, then hold out 20% of the rows for testing
y = dataset["Outcome"]
X = dataset.drop("Outcome", axis=1)

(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed so the split is reproducible
    test_size=0.2,
)

In [None]:
# Step 4: Build the random forest and fit it on the training split
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model

In [None]:
# Step 5: Predict the held-out rows and print the per-class metrics
y_pred = model.predict(X_test)
print(
    "Evaluation on Test Data:",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# Step 6: Plot ROC curve
# predict_proba returns one column per entry of model.classes_; column 1 holds
# the score for model.classes_[1], which is used as the positive class here.
y_pred_prob = model.predict_proba(X_test)

# pos_label is passed explicitly: with the default (None) roc_curve only accepts
# labels coded as {0, 1} or {-1, 1} and raises for anything else (e.g. strings).
# For those two codings model.classes_[1] is 1, so the result is unchanged.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob[:, 1], pos_label=model.classes_[1])
roc_auc = auc(fpr, tpr)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--', label="Random")  # diagonal = chance-level classifier
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()