# In [None]:
# ------------------------------------------------------------
# Income Classification Model using Random Forest (adult_income.csv)
# ------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Step 1: Read the raw dataset into a DataFrame.
data = pd.read_csv("adult_income.csv")

# Step 2: Replace every object-dtype (text) column with integer codes.
# Values are cast to str before encoding, so each distinct string form
# (including that of a missing value) gets its own code. The target column
# is encoded here as well when it is stored as text.
categorical_columns = data.select_dtypes(include='object').columns
for column_name in categorical_columns:
    encoder = LabelEncoder()
    string_values = data[column_name].astype(str)
    data[column_name] = encoder.fit_transform(string_values)

# Step 3: Separate the predictors from the label column.
target = 'income'
features = data.columns[data.columns != target].tolist()
X = data[features]
y = data[target]

# Step 4: Train/test split
# Split BEFORE any fitted preprocessing so the held-out rows never influence
# it. The partition depends only on the number of rows and random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Optional: Scale features (not strictly needed for Random Forest)
# The scaler learns its mean/variance from the training rows only and is then
# applied unchanged to the test rows; fitting it on the full dataset would
# leak test-set statistics into training.
# NOTE: X itself is left as the unscaled DataFrame; only X_train / X_test
# (now NumPy arrays) carry the scaled values.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Fit a 200-tree Random Forest (fixed seed) and score the test rows.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Hard class labels for the threshold-based metrics.
y_pred = model.predict(X_test)

# predict_proba yields one column per class; keep the column at index 1.
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Step 6: Score the held-out predictions.
# Accuracy / precision / recall / F1 are computed from the hard labels,
# ROC-AUC from the predicted probabilities.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("\n----- Random Forest Model Performance -----")
report_rows = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("ROC-AUC", roc_auc),
)
for label, value in report_rows:
    # Left-pad each label to 9 characters so the colons line up.
    print(f"{label:<9}: {value:.4f}")

# Step 7: Scatter the predicted probability against the actual label and
# write the figure to disk. An explicit Figure/Axes pair is used instead of
# pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(x=y_test, y=y_proba, alpha=0.6, ax=ax)
ax.set_xlabel('Actual Income (0=<=50K,1=>50K)')
ax.set_ylabel('Predicted Probability (>50K)')
ax.set_title('Predicted vs Actual Income')
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig("predicted_vs_actual_income_rf.png")
plt.close(fig)  # release the figure once it has been saved

# Step 8: Save metrics
# One row per metric, written without the DataFrame index column.
metrics = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
    'Value': [accuracy, precision, recall, f1, roc_auc],
})
metrics.to_csv("income_classification_summary_rf.csv", index=False)

# The check mark is written as an escape (U+2705) so it cannot be corrupted
# when the source file is saved or read with the wrong encoding; the previous
# literal had been mangled into the mojibake sequence "âœ…".
print("\n\u2705 Income Classification with Random Forest completed!")
print(" - income_classification_summary_rf.csv")
print(" - predicted_vs_actual_income_rf.png")