# In [None]:
# Step 1: Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Loading dataset
# Colab-only: opens the browser upload widget; the returned mapping is keyed
# by the name of each uploaded file.
from google.colab import files
uploaded = files.upload()

# Read the file that was actually uploaded rather than assuming its name:
# a hard-coded "adult (1).csv" only works for one specific upload history.
# When nothing was uploaded (empty mapping), fall back to that original
# default so a file already present in the working directory still loads.
csv_name = next(iter(uploaded), "adult (1).csv")
df = pd.read_csv(csv_name)

# Step 3: Cleaning dataset
# Drop only the columns pandas auto-named "Unnamed: N" (headerless columns,
# e.g. from a trailing delimiter or a saved index). Slicing off the last
# column unconditionally would throw away a real column whenever no such
# junk column exists -- including "income", the target used below, if it
# happens to be last.
unnamed_cols = [c for c in df.columns if str(c).startswith("Unnamed")]
df = df.drop(columns=unnamed_cols)
# NOTE(review): dropna() removes only real NaN cells; if this file marks
# missing values with a placeholder such as "?", those rows are kept -- confirm.
df = df.dropna()

# Encoding categorical features
# Every object-dtype column is replaced by integer codes. The fitted encoder
# for each column is kept in `label_encoders` so the codes can be mapped back
# to the original labels later.
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 4: Training/testing split
# "income" is the target; every remaining column is a feature.
y = df["income"]
X = df.drop(columns=["income"])
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Scaling the features for Logistic Regression
# The scaler is fitted on the training rows only and then applied unchanged
# to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Training the models
svm_model = SVC()
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=2000)  # Increased max_iter

# SVC is not scale invariant: on raw features the columns with the largest
# numeric range dominate the kernel distances, which hurts both accuracy and
# training time. It therefore uses the standardized features, like Logistic
# Regression. The random forest is tree-based and keeps the raw features.
svm_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train, y_train)
lr_model.fit(X_train_scaled, y_train)  # Scaled input

# Step 7: Predict and evaluate
# Each model must be scored on the same representation it was trained on.
svm_pred = svm_model.predict(X_test_scaled)
rf_pred = rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test_scaled)

# Test-set predictions per model, in the order they should be reported.
model_predictions = {
    "SVM": svm_pred,
    "Random Forest": rf_pred,
    "Logistic Regression": lr_pred,
}

# For every model: overall accuracy, the per-class report as a nested dict,
# and the confusion matrix, all computed against the held-out labels.
results = {
    name: {
        "accuracy": accuracy_score(y_test, pred),
        "report": classification_report(y_test, pred, output_dict=True),
        "confusion": confusion_matrix(y_test, pred),
    }
    for name, pred in model_predictions.items()
}

# Step 8: Accuracy bar chart
accuracies = {name: result["accuracy"] for name, result in results.items()}
scores = list(accuracies.values())

# Zoom into 0.7-0.9 by default so small differences are visible, but widen
# the range (clamped to [0, 1]) whenever a score falls outside it; a fixed
# range would clip a taller bar or hide a shorter one entirely.
y_low = 0.7 if min(scores) > 0.7 else max(0.0, min(scores) - 0.05)
y_high = 0.9 if max(scores) < 0.9 else min(1.0, max(scores) + 0.05)

plt.figure(figsize=(8, 5))
sns.barplot(x=list(accuracies.keys()), y=scores)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(y_low, y_high)
plt.show()

# Step 9: Confusion matrix heatmaps
# One separate figure per model: rows are the actual classes, columns the
# predicted classes, and each cell is annotated with its integer count.
for model_name, model_result in results.items():
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        model_result["confusion"],
        annot=True,
        fmt="d",
        cmap="Blues",
        ax=ax,
    )
    ax.set_title(f"{model_name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

# Step 10: RandomizedSearchCV for Random Forest
# Candidate values for each hyperparameter; the search samples combinations
# from this grid instead of trying all 36 of them.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# random_state fixes WHICH 10 combinations are sampled. Without it the
# candidates (and therefore the reported best parameters) change on every
# run, even though the split and the forest itself are seeded with 42.
search = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                            param_distributions=param_grid,
                            n_iter=10, cv=3, verbose=1, n_jobs=-1,
                            random_state=42)
search.fit(X_train, y_train)

print("🔍 Best Hyperparameters (Random Forest):", search.best_params_)

# Step 11: Predicted for single input
# Take the first held-out row as a one-row DataFrame (double brackets keep it
# two-dimensional, which is what predict() expects).
sample = X_test.iloc[[0]]
best_rf = search.best_estimator_
predicted_income = best_rf.predict(sample)

# Map the predicted integer code back to the original income label using the
# encoder fitted on the "income" column.
income_encoder = label_encoders['income']
decoded_prediction = income_encoder.inverse_transform(predicted_income)

print("\n🤖 Predicted income for a sample person:", decoded_prediction[0])
