In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [4]:
def load_and_preprocess(file_path):
    """Read the dataset and prepare features, labels and a preprocessing step.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain an ``income`` column.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``income``, with ``"?"`` cells turned into NaN.
    y : pandas.Series
        1 where the ``income`` text contains ``">50K"``, otherwise 0.
    preprocessor : ColumnTransformer
        Unfitted transformer: median imputation for non-object columns,
        most-frequent imputation followed by one-hot encoding for object
        columns.
    """

    def _to_label(income_text):
        # Substring test, so variants such as ">50K." also map to 1.
        if ">50K" in income_text:
            return 1
        return 0

    # "?" is the dataset's placeholder for a missing value.
    frame = pd.read_csv(file_path).replace("?", np.nan)

    y = frame["income"].astype(str).apply(_to_label)
    X = frame.drop("income", axis=1)

    # Object-typed columns are treated as categorical; all others as numeric.
    numerical_features = X.select_dtypes(exclude=["object"]).columns
    categorical_features = X.select_dtypes(include=["object"]).columns

    median_imputer = SimpleImputer(strategy="median")
    numeric_transformer = Pipeline(steps=[("imputer", median_imputer)])

    mode_imputer = SimpleImputer(strategy="most_frequent")
    # handle_unknown="ignore": categories unseen during fit encode as all zeros.
    one_hot = OneHotEncoder(handle_unknown="ignore")
    categorical_transformer = Pipeline(
        steps=[("imputer", mode_imputer), ("encoder", one_hot)]
    )

    column_steps = [
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    return X, y, preprocessor


def train_model(X_train, y_train, preprocessor):
    """Fit a preprocessing + random-forest pipeline on the training data.

    Parameters
    ----------
    X_train : training features passed to ``Pipeline.fit``.
    y_train : training labels passed to ``Pipeline.fit``.
    preprocessor : transformer used as the first pipeline step.

    Returns
    -------
    Pipeline
        The fitted pipeline with steps ``"preprocessor"`` and ``"classifier"``.
    """
    forest = RandomForestClassifier(
        n_estimators=200,
        random_state=42,  # fixed seed for repeatable fits
        n_jobs=-1,  # use all available cores
        class_weight="balanced",
    )
    pipeline_steps = [("preprocessor", preprocessor), ("classifier", forest)]

    # Pipeline.fit returns the pipeline itself, so the fitted object is returned.
    return Pipeline(steps=pipeline_steps).fit(X_train, y_train)


def evaluate_model(model, X_test, y_test):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    model : object exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels compared against the predictions.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    print("\nClassification Report:\n")
    report = classification_report(y_test, predictions)
    print(report)

    print("Confusion Matrix:\n")
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)


def display_feature_importance(model, feature_names, top_n=15):
    """Print the most important features of the pipeline's classifier.

    Parameters
    ----------
    model : object with a ``named_steps`` mapping
        ``named_steps["classifier"]`` must expose ``feature_importances_``.
    feature_names : sequence of str
        One name per entry of ``feature_importances_``, in the same order.
    top_n : int, default 15
        Number of rows to print, ranked by descending importance. The
        default reproduces the previous fixed "Top 15" output exactly.

    Returns
    -------
    None
        The ranked table is written to stdout only.

    Raises
    ------
    ValueError
        Raised by pandas when ``feature_names`` and the importances differ
        in length.
    """
    classifier = model.named_steps["classifier"]
    importances = classifier.feature_importances_

    importance_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)

    # Heading and row limit share top_n so they cannot drift apart.
    print(f"\nTop {top_n} Important Features:\n")
    print(importance_df.head(top_n))


def main(file_path="data.csv"):
    """Run the whole workflow: load, split, train, evaluate, rank features.

    Parameters
    ----------
    file_path : str, default "data.csv"
        CSV file handed to ``load_and_preprocess``. The default keeps the
        previous zero-argument behaviour.

    Returns
    -------
    None
        Metrics and the feature-importance table are printed to stdout.
    """
    X, y, preprocessor = load_and_preprocess(file_path)

    # Stratified 80/20 split with a fixed seed so the class ratio is preserved
    # and the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    model = train_model(X_train, y_train, preprocessor)

    evaluate_model(model, X_test, y_test)

    fitted_preprocessor = model.named_steps["preprocessor"]

    # Look the column groups up by transformer name ("num" / "cat") instead of
    # by list position, so extra or reordered transformers cannot silently
    # swap the two groups.
    columns_by_name = {
        name: columns for name, _, columns in fitted_preprocessor.transformers_
    }

    numerical_features = columns_by_name["num"]

    ohe = fitted_preprocessor.named_transformers_["cat"].named_steps["encoder"]
    categorical_features = ohe.get_feature_names_out(columns_by_name["cat"])

    # Numeric names first, then the one-hot names, matching the order in which
    # the "num" and "cat" transformers are listed in the ColumnTransformer.
    all_feature_names = np.concatenate([numerical_features, categorical_features])

    display_feature_importance(model, all_feature_names)


# Entry point: run the workflow only when this code executes as the main
# module (script run or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Accuracy: 0.8513741747274681

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4945
           1       0.73      0.60      0.66      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.77      0.78      6513
weighted avg       0.85      0.85      0.85      6513

Confusion Matrix:

[[4597  348]
 [ 620  948]]

Top 15 Important Features:

                              Feature  Importance
0                                 age    0.154118
1                              fnlwgt    0.136098
32  marital.status_Married-civ-spouse    0.083222
5                      hours.per.week    0.080032
3                        capital.gain    0.068505
2                       education.num    0.063482
51               relationship_Husband    0.052792
34       marital.status_Never-married    0.043863
4                        capital.loss    0.022035
40         occupation_Exec-managerial    0.0