In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv('adult.csv')

print(df.head())


   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [3]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
              'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
              'hours-per-week', 'native-country', 'income']


In [4]:
df.replace(' ?', pd.NA, inplace=True)

df.dropna(inplace=True)


In [5]:
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                       'relationship', 'race', 'sex', 'native-country', 'income']

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [6]:
print(df)

       age  workclass  fnlwgt  education  education-num  marital-status  \
0       25          4  226802          1              7               4   
1       38          4   89814         11              9               2   
2       28          2  336951          7             12               2   
3       44          4  160323         15             10               2   
4       18          0  103497         15             10               4   
...    ...        ...     ...        ...            ...             ...   
48837   27          4  257302          7             12               2   
48838   40          4  154374         11              9               2   
48839   58          4  151910         11              9               6   
48840   22          4  201490         11              9               4   
48841   52          5  287927         11              9               2   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
0               7        

In [7]:
X = df.drop('income', axis=1)
y = df['income']


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image, display


In [11]:
import numpy as np

In [12]:
def bootstrap_evaluate(X, y, n_bootstraps=100):
    accuracies = []
    for i in range(n_bootstraps):
        indices = np.random.randint(0, len(X), len(X))

        bootstrap_X, bootstrap_y = X.iloc[indices], y.iloc[indices]

        X_train, X_test, y_train, y_test = train_test_split(bootstrap_X, bootstrap_y, test_size=0.2, random_state=42)

        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    return mean_accuracy, std_accuracy

mean_acc, std_acc = bootstrap_evaluate(X, y, n_bootstraps=10)

X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X, y, test_size=0.2, random_state=42)

rf_final = RandomForestClassifier(n_estimators=100, random_state=42)
rf_final.fit(X_train_final, y_train_final)

y_pred_final = rf_final.predict(X_test_final)

print(f"Final Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test_final, y_pred_final)}")
print(f"Classification Report:\n{classification_report(y_test_final, y_pred_final)}")
print(f"Mean Accuracy (Bootstrapping): {mean_acc}")
print(f"Standard Deviation of Accuracy (Bootstrapping): {std_acc}")

Final Model Evaluation:
Accuracy: 0.8639574163169209
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7479
           1       0.74      0.64      0.69      2290

    accuracy                           0.86      9769
   macro avg       0.82      0.79      0.80      9769
weighted avg       0.86      0.86      0.86      9769

Mean Accuracy (Bootstrapping): 0.9363292046268809
Standard Deviation of Accuracy (Bootstrapping): 0.002042167810754414
