In [1]:
import graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score, auc, roc_curve


In [2]:
df_income_train = pd.read_csv("../dados/adults-income-uci/adult_data.csv", header=None, sep=',')
df_income_test = pd.read_csv("../dados/adults-income-uci/adult_test.csv", header=None, sep = ',')

df_income_train.columns = ["Age", "Workclass","FnlWgt", "Education", "Education_Num", "Marital_Status", "Occupation", "Relationship", "Race", "Sex", "Capital_Gain", "Capital_Loss", "Hours_per_Week", "Country", "Earns"]

df_income_test.columns = ["Age", "Workclass","FnlWgt", "Education", "Education_Num", "Marital_Status", "Occupation", "Relationship", "Race", "Sex", "Capital_Gain", "Capital_Loss", "Hours_per_Week", "Country", "Earns"]



In [3]:
encoder = LabelEncoder()

cat_columns = ["Workclass", "Education", "Marital_Status", "Occupation", "Relationship", "Race", "Sex", "Country"]

for col in cat_columns:
    df_income_train[col] = encoder.fit_transform(df_income_train[col])
    df_income_test[col]  = encoder.fit_transform(df_income_test[col])

df_income_train["Earns"] = df_income_train["Earns"].apply(lambda x: 1 if x.startswith(' >50K') else 0)
df_income_test["Earns"] = df_income_test["Earns"].apply(lambda x: 1 if x.startswith(' >50K') else 0)

X_train = df_income_train[df_income_train.columns[:-1]]
y_train = df_income_train[df_income_train.columns[-1]]
X_test = df_income_test[df_income_test.columns[:-1]]
y_test = df_income_test[df_income_test.columns[-1]]

In [4]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=5)
acc = []

kf = KFold(n_splits=10, shuffle=True, random_state=42)   

for i, (train_index, test_index) in enumerate(kf.split(df_income_train)):
    
    tree.fit(df_income_train.iloc[train_index, :-1], df_income_train.iloc[train_index, -1])
    y_pred = tree.predict(df_income_train.iloc[test_index, :-1])
    y_true = df_income_train.iloc[test_index, -1]
    accuracy = accuracy_score(y_true, y_pred)
    print(print(f"Fold {i+1}: {accuracy}"))
    acc.append(accuracy)
    
print(f"Mean and std of accuracies: {np.array(acc).mean()} - {np.array(acc).std()}")

dot_data = export_graphviz(tree, out_file=None, 
                           feature_names=df_income_train.columns[:-1], 
                           class_names=['<=50K', '>50K'],  # Replace with your class labels
                           filled=True, rounded=True)

graph = graphviz.Source(dot_data)
graph.render("decision_tree")
graph.view() 

print(confusion_matrix(y_test, tree.predict(X_test)))
print(accuracy_score(y_test, tree.predict(X_test)))

print(tree.feature_importances_)

Fold 1: 0.8584587043291373
None
Fold 2: 0.8513513513513513
None
Fold 3: 0.8488943488943489
None
Fold 4: 0.8381449631449631
None
Fold 5: 0.8513513513513513
None
Fold 6: 0.8525798525798526
None
Fold 7: 0.8522727272727273
None
Fold 8: 0.8522727272727273
None
Fold 9: 0.855958230958231
None
Fold 10: 0.8482800982800983
None
Mean and std of accuracies: 0.8509564355434789 - 0.005130920477991151
[[11856   579]
 [ 1916  1930]]
0.8467538848965052
[3.46486680e-02 3.31877883e-04 1.23930686e-03 0.00000000e+00
 2.28523818e-01 0.00000000e+00 4.01301366e-03 4.63028659e-01
 0.00000000e+00 0.00000000e+00 2.33899221e-01 2.25454570e-02
 1.17699782e-02 0.00000000e+00]


In [5]:
ensemble = RandomForestClassifier()

ensemble.fit(X_train, y_train)

print(confusion_matrix(y_test, ensemble.predict(X_test)))
print(accuracy_score(y_test, ensemble.predict(X_test)))

[[11591   844]
 [ 1526  2320]]
0.8544315459738345
