In [34]:
import graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score


In [35]:
df_income_train = pd.read_csv("../dados/adults-income-uci/adult_data.csv", header=None, sep=',')
df_income_test = pd.read_csv("../dados/adults-income-uci/adult_test.csv", header=None, sep = ',')

df_income_train.columns = ["Age", "Workclass","FnlWgt", "Education", "Education_Num", "Marital_Status", "Occupation", "Relationship", "Race", "Sex", "Capital_Gain", "Capital_Loss", "Hours_per_Week", "Country", "Earns"]

df_income_test.columns = ["Age", "Workclass","FnlWgt", "Education", "Education_Num", "Marital_Status", "Occupation", "Relationship", "Race", "Sex", "Capital_Gain", "Capital_Loss", "Hours_per_Week", "Country", "Earns"]



In [36]:
encoder = LabelEncoder()

cat_columns = ["Workclass", "Education", "Marital_Status", "Occupation", "Relationship", "Race", "Sex", "Country"]

for col in cat_columns:
    df_income_train[col] = encoder.fit_transform(df_income_train[col])
    df_income_test[col]  = encoder.fit_transform(df_income_test[col])

df_income_train["Earns"] = df_income_train["Earns"].apply(lambda x: 1 if x.startswith(' >50K') else 0)
df_income_test["Earns"] = df_income_test["Earns"].apply(lambda x: 1 if x.startswith(' >50K') else 0)

X_train = df_income_train[df_income_train.columns[:-1]]
y_train = df_income_train[df_income_train.columns[-1]]
X_test = df_income_test[df_income_test.columns[:-1]]
y_test = df_income_test[df_income_test.columns[-1]]

In [37]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=5)
tree.fit(df_income_train[df_income_train.columns[:-1]], df_income_train[df_income_train.columns[-1]])

kf = KFold(n_splits=10, shuffle=True, random_state=42)

scores = cross_val_score(tree, X_train, y_train, cv=kf)

print("Cross-Validation Scores:", scores)
print("Mean Score:", scores.mean())
print("Standard Deviation:", scores.std())

dot_data = export_graphviz(tree, out_file=None, 
                           feature_names=df_income_train.columns[:-1], 
                           class_names=['<=50K', '>50K'],  # Replace with your class labels
                           filled=True, rounded=True)

graph = graphviz.Source(dot_data)
graph.render("decision_tree")
graph.view() 

print(confusion_matrix(y_test, tree.predict(X_test)))
print(accuracy_score(y_test, tree.predict(X_test)))

Cross-Validation Scores: [0.8584587  0.85135135 0.84889435 0.83814496 0.85135135 0.85288698
 0.85257985 0.85227273 0.85595823 0.84889435]
Mean Score: 0.851079285666329
Standard Deviation: 0.005120510892602048
[[11855   580]
 [ 1839  2007]]
0.8514219028315214


In [38]:
ensemble = RandomForestClassifier()

ensemble.fit(X_train, y_train)

print(confusion_matrix(y_test, ensemble.predict(X_test)))
print(accuracy_score(y_test, ensemble.predict(X_test)))

[[11605   830]
 [ 1521  2325]]
0.8555985504575886
