# Assignment 6 — Model Performance Evaluation
Decision Tree | Adult Census Income Prediction

## Import Libraries

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, roc_auc_score, confusion_matrix, classification_report
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


## Load Adult Census Dataset

In [None]:

# Location of the Adult Census CSV; pandas fetches it directly over HTTP.
url = "https://raw.githubusercontent.com/selva86/datasets/master/Adult.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to sanity-check the load.
df.head(n=5)


## Basic Info

In [None]:

# Dimensions as (n_rows, n_columns), followed by the column labels.
(df.shape, df.columns)


## Define Features and Target

In [None]:

# Binary target: 1 where the income label is exactly '>50K', otherwise 0.
y = df['income'].eq('>50K').astype(int)

# Feature matrix: every column except the label itself.
X = df.drop(columns='income')

(X.head(), y.head())


## Identify Categorical and Numerical Columns

In [None]:

# Columns stored with object dtype are treated as categorical ...
cat_cols = X.select_dtypes(include='object').columns
# ... and every remaining column is treated as numerical.
num_cols = X.select_dtypes(exclude='object').columns

(cat_cols, num_cols)


## Preprocessing + Decision Tree Pipeline

In [None]:

# One-hot encode the categorical columns; all other columns are passed
# through unchanged. handle_unknown='ignore' makes a category that was not
# seen during fit encode as all zeros instead of raising at predict time.
preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='passthrough')

# Bundling the encoder with the tree means a single fit() call learns the
# encoding and the classifier from the same rows, so the encoder never sees
# data outside the training partition.
model = Pipeline([
    ('prep', preprocess),
    ('tree', DecisionTreeClassifier(random_state=42))
])

# Hold out 30% of the rows for evaluation. stratify=y keeps the class
# proportions of y the same in both partitions, so the hold-out metrics are
# not skewed by a chance imbalance introduced by the random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

model.fit(X_train, y_train)

# Hard class predictions on the held-out rows, reused by the metric cells.
y_pred = model.predict(X_test)


## Accuracy

In [None]:

# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)


## Precision, Recall, and F1

In [None]:

# Evaluate the three metrics in order on the same held-out predictions,
# each with its default arguments.
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

(precision, recall, f1)


## Confusion Matrix

In [None]:

# confusion_matrix returns rows indexed by the true class and columns by the
# predicted class; the heatmap draws rows on the y-axis and columns on the
# x-axis, so the axes are labelled to match.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.show()


## ROC Curve & AUC

In [None]:

# Second column of predict_proba: the predicted probability of the second
# class in the fitted model's class ordering, used as the ranking score.
y_prob = model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

# Plot the model's curve with its AUC in the legend, plus the diagonal that a
# random-guess classifier would trace, so the curve can be read against a
# baseline.
plt.plot(fpr, tpr, label=f"Decision Tree (AUC = {auc:.3f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Chance")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()

auc


## K-Fold Cross Validation

In [None]:

# 5-fold cross-validated accuracy of the whole pipeline over all rows; the
# pipeline is refit from scratch on each training fold.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

# Mean and standard deviation (ddof=0) of the per-fold accuracies.
(np.mean(scores), np.std(scores))


## Discussion
Write your interpretation and conclusions here.