# INFO-H420 Management of Data Science and Business Workflows

## Project on Responsible Data Science

## 1. Classifier

In [None]:
import pandas as pd
import numpy as np
from aif360.datasets import AdultDataset
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

### Load and preprocess the dataset

In [None]:
# Load the Adult dataset through the AIF360 wrapper class.
dataset = AdultDataset()

# convert_to_dataframe() returns more than one object; only the first
# element (the DataFrame) is needed here.
df = dataset.convert_to_dataframe()[0]

# Binarize age around the median: 0 when age <= median, 1 otherwise.
# np.where evaluates the comparison for the whole column in one vectorized
# pass instead of invoking a Python lambda once per row.
median_age = df['age'].median()
df['age_binary'] = np.where(df['age'] <= median_age, 0, 1)
df = df.drop(columns='age')

# One-hot encode any remaining non-numeric columns; drop_first removes one
# indicator per encoded feature so the dummies are not redundant.
# NOTE(review): AdultDataset appears to one-hot encode its categorical
# features on load, so this call may be a no-op here -- confirm.
df = pd.get_dummies(df, drop_first=True)

# Quick visual sanity check of the resulting columns.
print(df.head())


### Split the Data and Train the Model

In [None]:
# Separate the target column (y) from the predictor columns (X)
y = df['income-per-year']
X = df.drop(columns=['income-per-year'])

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a gradient-boosted tree classifier on the training portion
model = XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

### Test the Model

In [None]:
# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Compute the evaluation metrics first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)