In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
           'hours-per-week', 'native-country', 'income']

data = pd.read_csv(url, names=columns, sep=r'\s*,\s*', engine='python')

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.2,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   fnlwgt          30162 non-null  int64 
 3   education       30162 non-null  object
 4   education-num   30162 non-null  int64 
 5   marital-status  30162 non-null  object
 6   occupation      30162 non-null  object
 7   relationship    30162 non-null  object
 8   race            30162 non-null  object
 9   sex             30162 non-null  object
 10  capital-gain    30162 non-null  int64 
 11  capital-loss    30162 non-null  int64 
 12  hours-per-week  30162 non-null  int64 
 13  native-country  30162 non-null  object
 14  income          30162 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# Preprocess the data
# Handle missing values by replacing '?' with NaN and dropping those rows
data.replace('?', pd.NA, inplace=True)
data.dropna(inplace=True)

In [7]:
# Function to detect outliers using IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]

# Detect outliers in capital-gain and hours-per-week
outliers_capital_gain = detect_outliers_iqr(data, 'capital-gain')
outliers_hours_per_week = detect_outliers_iqr(data, 'hours-per-week')

In [8]:
# Encode categorical features using LabelEncoder
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [9]:
# Split data into features and target
X = data.drop('income', axis=1)  # Features
y = data['income']  # Target (class)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# Train the XGBoost Classifier
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')  # Suppress warning on label encoding
xgb_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=None, ...)

In [16]:
# Predict on the test set and evaluate accuracy
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 86.89%


In [17]:
# Function to predict the class of a new user
def predict_user_class(age, workclass, education, marital_status, occupation, relationship, race, sex, hours_per_week):
    user_data = pd.DataFrame({
        'age': [age],
        'workclass': [label_encoders['workclass'].transform([workclass])[0]],
        'fnlwgt': [0],  # Placeholder, not used for prediction
        'education': [label_encoders['education'].transform([education])[0]],
        'education-num': [0],  # Placeholder, not used for prediction
        'marital-status': [label_encoders['marital-status'].transform([marital_status])[0]],
        'occupation': [label_encoders['occupation'].transform([occupation])[0]],
        'relationship': [label_encoders['relationship'].transform([relationship])[0]],
        'race': [label_encoders['race'].transform([race])[0]],
        'sex': [label_encoders['sex'].transform([sex])[0]],
        'capital-gain': [0],  # Placeholder
        'capital-loss': [0],  # Placeholder
        'hours-per-week': [hours_per_week],
        'native-country': [0]  # Placeholder
    })

    # Predict the class (income level)
    prediction = xgb_clf.predict(user_data)
    income_class = label_encoders['income'].inverse_transform(prediction)[0]
    return f'Predicted income class: {income_class}'

# Example of using the prediction function
user_prediction = predict_user_class(age=30, workclass='Private', education='Bachelors', marital_status='Never-married',
                                     occupation='Tech-support', relationship='Not-in-family', race='White', sex='Male',
                                     hours_per_week=40)
print(user_prediction)

Predicted income class: <=50K
