In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from xgboost import XGBClassifier

# Load dataset

In [3]:
# Read the CSV without a header row, so columns are addressed by position.
dataset = pd.read_csv('adult.csv', header=None)

# Every column except the last is a feature; the last column is the target.
feature_columns = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X, y = feature_columns.values, target_column.values

# Handle missing values

In [4]:
# Fill NaN entries with the most frequent value of their column.
# The slice starts at column 1, so column 0 is never imputed.
# NOTE(review): only np.nan is treated as missing here. If the CSV marks
# missing entries with a placeholder string (e.g. '?'), those values pass
# through untouched -- confirm against the raw data.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X[:, 1:] = imputer.fit(X[:, 1:]).transform(X[:, 1:])

# Encode categorical features

In [5]:
# Positions (within X) of the columns that hold categorical values.
categorical_indices = [1, 3, 5, 6, 7, 8, 9, 13]

# One independent encoder per categorical column, keyed by column index.
label_encoders = {index: LabelEncoder() for index in categorical_indices}

# Fit each encoder on its own column and overwrite that column in X with the
# resulting integer codes.
for index, le in label_encoders.items():
    X[:, index] = le.fit_transform(X[:, index])


# Encode target variable

In [6]:
# Learn the set of target labels, then replace y with their integer codes.
# The fitted encoder is kept so the codes can be related back to the labels.
target_encoder = LabelEncoder()
target_encoder.fit(y)
y = target_encoder.transform(y)

# Split data into training and testing sets


In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, Y_train, Y_test = split_parts


# Feature scaling


In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data does not influence them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train the XGBoost classifier


In [9]:
# Gradient-boosted tree classifier with library-default hyperparameters,
# trained on the scaled training split.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=Y_train)

# Make predictions and evaluate


In [10]:

# Predict the held-out rows, then score the predictions against the true
# test labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)

In [11]:
# Accuracy is reported rounded to two decimal places.
accuracy_line = f"Accuracy: {accuracy:.2f}"

# Matrix first, then the summary figure.
print("Confusion Matrix:\n", cm)
print(accuracy_line)

Confusion Matrix:
 [[4568  350]
 [ 529 1066]]
Accuracy: 0.87


# Perform cross-validation

In [12]:
# 10-fold cross-validation on the training split; one score per fold.
# NOTE(review): X_train was already scaled with statistics computed from the
# whole training split, so each validation fold has contributed to the
# preprocessing of its own training folds.
accuracies = cross_val_score(classifier, X_train, Y_train, cv=10)

# Report the mean and spread of the fold scores as percentages.
print(f"Cross-Validation Accuracy: {accuracies.mean() * 100:.2f}%")
print(f"Standard Deviation: {accuracies.std() * 100:.2f}%")

Cross-Validation Accuracy: 87.05%
Standard Deviation: 0.74%


# Compare predictions with actual results


In [13]:
# Turn both label vectors into single-column arrays and place them side by
# side: column 0 is the prediction, column 1 is the true label.
predicted_column = y_pred.reshape(-1, 1)
actual_column = Y_test.reshape(-1, 1)
comparison = np.concatenate((predicted_column, actual_column), axis=1)
print("Predicted vs Actual:\n", comparison)


Predicted vs Actual:
 [[0 0]
 [0 0]
 [0 0]
 ...
 [1 1]
 [0 0]
 [1 1]]


# Predict for a new individual

In [14]:
# A single row of 14 feature values, in the same column order as X.
# The values at the positions listed in categorical_indices are passed as
# integer codes, since the scaler and classifier were fit on encoded data.
sample_data = [
    [40, 4, 80000, 9, 9, 0, 4, 0, 4, 1, 0, 1000, 50, 39],
]

# Apply the scaling learned from the training split before predicting.
sample_data_scaled = scaler.transform(sample_data)
result = classifier.predict(sample_data_scaled)

In [15]:
# `result` is an array with one prediction per input row. Pull out the single
# prediction as a scalar instead of relying on the truth value of an
# array-to-list comparison, which raises ValueError when the array holds more
# than one element.
# NOTE(review): assumes encoded class 0 is the lower income bracket -- verify
# against target_encoder.classes_.
if result[0] == 0:
    print("Person makes Below 50K/year")
else:
    print("Person makes Above 50K/year")

Person makes Above 50K/year
