In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# Step 1: Data Loading
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data = pd.read_csv(url, names=column_names, na_values=' ?', skipinitialspace=True)

In [16]:
print(data_encoded.columns)


Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Never-worked', 'workclass_Private',
       ...
       'native-country_Puerto-Rico', 'native-country_Scotland',
       'native-country_South', 'native-country_Taiwan',
       'native-country_Thailand', 'native-country_Trinadad&Tobago',
       'native-country_United-States', 'native-country_Vietnam',
       'native-country_Yugoslavia', 'income_>50K'],
      dtype='object', length=101)


In [20]:
# Step 2: Data Preprocessing
# Drop rows with missing values
data.dropna(inplace=True)

# Encode categorical variables
data_encoded = pd.get_dummies(data, drop_first=True)

# Split data into features and target variable
X = data_encoded.drop('income_>50K', axis=1)

y = data_encoded['income_>50K']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Model Training and Evaluation - Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions
nb_pred = nb_model.predict(X_test)


In [21]:
# Evaluate model
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_report = classification_report(y_test, nb_pred)

In [22]:
# Step 3: Model Training and Evaluation - Random Forest Classification
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


In [23]:
# Make predictions
rf_pred = rf_model.predict(X_test)

In [24]:
# Evaluate model
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)

In [25]:
# Step 4: Comparison
print("Naïve Bayes Classifier Performance:")
print("Accuracy:", nb_accuracy)
print("Classification Report:\n", nb_report)

Naïve Bayes Classifier Performance:
Accuracy: 0.7990173499155535
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.95      0.88      4942
           1       0.68      0.32      0.43      1571

    accuracy                           0.80      6513
   macro avg       0.75      0.64      0.66      6513
weighted avg       0.78      0.80      0.77      6513



In [26]:
print("\nRandom Forest Classifier Performance:")
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)


Random Forest Classifier Performance:
Accuracy: 0.8582834331337326
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      4942
           1       0.74      0.63      0.68      1571

    accuracy                           0.86      6513
   macro avg       0.82      0.78      0.80      6513
weighted avg       0.85      0.86      0.85      6513

