In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Step 1: Download Census data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data = pd.read_csv(url, names=column_names, na_values=' ?', skipinitialspace=True)

# Step 2: Preprocessing
data.dropna(inplace=True)
X = data[['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']]
y = data['income']
X = pd.get_dummies(X)  # Convert categorical variables into dummy/indicator variables

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Train Naive Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Step 5: Predictions
y_pred = nb_classifier.predict(X_test)

# Step 6: Compute Sensitivity and Specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

# Step 7: Compute the posterior probability of making over 50K a year
posterior_probabilities = nb_classifier.predict_proba(X_test)
index_over_50k = nb_classifier.classes_.tolist().index('>50K')
posterior_prob_over_50k = posterior_probabilities[:, index_over_50k]
print("Posterior probability of making over 50K a year:", posterior_prob_over_50k)


Sensitivity: 0.8292999135695764
Specificity: 0.7813547954393025
Posterior probability of making over 50K a year: [2.95609603e-06 9.46590997e-01 9.53379784e-01 ... 3.22947297e-06
 9.85945065e-01 1.00000000e+00]
