## Importing libraries

In [1]:
import numpy as np
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

## Reading data into our model using document on website


In [2]:
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 

# variable information 
print(adult.variables) 
#checking the output data
print(y)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

## Droping the missing values to clean the data

In [3]:
#concat features to delete the full row (we could fill the missing data put i pereferd to drop cause i dont know i could ruin data)
# Concatenate features and targets
data = pd.concat([X, y], axis=1)

#Remove dots in the target column like 50k>. to 50k>
last_column = data.columns[-1]
data[last_column] = data[last_column].str.rstrip('.')

# Drop rows with missing values
data_cleaned = data.dropna()

# Separate features and targets again
X_cleaned = data_cleaned.drop('income', axis=1)  # replace 'target_column_name' with the actual name of your target column
y_cleaned = data_cleaned['income']


## Creating dummies for the catgorical data

In [4]:
X_cleaned_encoded = pd.get_dummies(X_cleaned)

## Spliting the data 

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_cleaned_encoded, y_cleaned, test_size=0.2, random_state=42)

## Creating the model

In [6]:
#Creating Model and fit 
model = GaussianNB()
model.fit(X_train, y_train)

## Prediction and creating confusion matrix

In [7]:
# Make predictions on the testing set
predictions = model.predict(X_test)

y_test = np.array(y_test)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("conf matrix",conf_matrix)

tp = conf_matrix[1, 1]
tn = conf_matrix[0, 0]
fp = conf_matrix[0, 1]
fn = conf_matrix[1, 0]

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Compute Sensitivity and Specificity
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print("Sensitivity :", sensitivity)
print("Specificity :", specificity)


conf matrix [[6806  364]
 [1648  707]]
Sensitivity : 0.30021231422505307
Specificity : 0.9492329149232915


## Posterior probability



In [8]:
posterior_probs = model.predict_proba(X_test)[:, 1]
print(posterior_probs)

[0.00928059 0.00264511 0.0116557  ... 0.00603148 1.         0.01547093]
