# Categorical Naive Bayes

In [1]:
import numpy as np
from pathlib import Path

from src.models import CategoricalNaiveBayes
from src.utils import train_test_split, classification_report

# Data Prep

## Read data

In [2]:
data_path = Path("./data/processed/processed_train.csv")

# The first line of the CSV holds the column names; keep them separately
# because the array below is loaded with the header row skipped.
with open(data_path, "r", encoding="utf-8") as f:
    data_header = f.readline().strip().split(",")

# Load every field as a string. The encoding is passed explicitly so the body
# is decoded the same way as the header above instead of relying on NumPy's
# default, which differs between NumPy versions and platforms.
data = np.genfromtxt(
    data_path, delimiter=",", skip_header=1, dtype=str, encoding="utf-8"
)
data[:5]

array([['city_103', 'Male', 'Has relevent experience', 'no_enrollment',
        'Graduate', 'STEM', '>20', 'Missing', 'Missing', '1', '1.0'],
       ['city_40', 'Male', 'No relevent experience', 'no_enrollment',
        'Graduate', 'STEM', '15', '50-99', 'Pvt Ltd', '>4', '0.0'],
       ['city_21', 'Missing', 'No relevent experience',
        'Full time course', 'Graduate', 'STEM', '5', 'Missing',
        'Missing', 'never', '0.0'],
       ['city_115', 'Missing', 'No relevent experience', 'no_enrollment',
        'Graduate', 'Business Degree', '<1', 'Missing', 'Pvt Ltd',
        'never', '1.0'],
       ['city_162', 'Male', 'Has relevent experience', 'no_enrollment',
        'Masters', 'STEM', '>20', '50-99', 'Funded Startup', '4', '0.0']],
      dtype='<U23')

## Split data

The data is split into 80% training rows and 20% validation rows.

In [3]:
# Every column except the last is treated as a feature; the last column
# (the '0.0' / '1.0' values in the preview) is the target.
features = data[:, :-1]
labels = data[:, -1]

# test_size=0.2 holds out the validation portion; random_state is fixed,
# presumably so the split is reproducible between runs (confirm against
# src.utils.train_test_split).
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Modeling

In [4]:
# Build the project's CategoricalNaiveBayes with no constructor arguments.
model = CategoricalNaiveBayes()
# Fit on the training split only; the validation rows stay unseen until evaluation.
model.fit(X_train, y_train)

In [5]:
# Run the fitted model on the held-out validation features; the result is
# compared against y_val in the next cell.
y_pred = model.predict(X_val)

In [6]:
# Compare true vs. predicted validation labels. Judging by the cell output,
# the helper prints a per-class summary and also returns the metrics as a
# dict, which is displayed as the cell's value.
classification_report(y_val, y_pred)

Class 0.0: Precision: 0.7474, Recall: 0.7284, F1-Score: 0.7378
Class 1.0: Precision: 0.7352, Recall: 0.7538, F1-Score: 0.7444
Overall Accuracy: 0.7411


{'0.0': {'precision': 0.7474134855511951,
  'recall': 0.728442280945758,
  'f1_score': 0.7378059517520691},
 '1.0': {'precision': 0.735164462529671,
  'recall': 0.7538247566063978,
  'f1_score': 0.7443776824034334},
 'accuracy': 0.7411335187760779}