In [2]:
import os

import numpy as np
import pandas as pd
import sklearn.preprocessing
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Assignment 1 - Naive Bayes

## Question 1 - Implementation

In [19]:
class MyGaussianNB(BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes classifier implemented from scratch.

    Every feature is modelled, separately for each class, as a normal
    distribution whose mean and population variance (``ddof=0``) are
    estimated from the training data. Prediction picks the class with the
    highest posterior, evaluated in log space so that combining many
    densities can neither underflow to zero nor overflow.

    A feature that is constant within a class (variance 0) is treated as a
    point mass: it contributes probability 1 when the sample matches the
    training value exactly and probability 0 otherwise.

    Attributes set by ``fit``:
        priors: dict mapping class label -> prior probability, ordered from
            the most to the least frequent class.
        means: dict mapping class label -> array of per-feature means.
        vars: dict mapping class label -> array of per-feature variances.
        classes_: sorted array of the distinct class labels.
    """

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Parameters:
            X: array-like of shape (n_samples, n_features), numeric.
            y: array-like of n_samples class labels.

        Returns:
            self, so that calls can be chained.

        Raises:
            ValueError: if X is not 2-D, is empty, or disagrees with y on
                the number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.ravel(np.asarray(y))

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        # Priors: relative frequency of each class. value_counts() sorts by
        # descending count, so the dict order is most-frequent-first;
        # predict() relies on that order for tie-breaking.
        n_samples = y.shape[0]
        tally = pd.Series(y).value_counts()
        self.priors = {cls: count / n_samples for cls, count in tally.items()}
        self.classes_ = np.unique(y)

        # Likelihood parameters: one normal distribution per
        # (class, feature) pair.
        self.means = dict()
        self.vars = dict()

        for cls in tally.index:
            cls_data = X[y == cls]

            # Detect constant columns from the data itself rather than from
            # np.var(), whose result for a constant column can be a tiny
            # non-zero number because of floating-point rounding.
            constant = cls_data.max(axis=0) == cls_data.min(axis=0)

            self.means[cls] = np.where(constant, cls_data[0], cls_data.mean(axis=0))
            self.vars[cls] = np.where(constant, 0.0, cls_data.var(axis=0))

        return self

    def _log_posteriors(self, X):
        """Return the unnormalised log posterior of each sample for each class.

        The result has one row per sample and one column per class; the
        columns follow the iteration order of ``self.priors``.
        """
        scores = np.empty((X.shape[0], len(self.priors)))

        for column, (cls, prior) in enumerate(self.priors.items()):
            mean = self.means[cls]
            var = self.vars[cls]

            if X.shape[1] != mean.shape[0]:
                raise ValueError(
                    "X has %d features, but the model was fitted with %d"
                    % (X.shape[1], mean.shape[0])
                )

            degenerate = var == 0
            regular = ~degenerate

            log_p = np.empty(X.shape)

            # log of the normal density: -0.5*log(2*pi*var) - (x-mean)^2 / (2*var)
            log_p[:, regular] = (
                -0.5 * np.log(2 * np.pi * var[regular])
                - (X[:, regular] - mean[regular]) ** 2 / (2 * var[regular])
            )

            # Zero variance: point mass at the training value
            # (log 1 = 0 on a match, log 0 = -inf otherwise).
            log_p[:, degenerate] = np.where(
                X[:, degenerate] == mean[degenerate], 0.0, -np.inf
            )

            scores[:, column] = np.log(prior) + log_p.sum(axis=1)

        return scores

    def predict(self, X):
        """Predict the most probable class for every row of X.

        Parameters:
            X: array-like of shape (n_samples, n_features), numeric.

        Returns:
            Array of n_samples predicted labels.

        Raises:
            ValueError: if X is not 2-D or its feature count differs from
                the one seen in ``fit``.
        """
        # Labels in prior order (most frequent first).
        labels = np.array(list(self.priors))

        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            return labels[:0]
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")

        scores = self._log_posteriors(X)

        # A NaN score (e.g. NaN in the input) carries no evidence for that
        # class; treat it as impossible, as a failed comparison would.
        scores = np.where(np.isnan(scores), -np.inf, scores)

        # argmax returns the first maximum. Because the columns are ordered
        # by descending prior, ties resolve towards the more frequent class.
        # A sample that is impossible under every class (all -inf, which can
        # happen with zero-variance features) therefore falls back to the
        # class with the highest prior instead of an arbitrary one.
        return labels[np.argmax(scores, axis=1)]

    def predict_proba(self, X):
        """Not implemented (the assignment did not require it); returns None."""
        return None

## Utilities for Testing and Comparison to Sklearn

Data prep:

  - Limit class values

  - Convert to numerical
  
  - Train-test-split
  
  - Normalize



In [20]:
def prep_data(path, target_feature, allowed_classes=None, allowed_features=None,
              test_size=0.5, random_state=2):
    """Load a CSV dataset and turn it into scaled train/test arrays.

    Steps: read the file, optionally restrict classes and features, one-hot
    encode the non-numeric feature columns, ordinal-encode the target,
    split into train/test, then min-max scale using the training split only.

    Parameters:
        path: CSV file to read; its first column is used as the index.
        target_feature: name of the column holding the class label.
        allowed_classes: if given, only rows whose target value is in this
            collection are kept.
        allowed_features: if given, only these feature columns (plus the
            target) are kept, in the order given.
        test_size: passed to ``train_test_split`` (default 0.5).
        random_state: passed to ``train_test_split`` (default 2).

    Returns:
        Tuple ``(data, numeric_data, X_train, X_test, y_train, y_test)``
        where ``data`` is the filtered frame, ``numeric_data`` the encoded
        frame, and the remaining items are NumPy arrays.
    """
    # Read in the data
    data = pd.read_csv(path, index_col=0)

    # Only keep allowed classes
    if allowed_classes is not None:
        data = data[data[target_feature].isin(allowed_classes)]

    # Only keep allowed features. A list is required here: pandas 2.0
    # rejects a set as an indexer, and a set would also make the column
    # order vary from run to run. dict.fromkeys() removes duplicates while
    # preserving order.
    if allowed_features is not None:
        keep = list(dict.fromkeys([*allowed_features, target_feature]))
        data = data[keep]

    # Convert non-numeric feature columns to one-hot columns. The target is
    # excluded from both groups; it is encoded separately below. Lists keep
    # the original column order so the result is deterministic.
    feature_cols = [col for col in data.columns if col != target_feature]
    numeric_cols = set(data[feature_cols].select_dtypes(np.number).columns)
    nonnumeric_cols = [col for col in feature_cols if col not in numeric_cols]

    numeric_data = pd.get_dummies(data, columns=nonnumeric_cols)

    encoder = sklearn.preprocessing.OrdinalEncoder()
    numeric_data[[target_feature]] = encoder.fit_transform(numeric_data[[target_feature]])

    # Feature matrix as floats (dummy columns may be bool/uint8 depending on
    # the pandas version) and the target as a flat 1-D array, which is what
    # both sklearn and MyGaussianNB expect.
    X = numeric_data.drop(target_feature, axis=1).to_numpy(dtype=float)
    y = numeric_data[target_feature].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    # Normalization is done AFTER the train-test split: the scaler is fitted
    # on the training data only, so nothing about the test data leaks into
    # the preprocessing.
    scaler = sklearn.preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return data, numeric_data, X_train, X_test, y_train, y_test


Model testing and comparison

In [21]:
def test_model(model_supplier, data):
    
    _, _, X_train, X_test, y_train, y_test = data
    
    model = model_supplier()
    model.fit(X_train,y_train)
    
    return model.score(X_test, y_test)

def compare_models(data):
    """Print the test score of sklearn's GaussianNB and of MyGaussianNB on ``data``."""
    sklearn_score = test_model(GaussianNB, data)
    print("SKLearn GNB:", sklearn_score)

    own_score = test_model(MyGaussianNB, data)
    print("My GNB:", own_score)

## Question 2 - Test with 2 datasets

I tested with a variety of datasets. The two below are the "submitted" ones, and then later there is a cell with the results of the full set.

Penguins (with a restricted feature set) and Diabetes give exactly the same scores for my model and sklearn's, which is what we expect.

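There is an interesting difference when all the penguin features are used: my model's score stays much the same, but sklearn's score drops by about 0.2, which is a major difference. I don't know exactly why. I looked at the sklearn source and found that it sums log-probabilities rather than multiplying probabilities. It does call the actual log function, though, with no shortcuts. Perhaps there are shortcuts elsewhere? One likely candidate is sklearn's `var_smoothing` parameter, which adds a small fraction of the largest feature variance to every variance, whereas my implementation treats a zero-variance feature (common among the one-hot encoded columns) as an exact-match test.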

#### Penguins

The two work identically for this data set when the feature set is restricted.

In [22]:
# Adelie vs. Chinstrap, restricted to four measurement features.
compare_models(
    prep_data(
        path="penguins_af.csv",
        target_feature="species",
        allowed_classes=["Adelie", "Chinstrap"],
        allowed_features=[
            "bill_length_mm",
            "bill_depth_mm",
            "flipper_length_mm",
            "body_mass_g",
        ],
    )
)

SKLearn GNB: 0.9626168224299065
My GNB: 0.9626168224299065


Expected results:
```
SKLearn GNB: 0.9626168224299065
My GNB: 0.9626168224299065
```

However, when the features are not restricted, they no longer match!

In [23]:
# Adelie vs. Chinstrap again, this time with every feature column.
compare_models(
    prep_data(
        path="penguins_af.csv",
        target_feature="species",
        allowed_classes=["Adelie", "Chinstrap"],
    )
)

SKLearn GNB: 0.7383177570093458
My GNB: 0.9532710280373832


#### Diabetes

The two work identically (with all features), which is what we expect. 

In [24]:
# Diabetes: all classes and all features.
compare_models(
    prep_data(
        path="diabetes.csv",
        target_feature="neg_pos",
    )
)

SKLearn GNB: 0.7473958333333334
My GNB: 0.7473958333333334


Expected results:
```
SKLearn GNB: 0.7473958333333334
My GNB: 0.7473958333333334
```

## Other Datasets

I tested with some of the other datasets in the general data folder on brightspace.

For most of them, sklearn's implementation and mine perform identically.

For a few, my implementation scores better (glassV2, MamMass, penguins_af, restaurant). <br>
I don't know why.

In [15]:
# Name of the target (class label) column for each dataset file.
target_features = {
    "ApplesPears.csv": "Class",
    "AthleteSelection.csv": "Selected",
    "AthleteSelection1.csv": "Selected",
    "diabetes.csv": "neg_pos",
    "Forecast.csv": "Go-Out",
    "glassV2.csv": "Type",
    "Household.csv": "Category",
    "MamMass.csv": "Severity",
    "penguins_af.csv": "species",
    "restaurant.csv": "WillWait?",
    "survival.csv": "Class",
    "Swimming.csv": "Swimming",
    "vehicle.csv": "TYPE",
    "wine.csv": "class"
}

# Run the comparison on every dataset that is present. A failure on one
# dataset is reported and the sweep continues with the next one.
for path, target in target_features.items():

    print(path)

    if not os.path.isfile(path):
        print("Cannot find file")

    else:
        try:
            compare_models(prep_data(path, target))
        except Exception as exc:
            # Catch Exception rather than using a bare except, so that
            # interrupting the kernel (KeyboardInterrupt) still stops the
            # loop. Report the error type and message on one line so the
            # dataset can be singled out for individual debugging.
            print(f"ERROR ({type(exc).__name__}: {exc})")

    print()

ApplesPears.csv
SKLearn GNB: 0.6
My GNB: 0.6

AthleteSelection.csv
SKLearn GNB: 0.8
My GNB: 0.7

AthleteSelection1.csv
SKLearn GNB: 0.8
My GNB: 0.7

diabetes.csv
SKLearn GNB: 0.7473958333333334
My GNB: 0.4869791666666667

Forecast.csv
SKLearn GNB: 0.7777777777777778
My GNB: 0.7777777777777778

glassV2.csv
SKLearn GNB: 0.3592233009708738
My GNB: 0.49514563106796117

Household.csv
SKLearn GNB: 0.0
My GNB: 0.0

MamMass.csv
SKLearn GNB: 0.5550935550935551
My GNB: 0.5821205821205822

penguins_af.csv
SKLearn GNB: 0.7544910179640718
My GNB: 0.8982035928143712

restaurant.csv
SKLearn GNB: 0.3333333333333333
My GNB: 0.5

survival.csv
SKLearn GNB: 0.7254901960784313
My GNB: 0.45751633986928103

Swimming.csv
SKLearn GNB: 0.6
My GNB: 0.6

vehicle.csv
SKLearn GNB: 0.42080378250591016
My GNB: 0.42080378250591016

wine.csv
SKLearn GNB: 0.9662921348314607
My GNB: 0.6966292134831461



Expected output:

```
ApplesPears.csv
SKLearn GNB: 0.6
My GNB: 0.6

AthleteSelection.csv
SKLearn GNB: 0.8
My GNB: 0.8

AthleteSelection1.csv
SKLearn GNB: 0.8
My GNB: 0.8

diabetes.csv
SKLearn GNB: 0.7473958333333334
My GNB: 0.7473958333333334

Forecast.csv
SKLearn GNB: 0.7777777777777778
My GNB: 0.7777777777777778

glassV2.csv
SKLearn GNB: 0.3592233009708738
My GNB: 0.6116504854368932

Household.csv
SKLearn GNB: 0.0
My GNB: 0.0

MamMass.csv
SKLearn GNB: 0.5550935550935551
My GNB: 0.6923076923076923

penguins_af.csv
SKLearn GNB: 0.7544910179640718
My GNB: 0.9700598802395209

restaurant.csv
SKLearn GNB: 0.3333333333333333
My GNB: 0.5

survival.csv
SKLearn GNB: 0.7254901960784313
My GNB: 0.7254901960784313

Swimming.csv
SKLearn GNB: 0.6
My GNB: 0.6

vehicle.csv
SKLearn GNB: 0.42080378250591016
My GNB: 0.42080378250591016

wine.csv
SKLearn GNB: 0.9662921348314607
My GNB: 0.9662921348314607
```