In [102]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [103]:
df = pd.read_csv("heart.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [104]:
#Preprocessing
def onehot(ser, num_classes=None):
    """
    One-hot encode the series.
    Example: 
    >>> onehot([1, 0, 2], 3)
    array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])
    """
    if num_classes == None:
        num_classes = len(np.unique(ser))
    return np.identity(num_classes)[ser]

new_col_names = []
need_encode_col = ["restecg", "thal", "slope", "cp"]
no_encode_col = [col for col in df.columns if col not in need_encode_col]
new_df = df[no_encode_col]
for col in need_encode_col:
    num_classes = len(df[col].unique())
    new_col_names = [f"{col}_{i}" for i in range(num_classes)]
    encoded = pd.DataFrame(onehot(df[col], num_classes), columns=new_col_names, dtype=int)
    new_df = pd.concat([new_df, encoded], axis=1)


In [105]:
new_df = new_df[ 160:170] 
new_df.head(50)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,restecg_0,restecg_1,restecg_2,thal_0,thal_1,thal_2,thal_3,slope_0,slope_1,slope_2,cp_0,cp_1,cp_2,cp_3
160,56,1,120,240,0,169,0,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0
161,55,0,132,342,0,166,0,1.2,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0
162,41,1,120,157,0,182,0,0.0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0
163,38,1,138,175,0,173,0,0.0,4,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0
164,38,1,138,175,0,173,0,0.0,4,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0
165,67,1,160,286,0,108,1,1.5,3,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0
166,67,1,120,229,0,129,1,2.6,2,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0
167,62,0,140,268,0,160,0,3.6,2,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0
168,63,1,130,254,0,147,0,1.4,1,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0
169,53,1,140,203,1,155,1,3.1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0


In [106]:
data_cols = [col for col in new_df.columns if col != "target"]

print(data_cols)

['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'restecg_0', 'restecg_1', 'restecg_2', 'thal_0', 'thal_1', 'thal_2', 'thal_3', 'slope_0', 'slope_1', 'slope_2', 'cp_0', 'cp_1', 'cp_2', 'cp_3']


In [107]:

X = new_df.drop(['target'],axis='columns')
X.head(3)
X.shape

(10, 23)

In [108]:

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

new_df_shfl = shuffle(new_df, random_state=443)
# define X and y
X = new_df_shfl[data_cols].values
y = new_df_shfl["target"].values
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=80)

In [109]:
import numpy as np

class GaussianNBClassifier:
    def __init__(self):
        pass


    def separate_classes(self, X, y):
        separated_classes = {}
        for i in range(len(X)):
            feature_values = X[i]
            class_name = y[i]
            if class_name not in separated_classes:
                separated_classes[class_name] = []
            separated_classes[class_name].append(feature_values)
        return separated_classes

    def summarize(self, X):
        for feature in zip(*X):
            yield {
                'stdev' : np.std(feature),
                'mean' : np.mean(feature)
            }

    def gauss_distribution_function(self, x, mean, stdev):
        exponent = np.exp(-((x-mean)**2 / (2*stdev**2)))
        return exponent / (np.sqrt(2*np.pi)*stdev)


    def fit(self, X, y):
        separated_classes = self.separate_classes(X, y)
        print("separated_classes",separated_classes)
        self.class_summary = {}
        for class_name, feature_values in separated_classes.items():
            self.class_summary[class_name] = {
                'prior_proba': len(feature_values)/len(X),
                'summary': [i for i in self.summarize(feature_values)],
            }   

        print("self.class_summary",self.class_summary)  
        return self.class_summary

    def predict(self, X):
        MAPs = []
        for row in X:
            joint_proba = {}
            for class_name, features in self.class_summary.items():
                total_features = len(features['summary'])
                likelihood = 1
                for idx in range(total_features):
                    feature = row[idx]
                    mean = features['summary'][idx]['mean']
                    stdev = features['summary'][idx]['stdev']
                    normal_proba = self.gauss_distribution_function(feature, \
                    mean, stdev)
                    likelihood *= normal_proba
                prior_proba = features['prior_proba']
                joint_proba[class_name] = prior_proba * likelihood
            MAP = max(joint_proba, key=joint_proba.get)
            MAPs.append(MAP)
        return MAPs

    def accuracy(self, y_test, y_pred):
          true_true = 0
          for y_t, y_p in zip(y_test, y_pred):
              if y_t == y_p:
                  true_true += 1
          return true_true / len(y_test)


In [110]:
model = GaussianNBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print ("GaussianNBClassifier accuracy: {0:.3f}".format(model.accuracy(y_test, y_pred)))

separated_classes {0: [array([ 67. ,   1. , 160. , 286. ,   0. , 108. ,   1. ,   1.5,   3. ,
         1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   1. ,
         0. ,   1. ,   0. ,   0. ,   0. ]), array([ 67. ,   1. , 120. , 229. ,   0. , 129. ,   1. ,   2.6,   2. ,
         1. ,   0. ,   0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,
         0. ,   1. ,   0. ,   0. ,   0. ]), array([ 63. ,   1. , 130. , 254. ,   0. , 147. ,   0. ,   1.4,   1. ,
         1. ,   0. ,   0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,
         0. ,   1. ,   0. ,   0. ,   0. ]), array([ 62. ,   0. , 140. , 268. ,   0. , 160. ,   0. ,   3.6,   2. ,
         1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,
         0. ,   1. ,   0. ,   0. ,   0. ])], 1: [array([ 38.,   1., 138., 175.,   0., 173.,   0.,   0.,   4.,   0.,   1.,
         0.,   0.,   0.,   1.,   0.,   0.,   0.,   1.,   0.,   0.,   1.,
         0.]), array([ 56.,   1., 120., 240.,   0., 169.,   0.,   0.,   0.,   0.,   1.,
  