In [None]:
!pip3 install scikit-learn
!pip3 install pandas

In [None]:
from files import get_onehot_dataset
import pandas as pd

# Load the raw dataset and separate the target column from the features.
input_file = "cardio_train.csv"
df1 = pd.read_csv(input_file, header=0)
Y = df1[["cardio"]]
X = df1.loc[:, df1.columns != 'cardio']
df_num = X.select_dtypes(exclude='object')

# All statistics below are computed on the first `fold_size` rows only.
fold_size = 20

# Numeric summary: id, smoke, alco and active are excluded from std/mean.
fold_num = df_num.iloc[:fold_size].drop(["id", "smoke", "alco", "active"], axis=1)
print("std for each column is for a specific fold:")
print(fold_num.std())
print("mean for each column is for a specific fold:")
print(fold_num.mean())

# Relative frequency of each value per column. `normalize=True` is passed
# explicitly; the previous positional arguments ("women", "normal", 1, ...)
# were merely truthy values that pandas interpreted as `normalize`/`sort`.
# NOTE(review): assumes the CSV really has an "above" column - confirm
# against the file header.
print("probability of occurrence")
fold = df1.iloc[:fold_size]
for column in ["gender", "above", "gluc", "smoke", "alco", "active", "cardio"]:
    print(f"{column}:")
    print(fold[column].value_counts(normalize=True))

In [None]:
from sklearn.model_selection import train_test_split

# Build the one-hot encoded dataset and split it 80/20 into train and test
# sets with a fixed seed, ending up with plain NumPy arrays.
df = get_onehot_dataset()
Y = df[["cardio"]]
feature_mask = df.columns != 'cardio'
X = df.loc[:, feature_mask]

splits = train_test_split(X, Y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = (part.to_numpy() for part in splits)

# The targets come out as single-column 2-D arrays; flatten them to 1-D.
y_train, y_test = y_train.ravel(), y_test.ravel()

In [None]:
from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB

# Search over the number of CV splits (2, 12, ..., 62) for both a plain and a
# stratified k-fold splitter, remembering the split count with the highest
# mean test score for each. On ties the later (larger) k wins.
model = GaussianNB()
best_kf, best_kf_value = 2, 0
best_st, best_st_value = 2, 0

for k in range(2, 72, 10):
    # Plain k-fold; the empty param_grid means a single candidate is scored.
    kf = KFold(n_splits=k)
    gs = GridSearchCV(model, cv=kf, param_grid={}, return_train_score=True)
    gs.fit(x_train, y_train)
    kf_score = gs.cv_results_['mean_test_score'][0]
    if kf_score >= best_kf_value:
        best_kf_value, best_kf = kf_score, k

    # Stratified k-fold with the same number of splits.
    stf = StratifiedKFold(n_splits=k)
    gs = GridSearchCV(model, cv=stf, param_grid={}, return_train_score=True)
    gs.fit(x_train, y_train)
    st_score = gs.cv_results_['mean_test_score'][0]
    if st_score >= best_st_value:
        best_st_value, best_st = st_score, k

summary = (
    ("Best k fold:", best_kf),
    ("Accuracy of best k fold:", best_kf_value),
    ("Best stratified fold:", best_st),
    ("Accuracy of best stratified fold:", best_st_value),
)
for label, value in summary:
    print(label, value)

In [None]:
from sklearn.model_selection import cross_val_score

# Re-run cross-validation with whichever splitter achieved the higher mean
# test score in the fold-count search (best_kf/best_st and their scores).
# The previous conditional was inverted: it picked KFold when the stratified
# score was higher and StratifiedKFold otherwise. Ties still resolve to
# StratifiedKFold, as before.
if best_st_value >= best_kf_value:
    best_kf_cv = StratifiedKFold(n_splits=best_st)
else:
    best_kf_cv = KFold(n_splits=best_kf)

model = GaussianNB()
gs = GridSearchCV(model, cv=best_kf_cv, param_grid={}, return_train_score=True)
gs.fit(x_train, y_train)
# Last expression of the cell: display the full cross-validation results.
gs.cv_results_

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the estimator selected by the grid search on the held-out test set.
best_model = gs.best_estimator_
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("accuracy of model is:", test_accuracy)

In [None]:
# Compare test accuracy of Gaussian naive Bayes under different class priors.
# `None` leaves the priors at the estimator's default.
prior_settings = [
    ("priors: default; accuracy of model:", None),
    ("priors: 0.7, 0.3; accuracy of model:", [0.7, 0.3]),
    ("priors: 0.6, 0.4; accuracy of model:", [0.6, 0.4]),
]

for label, priors in prior_settings:
    model = GaussianNB(priors=priors)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(label, accuracy_score(y_test, y_pred))

In [None]:
# Compare test accuracy of Gaussian naive Bayes under different variance
# smoothing values. An empty kwargs dict keeps the estimator's default.
smoothing_settings = [
    ("var_smoothing: default; accuracy of model:", {}),
    ("var_smoothing:0.00001; accuracy of model:", {"var_smoothing": 0.00001}),
    ("var_smoothing: 0.01; accuracy of model:", {"var_smoothing": 0.01}),
]

for label, kwargs in smoothing_settings:
    model = GaussianNB(**kwargs)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(label, accuracy_score(y_test, y_pred))