In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.naive_bayes import GaussianNB

In [2]:
train_df = pd.read_csv('hayes-roth.data')
train_df.head()

Unnamed: 0,92,2,1,1.1,2.1,1.2
0,10,2,1,3,2,2
1,83,3,1,4,1,3
2,61,2,4,2,2,3
3,107,1,1,3,4,3
4,113,1,1,3,2,2


In [3]:
train_df.to_csv('hayes_roth.csv', index = False)

In [4]:
### hayes - roth 
names = 'name, hobby, age, education level, marital status, class'.split(', ')

# train data
train_df = pd.read_csv('hayes-roth.data', header = None, names = names)

# test data
test_df = pd.read_csv('hayes-roth.test', header = None, names = 'hobby, age, education level, marital status, class'.split(', '))


train_df.drop('name', axis = 1, inplace = True)
X_train = train_df.iloc[:, : -1]
y_train = train_df.iloc[:, -1]

X_test = test_df.iloc[:, : -1]
y_test = test_df.iloc[:, -1]

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)


y_pred  =  classifier.predict(X_test)


from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[ 7  3  4]
 [ 0 10  3]
 [ 0  0  1]]


### Hypothesis Testing

In [5]:
from scipy import stats

def check_normality(data):
    test_stat_normality, p_value_normality=stats.shapiro(data)
    print("p value:%.4f" % p_value_normality)
    if p_value_normality <0.05:
        print("Reject null hypothesis >> The data is not normally distributed")
    else:
        print("Fail to reject null hypothesis >> The data is normally distributed") 
        
def check_variance_homogeneity(group1, group2):
    test_stat_var, p_value_var= stats.levene(group1,group2)
    print("p value:%.4f" % p_value_var)
    if p_value_var <0.05:
        print("Reject null hypothesis >> The variances of the samples are different.")
    else:
        print("Fail to reject null hypothesis >> The variances of the samples are same.")



In [6]:
# check normality for every column:

for column in train_df.columns:
    arr = np.array(train_df[column].tolist())
    print(column, ' normality : ')
    print()
    check_normality(arr)
    print()
    
    
ttest,p_value = stats.ttest_ind(np.array(train_df['age']), np.array(train_df['class']))
print("p value:%.8f" % p_value)
print("since the hypothesis is one sided >> use p_value/2 >> p_value_one_sided:%.4f" %(p_value/2))
if p_value/2 <0.05:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis") 

hobby  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

age  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

education level  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

marital status  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

class  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

p value:0.28603684
since the hypothesis is one sided >> use p_value/2 >> p_value_one_sided:0.1430
Fail to reject null hypothesis


In [7]:
from sklearn.model_selection import KFold, cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = GaussianNB()
accuracies = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1)
print(*["Fold {}: {}".format(i+1,accuracy) for i,accuracy in enumerate(accuracies)],sep="\n")
print()
print("average accuracy :",np.mean(accuracies))
print()
print("average std :",np.std(accuracies))
clf.fit(X_train, y_train)

print("test accuracy :",clf.score(X_test,y_test))

Fold 1: 0.7142857142857143
Fold 2: 0.6428571428571429
Fold 3: 0.6153846153846154
Fold 4: 0.7692307692307693
Fold 5: 0.6153846153846154
Fold 6: 0.8461538461538461
Fold 7: 0.5384615384615384
Fold 8: 0.8461538461538461
Fold 9: 0.5384615384615384
Fold 10: 0.38461538461538464

average accuracy : 0.6510989010989011

average std : 0.13894147590914188
test accuracy : 0.6428571428571429


In [8]:
frames = [train_df, test_df]

result = pd.concat(frames)
display(result)
result.to_csv('hayes_roth.csv', index=False)

Unnamed: 0,hobby,age,education level,marital status,class
0,2,1,1,2,1
1,2,1,3,2,2
2,3,1,4,1,3
3,2,4,2,2,3
4,1,1,3,4,3
...,...,...,...,...,...
23,1,3,1,2,2
24,1,1,1,1,1
25,1,2,2,2,2
26,1,3,3,3,1


In [9]:
# Car Dataset

df_car = pd.read_csv('car.data', header = None, names = 'buying, maint, doors, persons, lug_boot, safety, target'.split(', '))

df_car.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [10]:
from sklearn import preprocessing


# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
for column in 'buying, maint, doors, persons, lug_boot, safety, target'.split(', '):
    df_car[column]= label_encoder.fit_transform(df_car[column])

X = df_car.iloc[:, : -1]    
y = df_car.iloc[:, -1]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Hypothesis tests for Car dataset

In [11]:
# check normality for every column:

for column in df_car.columns:
    arr = np.array(df_car[column].tolist())
    print(column, ' normality : ')
    print()
    check_normality(arr)
    print()
    
    
ttest,p_value = stats.ttest_ind(np.array(df_car['safety']), np.array(df_car['target']))
print("p value:%.8f" % p_value)
print("since the hypothesis is one sided >> use p_value/2 >> p_value_one_sided:%.4f" %(p_value/2))
if p_value/2 <0.05:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis") 

buying  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

maint  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

doors  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

persons  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

lug_boot  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

safety  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

target  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

p value:0.00000000
since the hypothesis is one sided >> use p_value/2 >> p_value_one_sided:0.0000
Reject null hypothesis


In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [13]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = GaussianNB()
accuracies = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1)
print(*["Fold {}: {}".format(i+1,accuracy) for i,accuracy in enumerate(accuracies)],sep="\n")
print()
print("average accuracy :",np.mean(accuracies))
print()
print("average std :",np.std(accuracies))
clf.fit(X_train, y_train)

print("test accuracy :",clf.score(X_test,y_test))

Fold 1: 0.6258992805755396
Fold 2: 0.5683453237410072
Fold 3: 0.717391304347826
Fold 4: 0.6159420289855072
Fold 5: 0.5942028985507246
Fold 6: 0.6594202898550725
Fold 7: 0.644927536231884
Fold 8: 0.5434782608695652
Fold 9: 0.6014492753623188
Fold 10: 0.6811594202898551

average accuracy : 0.62522156188093

average std : 0.04971207669132962
test accuracy : 0.6127167630057804


In [14]:
# Breast cancer

df_breast = pd.read_csv('breast-cancer.data', header = None, names = 'Class, age, menopause, tumor-size, inv-nodes, node-caps, deg-malig, breast, breast-quad, irradiat'.split(', '))

df_breast.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [15]:
for i in range(df_breast.shape[1]):
    print(df_breast.columns[i], df_breast.iloc[:, i].unique())

Class ['no-recurrence-events' 'recurrence-events']
age ['30-39' '40-49' '60-69' '50-59' '70-79' '20-29']
menopause ['premeno' 'ge40' 'lt40']
tumor-size ['30-34' '20-24' '15-19' '0-4' '25-29' '50-54' '10-14' '40-44' '35-39'
 '5-9' '45-49']
inv-nodes ['0-2' '6-8' '9-11' '3-5' '15-17' '12-14' '24-26']
node-caps ['no' 'yes' '?']
deg-malig [3 2 1]
breast ['left' 'right']
breast-quad ['left_low' 'right_up' 'left_up' 'right_low' 'central' '?']
irradiat ['no' 'yes']


In [16]:
# we replace '?' with zero in node-caps column

df_breast["node-caps"].replace({"?": '0'}, inplace=True)
df_breast["node-caps"].unique()

array(['no', 'yes', '0'], dtype=object)

In [17]:
df_breast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [18]:
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
for column in 'Class, age, menopause, tumor-size, inv-nodes, node-caps, breast, breast-quad, irradiat'.split(', '):
    df_breast[column]= label_encoder.fit_transform(df_breast[column])


In [19]:
# check normality for every column:

for column in df_breast.columns:
    arr = np.array(df_breast[column].tolist())
    print(column, ' normality : ')
    print()
    check_normality(arr)
    print()
    
    
ttest,p_value = stats.ttest_ind(np.array(df_breast['irradiat']), np.array(df_breast['Class']))
print("p value:%.8f" % p_value)
print("since the hypothesis is one sided >> use p_value/2 >> p_value_one_sided:%.4f" %(p_value/2))
if p_value/2 <0.05:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis") 

Class  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

age  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

menopause  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

tumor-size  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

inv-nodes  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

node-caps  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

deg-malig  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

breast  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

breast-quad  normality : 

p value:0.0000
Reject null hypothesis >> The data is not normally distributed

irradiat  normality : 

p value:0.0000
Reject null hypothesis >> The data 

In [20]:
    
X = df_breast.iloc[:, 1:]    
y = df_breast.iloc[:, 0]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = GaussianNB()
accuracies = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1)
print(*["Fold {}: {}".format(i+1,accuracy) for i,accuracy in enumerate(accuracies)],sep="\n")
print()
print("average accuracy :",np.mean(accuracies))
print()
print("average std :",np.std(accuracies))
clf.fit(X_train, y_train)

print("test accuracy :",clf.score(X_test,y_test))

Fold 1: 0.6956521739130435
Fold 2: 0.782608695652174
Fold 3: 0.6956521739130435
Fold 4: 0.8260869565217391
Fold 5: 0.6086956521739131
Fold 6: 0.782608695652174
Fold 7: 0.6956521739130435
Fold 8: 0.782608695652174
Fold 9: 0.5454545454545454
Fold 10: 0.7272727272727273

average accuracy : 0.7142292490118577

average std : 0.08207745671200403
test accuracy : 0.6896551724137931
