In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
from math import sqrt
from math import pi
from math import exp


In [68]:
data = arff.loadarff('./hypothyroid.arff')
df = pd.DataFrame(data[0])
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df.shape

(3772, 30)

In [69]:
df = df.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,29]]
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.3,negative
1,23.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,4.1,negative
2,46.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.98,negative
3,70.0,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,0.16,negative
4,70.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.72,negative


In [70]:
df.replace({'F': 0, 'M': 1, 'f': 0, 'm': 1, 't': 1, 'f': 0,
               'y': 1, 'n': 0, 'yes': 1, 'no': 0}, inplace=True)
df.head()


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.3,negative
1,23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.1,negative
2,46.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.98,negative
3,70.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.16,negative
4,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.72,negative


In [71]:
df['Class'].replace({'negative': 0, 'compensated_hypothyroid': 1, 'primary_hypothyroid': 1, 'secondary_hypothyroid': 1}, inplace=True)
# df.pop('referral source')
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.3,0
1,23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.1,0
2,46.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.98,0
3,70.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.16,0
4,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.72,0


In [72]:
df = df.dropna()
for col in df.columns:
    df = df.drop(df[df[col] == '?'].index)
df.shape

(3269, 18)

In [73]:
test = df.sample(50)
df = df.drop(test.index)

In [74]:
a = df.sample(100)
b = df.sample(200)
c = df.sample(300)

In [75]:
def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)


def calculate_probability(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2)))
	return (1 / (sqrt(2 * pi * stdev**2))) * exponent


In [76]:
inputs = df.drop('Class', axis='columns')
target = df['Class']

In [77]:
target.iloc[target[target == 0]]

test['age'][test['Class'] == 1].mean()

46.5

In [78]:
def get_class_counts(data):
    counts = {}
    for row in data:
        label = row[1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


class_counts = get_class_counts(df.values)
class_counts

{0: 2185, 1: 1034}

In [81]:
def predict(data, row):
    print(row)
    results = []
    probability = {}
    class_counts = get_class_counts(data.values)
    for column in data.columns: 
        if column != 'Class':
            print(column)
            probability[column] = {"attr": column}
            for label in data['Class'].unique():
                probability[column][label] = 1
                if(type(row[column]) == float or isinstance(row[column], np.floating)):
                    probability[column][label] *= calculate_probability(row[column], data[column][data['Class'] == label].mean(), data[column][data['Class'] == label].std())
                    print(label, ': ' ,probability[column][label])
                else:
                    probability[column][label] *= data[column][data[column] == row[column]][data['Class'] == label].count() / len(data[column][data['Class'] == label])
                    print(label, ': ',data[column][data[column] == row[column]][data['Class'] == label].count(), '/', len(data[column][data['Class'] == label]))
            results.append(probability[column])
    return results

predict(test, test.iloc[0])

age                          33.0
sex                             1
on thyroxine                    0
query on thyroxine              0
on antithyroid medication       0
sick                            0
pregnant                        0
thyroid surgery                 0
I131 treatment                  0
query hypothyroid               0
query hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           1
TSH                           1.0
Class                           0
Name: 1846, dtype: object
age
0 :  0.008527142971661638
1 :  0.016906659270793534
sex
0 :  14 / 44
1 :  1 / 6
on thyroxine
0 :  35 / 44
1 :  5 / 6
query on thyroxine
0 :  44 / 44
1 :  6 / 6
on antithyroid medication
0 :  44 / 44
1 :  6 / 6
sick
0 :  43 / 44
1 :  6 / 6
pregnant
0 :  44 / 44
1 :  6 / 6
thyroid surgery
0 :  43 / 44
1 :  6 / 6
I131 treatment
0 :  43 / 44
1 :  6 / 6
query h

[{'attr': 'age', 0: 0.008527142971661638, 1: 0.016906659270793534},
 {'attr': 'sex', 0: 0.3181818181818182, 1: 0.16666666666666666},
 {'attr': 'on thyroxine', 0: 0.7954545454545454, 1: 0.8333333333333334},
 {'attr': 'query on thyroxine', 0: 1.0, 1: 1.0},
 {'attr': 'on antithyroid medication', 0: 1.0, 1: 1.0},
 {'attr': 'sick', 0: 0.9772727272727273, 1: 1.0},
 {'attr': 'pregnant', 0: 1.0, 1: 1.0},
 {'attr': 'thyroid surgery', 0: 0.9772727272727273, 1: 1.0},
 {'attr': 'I131 treatment', 0: 0.9772727272727273, 1: 1.0},
 {'attr': 'query hypothyroid', 0: 0.9090909090909091, 1: 0.8333333333333334},
 {'attr': 'query hyperthyroid', 0: 0.9772727272727273, 1: 0.8333333333333334},
 {'attr': 'lithium', 0: 0.9772727272727273, 1: 0.8333333333333334},
 {'attr': 'goitre', 0: 1.0, 1: 1.0},
 {'attr': 'tumor', 0: 1.0, 1: 1.0},
 {'attr': 'hypopituitary', 0: 1.0, 1: 1.0},
 {'attr': 'psych', 0: 0.022727272727272728, 1: 0.0},
 {'attr': 'TSH', 0: 0.2065777155425502, 1: 0.004383130617780932}]