In [458]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
from math import sqrt
from math import pi
from math import exp


In [459]:
data = arff.loadarff('./hypothyroid.arff')
df = pd.DataFrame(data[0])
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df.shape

(3772, 30)

In [460]:
df = df.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,29]]
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.3,negative
1,23.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,4.1,negative
2,46.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.98,negative
3,70.0,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,0.16,negative
4,70.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.72,negative


In [461]:
df['Class'].replace({'negative': 0, 'compensated_hypothyroid': 1, 'primary_hypothyroid': 1, 'secondary_hypothyroid': 1}, inplace=True)
# df.pop('referral source')
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.3,0
1,23.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,4.1,0
2,46.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.98,0
3,70.0,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,0.16,0
4,70.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.72,0


In [462]:
df = df.dropna()
for col in df.columns:
    df = df.drop(df[df[col] == '?'].index)
df.shape

(3269, 18)

In [463]:
test = df.sample(50)
df = df.drop(test.index)
target = test.pop('Class')
test.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH
2344,65.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.1
3366,65.0,F,f,f,f,f,f,f,f,f,t,f,f,f,f,f,0.02
2763,55.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.91
267,76.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,3.9
1336,72.0,M,f,f,f,t,f,f,f,f,f,f,f,f,f,f,5.4


In [464]:
a = df.sample(100)
b = df.sample(200)
c = df.sample(300)

In [465]:
def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)


def calculate_probability(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2)))
	return (1 / (sqrt(2 * pi * stdev**2))) * exponent


In [466]:
def get_class_counts(data, i):
    counts = {}
    for row in data:
        label = row[i]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts


class_counts = get_class_counts(df.values, 1)
class_counts

{'F': 2189, 'M': 1030}

In [467]:
def get_probabilities(data, row):
    # print(row)
    results = []
    probability = {}
    for column in data.columns: 
        if column != 'Class':
            # print(column)
            probability[column] = {"attr": column}
            for label in data['Class'].unique():
                probability[column][label] = 1
                if(type(row[column]) == float or isinstance(row[column], np.floating)):
                    probability[column][label] *= calculate_probability(row[column], data[column][data['Class'] == label].mean(), data[column][data['Class'] == label].std())
                    # print(label, ': ' ,probability[column][label])
                else:
                    probability[column][label] *= data[column][data[column] == row[column]][data['Class'] == label].count() / len(data[column][data['Class'] == label])
                    # print(label, ': ',data[column][data[column] == row[column]][data['Class'] == label].count(), '/', len(data[column][data['Class'] == label]))
                class_counts = get_class_counts(data.values, -1)
                probability[column][label] *= class_counts[label] / len(data)
                # print(class_counts[label] / len(data))
            results.append(probability[column])
    return results

get_probabilities(a, test.iloc[0])

[{'attr': 'age', 0: 0.014076035152823487, 1: 0.0014893967351206924},
 {'attr': 'sex', 0: 0.21000000000000002, 1: 0.03},
 {'attr': 'on thyroxine', 0: 0.7200000000000001, 1: 0.1},
 {'attr': 'query on thyroxine', 0: 0.9, 1: 0.1},
 {'attr': 'on antithyroid medication', 0: 0.89, 1: 0.1},
 {'attr': 'sick', 0: 0.85, 1: 0.1},
 {'attr': 'pregnant', 0: 0.89, 1: 0.1},
 {'attr': 'thyroid surgery', 0: 0.88, 1: 0.1},
 {'attr': 'I131 treatment', 0: 0.89, 1: 0.1},
 {'attr': 'query hypothyroid', 0: 0.8600000000000001, 1: 0.08000000000000002},
 {'attr': 'query hyperthyroid', 0: 0.8600000000000001, 1: 0.08000000000000002},
 {'attr': 'lithium', 0: 0.9, 1: 0.1},
 {'attr': 'goitre', 0: 0.9, 1: 0.1},
 {'attr': 'tumor', 0: 0.88, 1: 0.1},
 {'attr': 'hypopituitary', 0: 0.9, 1: 0.1},
 {'attr': 'psych', 0: 0.81, 1: 0.1},
 {'attr': 'TSH', 0: 0.14779052873964563, 1: 0.0006515470613405417}]

In [468]:
def evaluate_algorithm(data, test):
    predictions = {'True': 0, 'False': 0}
    for i in range(test.shape[0]):
        probabilities = get_probabilities(data, test.iloc[i])
        # print(probabilities)
        best_label, best_prob = None, -1
        for label in data['Class'].unique():
            probability = 1
            for attr in probabilities:
                probability *= attr[label]
            if best_label is None or probability > best_prob:
                best_prob = probability
                best_label = label
        # print('Expected %d, Got %d.' % (target.iloc[i], best_label))
        if(target.iloc[i] == best_label):
            predictions['True'] += 1
        else:
            predictions['False'] += 1
    return predictions

In [469]:
evaluate_algorithm(a, test)

{'True': 46, 'False': 4}

In [470]:
evaluate_algorithm(b, test)

{'True': 45, 'False': 5}

In [471]:
evaluate_algorithm(c, test)

{'True': 46, 'False': 4}

In [472]:
evaluate_algorithm(df, test)

{'True': 45, 'False': 5}

In [473]:
evaluate_algorithm(df.sample(5), test)

{'True': 45, 'False': 5}