In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
from math import sqrt
from math import pi
from math import exp


In [14]:
data = arff.loadarff('./hypothyroid.arff')
df = pd.DataFrame(data[0])
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

In [15]:
df = df.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,29]]
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.3,negative
1,23.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,4.1,negative
2,46.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.98,negative
3,70.0,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,0.16,negative
4,70.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.72,negative


In [16]:
df['Class'].replace({'negative': 0, 'compensated_hypothyroid': 1, 'primary_hypothyroid': 1, 'secondary_hypothyroid': 1}, inplace=True)
# df.pop('referral source')
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,Class
0,41.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,1.3,0
1,23.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,4.1,0
2,46.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.98,0
3,70.0,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,0.16,0
4,70.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,0.72,0


In [17]:
test = df.sample(50)
df = df.drop(test.index)

In [18]:
a = df.sample(100)
b = df.sample(200)
c = df.sample(300)

In [19]:
def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)


def calculate_probability(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2)))
	return (1 / (sqrt(2 * pi * stdev**2))) * exponent


In [20]:
def get_class_counts(data):
    counts = {}
    for row in data:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

class_counts = get_class_counts(test.values)
class_counts

{0: 48, 1: 2}

In [21]:
def get_attr_counts(column_name, data):
    count = {}
    for attr in data[column_name].unique():
        count[attr] = {}
        for label in data['Class'].unique():
            count[attr][label] = len(data[(data[column_name] == attr) & (data['Class'] == label)])
    return count

get_attr_counts('sex', test)

{'M': {0: 11, 1: 1}, 'F': {0: 35, 1: 1}, '?': {0: 2, 1: 0}}

In [35]:
def predict(data, row):
    results = []
    probability = {}
    for column in data.columns:
        if column != 'Class':
            probability[column] = {"attr" : column}
            for label in data['Class'].unique():
                probability[column][label] = 1
                if(type(row[column]) == float and row[column] != 0):
                    probability[column][label] *= calculate_probability(row[column], data[(data[column] == attr) & (
                        data['Class'] == label)][column].mean(), data[(data[column] == attr) & (data['Class'] == label)][column].std())
                else:
                    probability[column][label] *= data[(data[column] == row[column]) & (
                        data['Class'] == label)][column].count() / data[data['Class'] == label][column].count()
                probability[column][label] *= data[data['Class'] == label][column].count() / data[column].count()
            results.append(probability[column])
    return results

predict(test, test.iloc[0])

[{'attr': 'age', 0: 0.019999999999999997, 1: 0.0},
 {'attr': 'sex', 0: 0.039999999999999994, 1: 0.0},
 {'attr': 'on thyroxine', 0: 0.92, 1: 0.04},
 {'attr': 'query on thyroxine', 0: 0.96, 1: 0.04},
 {'attr': 'on antithyroid medication', 0: 0.94, 1: 0.04},
 {'attr': 'sick', 0: 0.039999999999999994, 1: 0.0},
 {'attr': 'pregnant', 0: 0.96, 1: 0.04},
 {'attr': 'thyroid surgery', 0: 0.8799999999999999, 1: 0.04},
 {'attr': 'I131 treatment', 0: 0.8999999999999999, 1: 0.04},
 {'attr': 'query hypothyroid', 0: 0.92, 1: 0.02},
 {'attr': 'query hyperthyroid', 0: 0.92, 1: 0.04},
 {'attr': 'lithium', 0: 0.96, 1: 0.04},
 {'attr': 'goitre', 0: 0.96, 1: 0.04},
 {'attr': 'tumor', 0: 0.96, 1: 0.04},
 {'attr': 'hypopituitary', 0: 0.96, 1: 0.04},
 {'attr': 'psych', 0: 0.8999999999999999, 1: 0.04},
 {'attr': 'TSH', 0: 0.021276595744680854, 1: 0.0}]

In [92]:
def p(data, row):
    probability = []
    for label in data['Class'].unique():
        probability.append(label)
        probability[label] = 1
        for column in data.columns:
            if column != 'Class' and row[column] != '?':
                if(isinstance(row[column], np.floating)):
                    prob = calculate_probability(
                    row[column], data[column].mean(), data[column].std())
                    if(prob > 0):
                        print(probability[label])
                        probability[label] *= prob
                else:
                    probability[label] *= data[(data[column] == row[column]) & (data['Class'] == label)].count() / data[data['Class'] == label].count()
        probability[label] *= get_class_counts(test.values)[label] / data['Class'].count()
    return probability

si = p(test, test.iloc[0])

1
age                          0.000095
sex                          0.000095
on thyroxine                 0.000095
query on thyroxine           0.000095
on antithyroid medication    0.000095
sick                         0.000095
pregnant                     0.000095
thyroid surgery              0.000095
I131 treatment               0.000095
query hypothyroid            0.000095
query hyperthyroid           0.000095
lithium                      0.000095
goitre                       0.000095
tumor                        0.000095
hypopituitary                0.000095
psych                        0.000095
TSH                          0.000098
Class                        0.000095
dtype: float64
1
age                          0.0
sex                          0.0
on thyroxine                 0.0
query on thyroxine           0.0
on antithyroid medication    0.0
sick                         0.0
pregnant                     0.0
thyroid surgery              0.0
I131 treatment               0.0


In [12]:
def naive_bayes(train, test):
    predictions = []
    for row in test.values:
        output = predict(train, row)
        predictions.append(output)
    return(predictions)

In [13]:
prueba = [125,100,70,120,95,60,220,85,75,90]
prueba = pd.DataFrame(prueba)
prueba.mean()
stdev(prueba.loc[:,0])**2
prueba.std()**2

0    2082.222222
dtype: float64

In [14]:
calculate_probability(120, prueba.mean(), prueba.std())

# calculate_probability(120, mean(prueba.iloc[:, 0]), stdev(prueba.iloc[:, 0]))

# test.loc[:,'TSH'].mean()
# test.loc[:,'TSH'].std()

0.008221465444217167

In [15]:
#Calcualting probabilites for inputs independantly
def get_probabilities(column_name, data):
    n = len(data) 
    column = data[column_name] #isolate targetted column
    column = column.value_counts() #get counts of occurences of each input variable
    return (column/n) #return probiblity of occurence by dividing with total no. of data points


def P(attribute, class_value, data):
    column = data[attribute]
    column = len(column)
    

#calculating conditional probability
def get_conditional_probabilities(data, n, target, given):
    # isolate target column an dfocus input column

    focused_data = data[[target, given]]
    targets_unique = data[target].unique()  # list of unique outputs in data
    inputs_unique = data[given].unique()
    groups = focused_data.groupby(by=[given, target]).size().reset_index()
    groups[0] = groups[0] / n

    for targets in targets_unique:
        current_target_length = len(
            focused_data[focused_data[target] == targets])
        groups[0] = np.where(groups[target] == targets,
                            groups[0].div(current_target_length), groups[0])

    return groups


In [16]:
P('sex', 'Class', test)

In [17]:
get_probabilities('Class', test)
get_conditional_probabilities(test, 50, 'Class', 'sex')

Unnamed: 0,sex,Class,0
0,?,0,0.000833
1,F,0,0.01375
2,F,1,0.01
3,M,0,0.005417
4,M,1,0.01


In [18]:
def get_probabilities_attr(data, target, given):
    for column in data.columns:
        for attr in data[column].unique():
            countAttr = len(data[data[column]])
            for target in data[target].unique():
                countTarget = len(data[data['Class']])
                print(countAttr/countTarget)

get_probabilities_attr(test, 1, 'sex')


KeyError: "None of [Float64Index([23.0, 31.0, 65.0, 69.0, 55.0, 33.0, 74.0, 62.0, 77.0, 74.0, 64.0,\n              58.0, 29.0, 59.0, 45.0, 63.0, 59.0, 76.0, 71.0, 73.0, 52.0, 39.0,\n              66.0, 77.0, 57.0, 68.0, 83.0, 30.0, 42.0, 57.0, 70.0, 20.0, 52.0,\n              82.0, 75.0, 78.0, 15.0, 78.0, 89.0, 28.0, 77.0, 48.0, 42.0, 56.0,\n              53.0, 73.0, 64.0, 69.0, 84.0, 78.0],\n             dtype='float64')] are in the [columns]"