# KNN implementation

In [25]:
import pandas as pd

# load the fruit table; the code further down reads its 'weight', 'sweetness level' and 'label' columns
df = pd.read_csv('fruits.csv')

def euclid(x, y):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    x and y are indexable pairs. Their components may be plain numbers or
    whole pandas columns, in which case the result is computed element-wise.
    """
    first_gap = x[0] - y[0]
    second_gap = x[1] - y[1]
    squared_sum = first_gap ** 2 + second_gap ** 2
    return squared_sum ** 0.5

def manhatten(x, y):
    """Return the Manhattan (city-block) distance between two 2-D points.

    x and y are indexable pairs; the result is the sum of the absolute
    differences of their first and second components.
    """
    first_gap = abs(x[0] - y[0])
    second_gap = abs(x[1] - y[1])
    return first_gap + second_gap

def minkowski(x, y, k):
    """Return the Minkowski distance of order k between two 2-D points.

    x and y are indexable pairs (numbers or whole pandas columns).
    k is the order of the norm: k=1 gives the Manhattan distance and
    k=2 the Euclidean distance. k must be non-zero, otherwise 1 / k raises
    ZeroDivisionError.
    """
    # the absolute value is part of the definition: without it a negative
    # difference raised to an odd k stays negative and the root is no longer
    # a valid (real, non-negative) distance
    first_gap = abs(x[0] - y[0])
    second_gap = abs(x[1] - y[1])
    return (first_gap ** k + second_gap ** k) ** (1 / k)

# query point to classify (weight, sweetness) and the number of neighbours that vote
n_weight = 165
n_sweet = 5.5
k = 3

# the distance function is applied to whole columns at once, so every row gets its
# distance to the query point in one call; the query point comes from the constants
# above so that changing them actually changes the result
df['distance'] = euclid((df['weight'], df['sweetness level']), (n_weight, n_sweet))

# nearest rows first
# NOTE(review): rows with equal distance are not tie-broken explicitly; when a tie
# straddles the k-th position the vote depends on the order sort_values leaves them in
df = df.sort_values(by='distance', ascending=True)
print(df)

# keep the k nearest entries (or every row when there are fewer than k);
# iloc slices by position and excludes the end position, so [:enteries] is exactly that many rows
enteries = min(k, len(df))
print(df.iloc[:enteries])

# count how many of the kept rows carry each label
counts = df.iloc[:enteries].value_counts('label')
print(counts)

# the most frequent label among the neighbours is the prediction
print(counts.idxmax())

   weight  sweetness level   label   distance
3     170                5  Orange   5.024938
4     160                6   Apple   5.024938
2     150                4  Orange  15.074813
0     180                7   Apple  15.074813
5     140                3  Orange  25.124689
1     200                6   Apple  35.003571
   weight  sweetness level   label   distance
3     170                5  Orange   5.024938
4     160                6   Apple   5.024938
2     150                4  Orange  15.074813
label
Orange    2
Apple     1
Name: count, dtype: int64
Orange


In [1]:
import pandas as pd
import numpy as np

# load the patient table and drop 'Patient ID' so it is never offered as a split attribute
p_data = pd.read_csv('patient_data.csv').drop(columns='Patient ID')
print(p_data)

class Node:
    """A single node of the decision tree.

    Attributes:
        head: the attribute this node tests to classify a patient
            (e.g. age, blood pressure, cholesterol).
        ans:  the decision (sick or healthy) to return; only meaningful when
            ``end`` is true.
        next: links to the child nodes, keyed by the values found in the
            column of the attribute stored in ``head``; there can be any
            number of links, from 1 to n.
        end:  true when the node is a leaf, meaning the traversal can stop
            and return ``ans``.
    """

    def __init__(self):
        # a fresh node is an unlabelled, childless, non-leaf node
        self.head = None
        self.ans = None
        self.next = {}
        self.end = False

class Decision_Tree:
    """ID3 decision tree built from a pandas DataFrame.

    Every column other than the decision column is treated as a categorical
    attribute: a node gets one branch per distinct value of the attribute it
    tests. The tree is made of Node objects; ``root`` is meant to hold the
    node returned by ``split`` for the full dataset.
    """

    # self.root is to store the root node of the tree
    def __init__(self):
        self.root = None

    def _entropy(self, labels):
        """Return the base-2 Shannon entropy of the values in ``labels``."""
        # value_counts only reports values that actually occur, so every
        # probability is > 0 and log2 never sees a zero
        probabilities = labels.value_counts() / len(labels)
        return -(probabilities * np.log2(probabilities)).sum()

    def info_gain(self, data, classifier, attribute):
        """Return the ID3 information gain of splitting ``data`` on ``attribute``.

        Args:
            data: DataFrame holding both columns.
            classifier: name of the decision column (the one to predict).
            attribute: name of the column considered for the split.

        Returns:
            entropy(data) minus the weighted sum of the entropies of the
            subsets obtained by fixing ``attribute`` to each of its values.
            Returns 0.0 for an empty ``data``.
        """
        total = len(data)
        if total == 0:
            return 0.0

        # entropy of the decision column over the entire dataset
        entropy_before = self._entropy(data[classifier])

        # weighted sum of the entropy inside each attribute-value subset
        entropy_after = 0
        for value in data[attribute].unique():
            subset = data[data[attribute] == value]
            # a value that matches no row (NaN never equals itself) carries no weight
            if len(subset) == 0:
                continue
            # the class probabilities must be taken *within* the subset,
            # i.e. P(class | attribute == value), which _entropy does by
            # normalising with the subset size
            weight = len(subset) / total
            entropy_after += weight * self._entropy(subset[classifier])

        # returning information gain
        return entropy_before - entropy_after

    def _leaf(self, answer):
        """Return a leaf Node that answers ``answer``."""
        leaf = Node()
        leaf.end = True
        leaf.ans = answer
        return leaf

    # for choosing the attribute that will split the dataset
    def split(self, data, classifier):
        """Recursively build the (sub)tree for ``data`` and return its top node.

        Args:
            data: DataFrame with the decision column and the remaining attributes.
            classifier: name of the decision column.

        Returns:
            A leaf Node when the decision column holds a single value or when
            no attribute is left to split on (the most frequent decision is
            used then); otherwise a Node whose ``head`` is the chosen
            attribute and whose ``next`` maps each attribute value to a subtree.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError('cannot build a decision tree from an empty dataset')

        # only one value in the decision column: nothing left to decide
        decisions = data[classifier].unique()
        if len(decisions) == 1:
            return self._leaf(decisions[0])

        # attributes exhausted but the rows still disagree: fall back to the
        # majority decision instead of failing on a missing attribute
        candidates = [column for column in data.columns if column != classifier]
        if not candidates:
            return self._leaf(data[classifier].mode().iloc[0])

        # pick the attribute with the highest information gain; among equally
        # informative attributes prefer the one with fewer distinct values,
        # because a many-valued column (almost one branch per row) yields a
        # tree that cannot answer for values it has not seen
        att = None
        best_gain = float('-inf')
        best_branches = None
        tolerance = 1e-12
        for column in candidates:
            gain = self.info_gain(data, classifier, column)
            branches = data[column].nunique()
            clearly_better = gain > best_gain + tolerance
            tied_but_simpler = abs(gain - best_gain) <= tolerance and branches < best_branches
            if att is None or clearly_better or tied_but_simpler:
                att = column
                best_gain = gain
                best_branches = branches

        ob = Node()
        ob.head = att
        # one link per value of the chosen attribute; each subtree is built
        # from the matching rows with the chosen attribute removed
        for value in data[att].unique():
            ndata = data[data[att] == value]
            if len(ndata) == 0:
                # a value that matches no row (NaN) cannot produce a subtree
                continue
            ndata = ndata.drop(att, axis=1)
            ob.next[value] = self.split(ndata, classifier)
        return ob

# build the tree from the patient table, predicting the 'Diagnosis' column,
# and keep the resulting top node as the tree's root
tree = Decision_Tree()
tree.root = tree.split(data=p_data, classifier='Diagnosis')

   Age Blood Pressure Cholesterol Diagnosis
0   30           High        High      Sick
1   45            Low      Normal   Healthy
2   50           High        High      Sick
3   35            Low      Normal   Healthy
4   60           High        High      Sick
5   55            Low      Normal   Healthy
6   40           High        High      Sick
7   25            Low      Normal   Healthy
8   65           High        High      Sick
9   45            Low      Normal   Healthy


In [42]:
# ask for the attribute values of the person to classify; input() returns each answer as a string
u_age = input('person age = ')
u_blood_pressure = input('person blood pressure = ')
u_cholesterol = input('person cholesterol = ')

# dfs traversal to reach leaf node.
def dfs(curr, age, blood, cholest):
    """Walk the tree from ``curr`` down to a leaf and return that leaf's answer.

    Args:
        curr: node to start from; it must expose ``end``, ``ans``, ``head`` and ``next``.
        age: value followed when a node tests 'Age'.
        blood: value followed when a node tests 'Blood Pressure'.
        cholest: value followed when a node tests any other attribute.

    Returns:
        The ``ans`` stored in the leaf that is reached.

    Raises:
        KeyError: if a node has no branch for the supplied value.
    """
    if curr.end:
        return curr.ans

    if curr.head == 'Age':
        value = age
    elif curr.head == 'Blood Pressure':
        value = blood
    else:
        value = cholest

    if value in curr.next:
        child = curr.next[value]
    else:
        # values typed by the user are strings, while the branch keys keep the
        # type they had in the training data (a numeric column gives numeric
        # keys), so fall back to comparing the textual forms
        wanted = str(value).strip()
        child = None
        for key, node in curr.next.items():
            if str(key).strip() == wanted:
                child = node
                break
        if child is None:
            raise KeyError(
                f'no branch for {curr.head} = {value!r}; known values: {list(curr.next)}'
            )

    return dfs(child, age, blood, cholest)
    
# classify the person described by the typed-in values and show the decision
print(dfs(curr=tree.root, age=u_age, blood=u_blood_pressure, cholest=u_cholesterol))

person age =  30
person blood pressure =  High
person cholesterol =  High


Sick
