In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Write a Python code to calculate the linear discriminant function for binary class. Your code should be able to predict the Y class based on the input value 𝑋! (in the lecture note page 9).

In [2]:
 def LDA(data, class_col_name, p1, p2, group1, group2, X_new) :
    
    cat_list = data[class_col_name].tolist()
    X1 = data.iloc[np.where(np.array(cat_list) == group1)[0], 2:]
    X2 = data.iloc[np.where(np.array(cat_list) == group2)[0], 2:]
    
    n1 = len(X1)
    n2 = len(X2)

    mean_1 = X1.mean()
    mean_2 = X2.mean()
    s_1 = X1.cov()
    s_2 = X2.cov()
    s_p = ((n1-1) / (n1+n2-2)) * s_1 + ((n2-1) / (n1+n2-2)) * s_2

    a = (mean_1 - mean_2) @ np.linalg.inv(s_p)
    ldf = a @ X_new - (1/2)*(a @ mean_1 + a @ mean_2)
    prior = np.log(p2/p1)

    if ldf >= prior:
        result = group1
    else :
        result = group2
        
    return ldf, prior, result


## 2. Write a Python code to perform the ‘leave-one-out’ method to calculate the accuracy of the LDA model you wrote in #1.

In [3]:
def leave_one_out(data, class_col_name, p1, p2, group1, group2):
    y_pred_list = []

    for i in range(len(data)):
        Test = data.drop(data.index[i])
        drop_row = data.iloc[i,2:]
        y_pred = LDA(Test, class_col_name, p1, p2, group1, group2,drop_row)[2]
        y_pred_list.append(y_pred)
        
    data['y_pred']  = y_pred_list
    n = len(data)
    correct_pred = len(data[data[class_col_name] == data['y_pred']])
    accuracy = correct_pred / n
    return accuracy

## 3. 

In [4]:
# 데이터 불러오기

turkey = pd.read_csv('turkey.dat', delim_whitespace=True)
turkey.head()

Unnamed: 0,ID,SEX,TYPE,WGT,HUM,RAD,ULN,FEMUR,TIB,TIN,CAR,D3P,STL,STB,COR,PEL,MAX,MIN,SCA
0,K766,MALE,WILD,.,.,.,.,.,.,.,.,.,.,.,.,.,142,107,.
1,N399,MALE,WILD,.,153,138,153,139,246,162,810,307,196,74,.,.,145,104,.
2,NEX1,MALE,WILD,.,.,.,.,.,.,.,.,.,224,72,.,.,.,.,.
3,NEX2,MALE,WILD,.,.,.,.,.,.,.,.,.,220,74,.,.,.,.,.
4,NEX3,MALE,WILD,.,.,.,.,.,.,.,.,.,228,78,.,.,.,.,.


In [5]:
# 분석용 버전으로 가공하기

turkey1 = turkey.loc[turkey['SEX'] == 'MALE',
    ['ID','TYPE','HUM','RAD','ULN','FEMUR','TIN','CAR','D3P','COR','SCA']]
turkey1 = turkey1.replace('.',np.nan).dropna().reset_index(drop=True)

turkey1_obj = turkey1.iloc[:,0:2]
turkey1_int = turkey1.iloc[:,2:].astype(int)
turkey1 = pd.concat([turkey1_obj, turkey1_int], axis = 1)
turkey1.head()

Unnamed: 0,ID,TYPE,HUM,RAD,ULN,FEMUR,TIN,CAR,D3P,COR,SCA
0,B710,WILD,153,140,147,142,151,817,305,102,128
1,B790,WILD,156,137,151,146,155,814,305,111,137
2,B819,WILD,158,135,151,146,152,790,289,111,125
3,B085,WILD,148,129,146,139,147,767,287,106,123
4,B089,WILD,157,140,154,140,159,818,301,116,136


### a. Which turkeys in this data set were misclassified by the discriminant rule when the rule was applied to the training data?

In [6]:
y_pred = []
for i in range(len(turkey1)):
    y_pred.append(LDA(turkey1, 'TYPE', 0.4, 0.6, 'WILD', 'DOMESTIC', turkey1.iloc[i, 2:])[2])
turkey_pred = pd.DataFrame()
turkey_pred['y_pred']  = y_pred
turkey_concat = pd.concat([turkey1, turkey_pred], axis = 1)
turkey_concat.loc[turkey_concat['TYPE'] != turkey_concat['y_pred']]

Unnamed: 0,ID,TYPE,HUM,RAD,ULN,FEMUR,TIN,CAR,D3P,COR,SCA,y_pred
0,B710,WILD,153,140,147,142,151,817,305,102,128,DOMESTIC
24,L750,DOMESTIC,149,130,147,140,147,770,300,104,126,WILD


### b. What are the posterior probabilities for both domestic and wild classifications for those turkeys that were misclassified in (a)?

In [7]:
# 특정 ID에 대한 posterior probability 계산하는 함수 작성

def posterior(ID): 

    cat_list = turkey1['TYPE'].tolist()
    X1 = turkey1.iloc[np.where(np.array(cat_list) == 'WILD')[0], 2:]
    X2 = turkey1.iloc[np.where(np.array(cat_list) == 'DOMESTIC')[0], 2:]

    n1 = len(X1)
    n2 = len(X2)

    u1 = X1.mean()
    u2 = X2.mean()

    s_1 = X1.cov()
    s_2 = X2.cov()
    s_p = ((n1-1) / (n1+n2-2)) * s_1 + ((n2-1) / (n1+n2-2)) * s_2

    X_new = turkey1.iloc[np.where(turkey1['ID'] == ID)[0], 2:]

    f1 = np.exp(-1/2 * (np.array(X_new - u1) @ np.linalg.inv(s_p) @ np.array(X_new - u1).T))[0,0]
    f2 = np.exp(-1/2 * (np.array(X_new - u2) @ np.linalg.inv(s_p) @ np.array(X_new - u2).T))[0,0]
    
    pos_wild = f1 / (f1 + f2)
    pos_domestic = f2 / (f1 + f2)
    
    return pos_wild, pos_domestic

In [8]:
print("ID B710 : posterior for wild = {} , posterior for domestic = {}".format(posterior('B710')[0], posterior('B710')[1]))
print("ID L750 : posterior for wild = {} , posterior for domestic = {}".format(posterior('L750')[0], posterior('L750')[1]))

ID B710 : posterior for wild = 0.4337557044636779 , posterior for domestic = 0.5662442955363222
ID L750 : posterior for wild = 0.8066042300946215 , posterior for domestic = 0.1933957699053785


### c. Determine the value of each of the linear discriminant function for turkeys whose IDs are B710 and L674. How do you classify these two turkeys?

In [9]:
# ID B710

LDA(turkey1, 'TYPE', 0.4, 0.6, 'WILD', 'DOMESTIC', turkey1.iloc[0, 2:])

(-0.26654411996259064, 0.4054651081081642, 'DOMESTIC')

In [10]:
# ID L674

LDA(turkey1, 'TYPE', 0.4, 0.6, 'WILD', 'DOMESTIC', turkey1.iloc[14, 2:])

(-8.859415781653924, 0.4054651081081642, 'DOMESTIC')

### d. Calculate the ‘leave-one-out’ accuracy of the LDA model.

In [11]:
leave_one_out(turkey1, 'TYPE', 0.4, 0.6, 'WILD', 'DOMESTIC')

0.8484848484848485