# Linear Discriminanat Analysis (LDA)

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
def feat_lda(indep_x,dep_y,x_test):
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
    valid_n_components = min(indep_x.shape[1], len(np.unique(dep_y)) - 1)
    
    lda = LinearDiscriminantAnalysis(n_components = 1)
    indep1 = lda.fit_transform(indep_x,dep_y)
    test1 = lda.transform(x_test)
    return indep1

def split_scaler(indep_X,dep_Y):
    x_train, x_test, y_train, y_test=train_test_split(indep_X,dep_Y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train, x_test, y_train, y_test

# prediction and evalation metrics
def cm_prediction(classifier,x_test):
    y_pred=classifier.predict(x_test)
    
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(y_test,y_pred)
    
    from sklearn.metrics import classification_report
    clf_report=classification_report(y_test,y_pred)
    
    from sklearn.metrics import accuracy_score
    accuracy=accuracy_score(y_test,y_pred)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def logistic(x_train, y_train, x_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def svm_linear(x_train, y_train, x_test):
    from sklearn.svm import SVC
    classifier=SVC(C=1.0, kernel='linear', degree=3, gamma='scale',random_state=0)
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def svm_nonlinear(x_train, y_train, x_test):
    from sklearn.svm import SVC
    classifier=SVC(C=1.0, kernel='rbf', degree=3, gamma='scale',random_state=0)
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def naivebayes(x_train, y_train, x_test):
    from sklearn.naive_bayes import GaussianNB
    classifier=GaussianNB()
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def knn(x_train, y_train, x_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier=KNeighborsClassifier(n_neighbors=10, p=4, metric='minkowski',weights='distance')
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def randomForest(x_train, y_train, x_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=50, max_features='sqrt')
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def decisionTree(x_train, y_train, x_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier=DecisionTreeClassifier(criterion='entropy', splitter= 'random', max_depth= 6)
    classifier.fit(x_train, y_train)
    classifier,cm,clf_report,accuracy,x_test,y_test = cm_prediction(classifier,x_test)
    return classifier,cm,clf_report,accuracy,x_test,y_test

def lda_table(acclog,accsvm_l,accsvm_nl,accknn,accnaive,accdes,accrf):
    dataframe=pd.DataFrame(index=['LDA'],columns=['logistic','svm_linear','svm_nonlinear','Naive','KNN','DTree','RForest'])
    for number,idex in enumerate(dataframe.index):
        dataframe['logistic'][idex]=acclog[number]
        dataframe['svm_linear'][idex]=accsvm_l[number]
        dataframe['svm_nonlinear'][idex]=accsvm_nl[number]
        dataframe['Naive'][idex]=accnaive[number]
        dataframe['KNN'][idex]=accknn[number]
        dataframe['DTree'][idex]=accdes[number]
        dataframe['RForest'][idex]=accrf[number]
        return dataframe


In [12]:
# Data Collection
dataset=pd.read_csv('prep.csv',index_col=None)
df=dataset
df

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [13]:
# Data Preprocessing
df=pd.get_dummies(df,drop_first=True)

In [14]:
df

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [15]:
# input output split
indep_x=df.drop('classification_yes',axis=1)
dep_y=df['classification_yes']

In [16]:
x_train,x_test,y_train,y_test=split_scaler(indep_x,dep_y)

In [17]:
dep_y

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [18]:
print("Number of features:", indep_x.shape[1])
print("Number of unique classes:", len(np.unique(dep_y)))

Number of features: 27
Number of unique classes: 2


In [19]:
indep1 = feat_lda(indep_x,dep_y,x_test)



In [20]:
acclog=[]
accsvm_l=[]
accsvm_nl=[]
accknn=[]
accnaive=[]
accdes=[]
accrf=[]

classifier,cm,clf_report,accuracy,x_test,y_test=logistic(x_train,y_train,x_test)
acclog.append(accuracy)

classifier,cm,clf_report,accuracy,x_test,y_test=svm_linear(x_train,y_train,x_test)
accsvm_l.append(accuracy)

classifier,cm,clf_report,accuracy,x_test,y_test=svm_nonlinear(x_train,y_train,x_test)
accsvm_nl.append(accuracy)

classifier,cm,clf_report,accuracy,x_test,y_test=naivebayes(x_train,y_train,x_test)
accnaive.append(accuracy)

classifier,cm,clf_report,accuracy,x_test,y_test=knn(x_train,y_train,x_test)
accknn.append(accuracy)

classifier,cm,clf_report,accuracy,x_test,y_test=randomForest(x_train,y_train,x_test)
accrf.append(accuracy)

classifier,cm,clf_report,accuracy,x_test,y_test=decisionTree(x_train,y_train,x_test)
accdes.append(accuracy)

In [21]:
result=lda_table(acclog,accsvm_l,accsvm_nl,accknn,accnaive,accdes,accrf)

In [22]:
result

Unnamed: 0,logistic,svm_linear,svm_nonlinear,Naive,KNN,DTree,RForest
LDA,0.99,0.98,0.99,0.98,0.93,0.98,0.99
