In [23]:
#Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
import pickle
import matplotlib.pyplot as plt

In [24]:
#PCA FUNCTION
def princomp(indep_x,dep_y,n):
    """Project the feature matrix onto ``n`` principal components.

    Parameters
    ----------
    indep_x : feature matrix; the PCA is fitted on it and the same data is
        then transformed.
    dep_y : accepted but not used by this function.
    n : value forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    The transformed feature matrix produced by ``PCA.transform``.
    """
    # Fit first, then transform the very same data with the fitted model.
    return PCA(n_components=n).fit(indep_x).transform(indep_x)

In [25]:
#STANDARDIZATION FUNCTION
def split_scalar(indep_x,dep_y):
    """Split the data 75/25 and standardise the features.

    The split uses ``test_size=0.25`` and ``random_state=0``. The scaler is
    fitted on the training features only and then applied to the test
    features.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature sets are
        the scaled versions and the two label sets are returned as split.
    """
    train_features, test_features, train_labels, test_labels = train_test_split(
        indep_x,
        dep_y,
        test_size=0.25,
        random_state=0,
    )
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_features)  # learns mean / std here
    scaled_test = scaler.transform(test_features)        # reuses the training statistics
    return scaled_train, scaled_test, train_labels, test_labels
     

In [26]:
#FIND EVALUATION METRICS FOR CLASSIFICATION
def cm_prediction(classifier,X_test,y_true=None):
    """Evaluate a fitted classifier on a test set.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_true : optional true labels for ``X_test``. When omitted, the
        notebook-level global ``y_test`` is used, which is how existing
        two-argument callers behave. Passing the labels explicitly avoids
        silently scoring against a stale global from another split.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, X_test, labels, cm)`` where
        ``labels`` are the true labels that were actually used.

    Raises
    ------
    NameError
        If ``y_true`` is omitted and no global ``y_test`` exists.
    """
    if y_true is None:
        # Legacy path: fall back to the global set by the notebook's split cell.
        try:
            y_true = y_test
        except NameError:
            raise NameError(
                "cm_prediction needs the true labels: pass y_true explicitly "
                "or define a global 'y_test' before calling"
            ) from None

    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_true, y_pred)            # actual vs predicted counts
    accuracy = accuracy_score(y_true, y_pred)        # (TP + TN) / all predictions
    report = classification_report(y_true, y_pred)   # precision, recall, f1-score, support
    return classifier, accuracy, report, X_test, y_true, cm

In [27]:
#CLASSIFICATION ALGORITHM
def logistic(X_train,y_train,X_test):
    """Fit a logistic-regression model and evaluate it on ``X_test``.

    The model is built with ``random_state=0``, fitted on the training data
    and handed to ``cm_prediction``; that function's result tuple
    ``(classifier, accuracy, report, X_test, y_test, cm)`` is returned.
    """
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test)

def svm_linear(X_train,y_train,X_test):
    """Fit a linear-kernel support vector classifier and evaluate it.

    Uses ``SVC(kernel='linear', random_state=0)``; returns the tuple produced
    by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, y_test, cm)``.
    """
    model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test)

def svm_NL(X_train,y_train,X_test):
    """Fit an RBF-kernel (non-linear) support vector classifier and evaluate it.

    Uses ``SVC(kernel='rbf', random_state=0)``; returns the tuple produced by
    ``cm_prediction``: ``(classifier, accuracy, report, X_test, y_test, cm)``.
    """
    model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test)
   
def Navie(X_train,y_train,X_test):
    """Fit a Gaussian naive Bayes classifier and evaluate it on ``X_test``.

    Uses ``GaussianNB()`` with its default arguments; returns the tuple
    produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, y_test, cm)``.
    """
    model = GaussianNB().fit(X_train, y_train)
    return cm_prediction(model, X_test)
        
def knn(X_train,y_train,X_test):
    """Fit a k-nearest-neighbours classifier and evaluate it on ``X_test``.

    Uses 5 neighbours with the Minkowski metric and ``p=2``; returns the
    tuple produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, y_test, cm)``.
    """
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

def Decision(X_train,y_train,X_test):
    """Fit an entropy-criterion decision tree and evaluate it on ``X_test``.

    Uses ``DecisionTreeClassifier(criterion='entropy', random_state=0)``;
    returns the tuple produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, y_test, cm)``.
    """
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)


def random(X_train,y_train,X_test):
    """Fit a 10-tree, entropy-criterion random forest and evaluate it.

    Uses ``RandomForestClassifier(n_estimators=10, criterion='entropy',
    random_state=0)``; returns the tuple produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, y_test, cm)``.
    """
    model = RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        random_state=0,
    )
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)
    

In [28]:
#GIVE THE OUTPUT IN TABLE FORMAT
def pca_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf):
    """Collect per-classifier accuracies into a one-row summary table.

    Parameters
    ----------
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf :
        Indexable sequences of accuracy scores for logistic regression,
        linear SVM, non-linear SVM, KNN, naive Bayes, decision tree and
        random forest respectively. Element ``i`` of each sequence fills
        row ``i`` of the table; the table has the single row ``'PCA'``, so
        only element 0 of each sequence is read.

    Returns
    -------
    pandas.DataFrame
        Index ``['PCA']``; columns ``Logistic, SVMl, SVMnl, KNN, Navie,
        Decision, Random``.

    Raises
    ------
    IndexError
        If any of the sequences is empty.
    """
    # Column label -> score sequence, in display order.
    scores_by_column = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
    }
    dataframe = pd.DataFrame(index=['PCA'], columns=list(scores_by_column))
    for number, idex in enumerate(dataframe.index):
        for column, scores in scores_by_column.items():
            # Use .loc so the value is written into the frame itself. The
            # chained form dataframe[column][idex] = ... assigns to a
            # temporary under pandas Copy-on-Write and leaves the cell NaN.
            dataframe.loc[idex, column] = scores[number]
    return dataframe

In [29]:
# Load the preprocessed data; index_col=None keeps the default 0, 1, ... row index.
dataset1 = pd.read_csv("prep.csv", index_col=None)

# Encode categorical columns as dummy columns; drop_first=True drops the first
# dummy of each encoded column.
df2 = pd.get_dummies(dataset1, drop_first=True)

# Target is the 'classification_yes' column; the features are every other column.
dep_y = df2['classification_yes']
indep_x = df2.drop(columns='classification_yes')


In [30]:
# Suppress every warning for the rest of the session.
# NOTE(review): no category or module is given, so this hides all warning
# classes from every library, not just cosmetic ones — consider narrowing
# the filter once the noisy warnings have been identified.
warnings.filterwarnings("ignore")

In [33]:

# Reduce the feature matrix to 5 principal components.
# NOTE(review): princomp fits the PCA on every row of indep_x before the
# train/test split below, so the projection has already seen the rows that
# later become the test set — confirm this is acceptable for the reported
# accuracies.
principle = princomp(indep_x, dep_y, 5)

# Split the projected features into train / test sets and standardise them.
X_train, X_test, y_train, y_test = split_scalar(principle, dep_y)

# One accuracy list per classifier; each receives a single score below.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = ([] for _ in range(7))

# (training function, list collecting its accuracy), in reporting order.
for trainer, scores in (
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
):
    classifier, accuracy, report, X_test, y_test, cm = trainer(X_train, y_train, X_test)
    scores.append(accuracy)

# Assemble the one-row accuracy table for the PCA feature set.
result = pca_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
PCA,0.83,0.83,0.87,0.87,0.81,0.86,0.86
