In [1]:
#Import libraries
import pandas as pd
import numpy as np
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
def selectkbest(indep_X, dep_Y, n):
    """Keep the ``n`` input columns that score best against the output.

    Every input column is scored against the output variable with the
    chi-squared score function and only the ``n`` best-scoring columns are
    kept. Dropping the weaker inputs makes the downstream models simpler
    and faster.

    Parameters
    ----------
    indep_X : input columns handed to ``SelectKBest.fit`` and ``transform``.
    dep_Y : output variable the columns are scored against.
    n : number of columns to keep (forwarded to ``SelectKBest`` as ``k``).

    Returns
    -------
    The value of ``transform(indep_X)``, i.e. only the selected columns.
    """
    return SelectKBest(score_func=chi2, k=n).fit(indep_X, dep_Y).transform(indep_X)

def split_scalar(indep_X, dep_Y, test_size=0.25, random_state=0):
    """Split the data into training and testing sets and standardize the inputs.

    The scaler is fitted on the training inputs only and then applied to the
    testing inputs, so the testing data does not influence the scaling.

    Parameters
    ----------
    indep_X : input values to split and scale.
    dep_Y : output values, split alongside ``indep_X`` (not scaled).
    test_size : share of the data held out for testing; defaults to 0.25
        (75% training / 25% testing).
    random_state : seed forwarded to ``train_test_split``; defaults to 0.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two ``X`` parts
        have been passed through the ``StandardScaler``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        indep_X, dep_Y, test_size=test_size, random_state=random_state
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

def cm_prediction(classifier, X_test, y_test=None):
    """Predict on the test inputs and summarize how good the predictions are.

    Parameters
    ----------
    classifier : an already trained model exposing ``predict``.
    X_test : test inputs to predict on.
    y_test : true outputs for ``X_test``. When omitted, the module-level
        variable named ``y_test`` is used so that existing two-argument
        calls keep working; passing it explicitly avoids depending on
        notebook state.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` where
        ``Accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no module-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix

    if y_test is None:
        # Backward compatibility: older callers rely on a global y_test
        # created by the cell that performs the train/test split.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError(
                "y_test was not passed to cm_prediction and no module-level "
                "y_test is defined"
            ) from None

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm

In [3]:
#Each function is used to train and test a different machine learning model.
#All functions follow the same steps: Create the model, Train the model using training data (X_train, y_train)
#Test the model using test data (X_test), Evaluate the model using accuracy, confusion matrix, and classification report.
#These functions help in comparing different machine learning algorithms on the same dataset.
#By checking their accuracy and reports, we can identify which model performs best for a particular problem.


def logistic(X_train, y_train, X_test):
    """Train a logistic-regression classifier and evaluate it on the test inputs.

    Logistic regression is used for Yes/No type predictions; it models the
    relationship between the input features and the output.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

def svm_linear(X_train, y_train, X_test):
    """Train a linear-kernel SVM and evaluate it on the test inputs.

    A linear kernel separates the classes with a straight line, which works
    when the data can be divided clearly.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

def svm_NL(X_train, y_train, X_test):
    """Train a non-linear (RBF-kernel) SVM and evaluate it on the test inputs.

    The RBF kernel gives a curved decision boundary, for data that cannot
    be split by a straight line.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='rbf', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

def Navie(X_train, y_train, X_test):
    """Train a Gaussian Naive Bayes classifier and evaluate it on the test inputs.

    Naive Bayes is a fast, simple model based on probability.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.naive_bayes import GaussianNB

    model = GaussianNB()
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)
    
def knn(X_train, y_train, X_test):
    """Train a k-nearest-neighbours classifier and evaluate it on the test inputs.

    The class of a point is decided from the nearby data points; here the
    5 nearest neighbours under the Minkowski metric with ``p = 2`` are used.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

def Decision(X_train, y_train, X_test):
    """Train a decision-tree classifier and evaluate it on the test inputs.

    The tree splits the data into smaller parts using decision rules (like
    a flowchart) and uses the entropy criterion to choose each split.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

def random(X_train, y_train, X_test):
    """Train a random-forest classifier and evaluate it on the test inputs.

    The forest combines 10 decision trees (entropy criterion) to make a
    stronger model than a single tree.

    Returns the tuple produced by ``cm_prediction``:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [4]:
def selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrff):
    """Build a one-row table of model accuracies after Chi-Square feature selection.

    Each argument holds the accuracy recorded for one model. The result has
    a single row labelled ``'ChiSquare'`` and one column per model, in the
    order Logistic, SVMl, SVMnl, KNN, Navie, Decision, Random.

    Returns
    -------
    pandas.DataFrame
        The comparison table.
    """
    model_columns = ('Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random')
    accuracies = (acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrff)
    return pd.DataFrame(dict(zip(model_columns, accuracies)), index=['ChiSquare'])

In [5]:
#Load the dataset
dataset1 = pd.read_csv("prep.csv", index_col=None)
#Convert categorical columns to numbers (dummy columns, first level of each dropped)
df2 = pd.get_dummies(dataset1, drop_first=True)

df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [6]:
#Split data into inputs and output: 'classification_yes' is the output column,
#every other column is an input.
dep_Y = df2['classification_yes']
indep_X = df2.drop(columns=['classification_yes'])

In [22]:
#Select the best features (the 6 best-scoring input columns)
kbest = selectkbest(indep_X, dep_Y, 6)
#One empty list per model; each will hold that model's accuracy score after training.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = ([] for _ in range(7))

In [23]:
#Inspect the selected feature values returned by selectkbest (one row per sample, one column per kept feature)
kbest

array([[3.00000000e+00, 1.48112676e+02, 5.74821053e+01, 3.07735602e+00,
        3.88689024e+01, 8.40819113e+03],
       [2.00000000e+00, 1.48112676e+02, 2.20000000e+01, 7.00000000e-01,
        3.40000000e+01, 1.23000000e+04],
       [1.00000000e+00, 9.90000000e+01, 2.30000000e+01, 6.00000000e-01,
        3.40000000e+01, 8.40819113e+03],
       ...,
       [3.00000000e+00, 1.10000000e+02, 1.15000000e+02, 6.00000000e+00,
        2.60000000e+01, 9.20000000e+03],
       [0.00000000e+00, 2.07000000e+02, 8.00000000e+01, 6.80000000e+00,
        3.88689024e+01, 8.40819113e+03],
       [0.00000000e+00, 1.00000000e+02, 4.90000000e+01, 1.00000000e+00,
        5.30000000e+01, 8.50000000e+03]])

In [24]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test=split_scalar(kbest,dep_Y)

#Train every model on the same split and record its accuracy.
#Each accuracy list is overwritten instead of appended to, so re-running this
#cell always leaves exactly one score per model. The one-row 'ChiSquare' table
#built by selectk_Classification needs exactly one score per model.
for train_model, accuracy_list in (
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
):
    classifier,Accuracy,report,X_test,y_test,cm=train_model(X_train,y_train,X_test)
    accuracy_list[:] = [Accuracy]

In [25]:
#Compare all models in a table (one column per model, single 'ChiSquare' row)
result = selectk_Classification(
    acclog=acclog,
    accsvml=accsvml,
    accsvmnl=accsvmnl,
    accknn=accknn,
    accnav=accnav,
    accdes=accdes,
    accrff=accrf,
)

In [11]:
result
# NOTE(review): "5" presumably labels an earlier run with 5 selected features
# (third argument of selectkbest); this cell's execution count is out of order — confirm.

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.94,0.94,0.95,0.89,0.83,0.96,0.95


In [21]:
result
# NOTE(review): "4" presumably labels an earlier run with 4 selected features
# (third argument of selectkbest); this cell's execution count is out of order — confirm.

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.85,0.82,0.83,0.86,0.79,0.89,0.89


In [26]:
result
# NOTE(review): "6" presumably labels the run with 6 selected features, matching
# the selectkbest(indep_X,dep_Y,6) call in the feature-selection cell — confirm.

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.95,0.96,0.96,0.93,0.89,0.97,0.97
