In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Feature Selection with selectkbest
-
The function selectkbest(indep_X, dep_Y, n) uses SelectKBest to select the top n features from the dataset indep_X based on the Chi-Square test with respect to the target variable dep_Y.

This is a feature selection technique to reduce the dimensionality of the dataset before applying the classification models.

In [2]:
    def selectkbest(indep_X, dep_Y, n):
        """Pick the ``n`` best-scoring features of ``indep_X`` with a chi-square test.

        Parameters
        ----------
        indep_X : feature table; must expose ``.columns`` (it is indexed with
            the selector's support mask to recover the chosen names).
        dep_Y : target values the features are scored against.
        n : number of features to keep (forwarded as ``k`` to ``SelectKBest``).

        Returns
        -------
        tuple
            ``(reduced_features, selected_columns)`` - the output of the
            selector's ``transform`` on ``indep_X`` and the matching column labels.
        """
        selector = SelectKBest(score_func=chi2, k=n)
        selector.fit(indep_X, dep_Y)
        reduced = selector.transform(indep_X)
        # get_support() is the mask of retained columns.
        chosen = indep_X.columns[selector.get_support()]
        return reduced, chosen

Data Splitting and Scaling: split_scalar
-
This function splits the dataset into training and testing sets using train_test_split (75% training and 25% testing).

It then scales the feature data using StandardScaler, which standardizes the features to have a mean of 0 and a standard deviation of 1.

In [3]:
    def split_scalar(indep_X, dep_Y):
        """Split the data 75/25 and standardise the features.

        The split uses ``test_size=0.25`` and ``random_state=0``. The
        ``StandardScaler`` is fitted on the training features only and then
        applied to the test features.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test, scaler)`` where the two feature
            sets are the scaled versions and ``scaler`` is the fitted scaler.
        """
        parts = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
        raw_train, raw_test, y_train, y_test = parts

        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(raw_train)
        scaled_test = scaler.transform(raw_test)
        return scaled_train, scaled_test, y_train, y_test, scaler

Confusion Matrix and Accuracy: cm_prediction
-
This function takes a trained classifier and the test set X_test, predicts the labels, and then calculates:
1. The confusion matrix (cm).
2. The accuracy score (accuracy).
3. The classification report, which includes precision, recall, and F1-score.

It returns, in this order: the classifier, the accuracy, the classification report, X_test, y_test, and the confusion matrix.

In [4]:
    def cm_prediction(classifier, X_test, y_test=None):
        """Predict on ``X_test`` and score the predictions against the true labels.

        Parameters
        ----------
        classifier : fitted estimator exposing ``predict``.
        X_test : test features handed to ``classifier.predict``.
        y_test : true labels for ``X_test``. Optional for backward
            compatibility: when omitted, the notebook-level global ``y_test``
            is used, which is what this function previously always did.
            Passing it explicitly avoids depending on hidden notebook state.

        Returns
        -------
        tuple
            ``(classifier, accuracy, report, X_test, y_test, cm)`` where
            ``accuracy`` comes from ``accuracy_score``, ``report`` from
            ``classification_report`` and ``cm`` from ``confusion_matrix``.

        Raises
        ------
        NameError
            If ``y_test`` is not supplied and no global ``y_test`` exists.
        """
        if y_test is None:
            # Legacy path: existing callers pass only (classifier, X_test).
            try:
                y_test = globals()['y_test']
            except KeyError:
                raise NameError(
                    "name 'y_test' is not defined; pass y_test explicitly "
                    "or run the train/test split cell first"
                ) from None

        y_pred = classifier.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return classifier, accuracy, report, X_test, y_test, cm

Multiple Classifier Functions
-
These functions train different classifiers on the training data and use cm_prediction to evaluate their performance on the test set:

The first of these, logistic, trains a Logistic Regression model on X_train and y_train and then evaluates it using X_test. The remaining classifier functions follow the same steps with a different model.

A Logistic Regression model is instantiated and assigned to the variable classifier

Training the Model:

The fit method is called on the classifier object to train the model.
The model is trained on the input data X_train and the corresponding labels y_train.
This step adjusts the model's parameters (weights) based on the training data, allowing it to learn the relationship between the features and the target.

Evaluating the Model: Calling cm_prediction:

After the model is trained, the function cm_prediction is called to make predictions and evaluate the performance of the logistic regression model on the test set.

Inputs to cm_prediction:

classifier: The trained logistic regression model.
X_test: The test features used to make predictions.

Outputs from cm_prediction:

classifier: The trained classifier (unchanged).
Accuracy: The accuracy score of the model on the test data (the fraction of correct predictions, between 0 and 1).
report: The classification report (which includes precision, recall, F1-score, etc.).
X_test: The test feature set (unchanged).
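y_test: The true labels for the test set. They are not passed in by the classifier functions; cm_prediction reads y_test from the notebook's global scope, so the train/test split cell must be run first.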
cm: The confusion matrix, which shows the distribution of predicted versus actual labels.

In [5]:
    def logistic(X_train, y_train, X_test):
        """Fit a logistic regression (``random_state=0``) and evaluate it.

        Evaluation is delegated to ``cm_prediction(model, X_test)``; its
        6-tuple ``(classifier, accuracy, report, X_test, y_test, cm)`` is
        returned as-is. ``y_test`` is not a parameter here - it is whatever
        ``cm_prediction`` hands back.
        """
        from sklearn.linear_model import LogisticRegression

        model = LogisticRegression(random_state=0)
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

In [6]:
    def svm_linear(X_train, y_train, X_test):
        """Fit a linear-kernel ``SVC`` (``random_state=0``) and evaluate it.

        Returns the 6-tuple produced by ``cm_prediction(model, X_test)``:
        ``(classifier, accuracy, report, X_test, y_test, cm)``.
        """
        from sklearn.svm import SVC

        model = SVC(kernel='linear', random_state=0)
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

In [7]:
    def svm_NL(X_train, y_train, X_test):
        """Fit an RBF-kernel ``SVC`` (``random_state=0``) and evaluate it.

        Returns the 6-tuple produced by ``cm_prediction(model, X_test)``:
        ``(classifier, accuracy, report, X_test, y_test, cm)``.
        """
        from sklearn.svm import SVC

        model = SVC(kernel='rbf', random_state=0)
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

In [8]:
    def Navie(X_train, y_train, X_test):
        """Fit a Gaussian naive Bayes classifier and evaluate it.

        Returns the 6-tuple produced by ``cm_prediction(model, X_test)``:
        ``(classifier, accuracy, report, X_test, y_test, cm)``.
        """
        from sklearn.naive_bayes import GaussianNB

        model = GaussianNB()
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

In [9]:
    def knn(X_train, y_train, X_test):
        """Fit a 5-nearest-neighbours classifier and evaluate it.

        The metric is Minkowski with ``p=2``. Returns the 6-tuple produced by
        ``cm_prediction(model, X_test)``:
        ``(classifier, accuracy, report, X_test, y_test, cm)``.
        """
        from sklearn.neighbors import KNeighborsClassifier

        model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

In [10]:
    def Decision(X_train, y_train, X_test):
        """Fit an entropy-criterion decision tree (``random_state=0``) and evaluate it.

        Returns the 6-tuple produced by ``cm_prediction(model, X_test)``:
        ``(classifier, accuracy, report, X_test, y_test, cm)``.
        """
        from sklearn.tree import DecisionTreeClassifier

        model = DecisionTreeClassifier(criterion='entropy', random_state=0)
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

In [11]:
    def random(X_train, y_train, X_test):
        """Fit a 10-tree, entropy-criterion random forest (``random_state=0``) and evaluate it.

        Returns the 6-tuple produced by ``cm_prediction(model, X_test)``:
        ``(classifier, accuracy, report, X_test, y_test, cm)``.

        NOTE(review): the function name is the same as the standard-library
        ``random`` module; importing that module later in the notebook would
        rebind this name.
        """
        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
        model.fit(X_train, y_train)
        return cm_prediction(model, X_test)

Result Aggregation: selectk_Classification
-
This function takes the accuracies from the different classifiers (Logistic Regression, SVM, KNN, etc.) and organizes them into a pandas DataFrame.

Each classifier's accuracy is stored under the corresponding column name (e.g., 'Logistic', 'SVMl', etc.).

The DataFrame is returned, displaying the results for each classifier.

In [12]:
    def selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
        """Arrange per-classifier accuracies into a one-row comparison table.

        Each argument is a sequence of accuracies for one classifier; element
        ``i`` of every sequence fills row ``i`` of the table. The table has a
        single row labelled ``'ChiSquare'``, so only element 0 is read.

        Returns
        -------
        pandas.DataFrame
            Columns ``Logistic, SVMl, SVMnl, KNN, Navie, Decision, Random``.
        """
        # Column label -> accuracy sequence, in display order.
        scores_by_column = [
            ('Logistic', acclog),
            ('SVMl', accsvml),
            ('SVMnl', accsvmnl),
            ('KNN', accknn),
            ('Navie', accnav),
            ('Decision', accdes),
            ('Random', accrf),
        ]
        table = pd.DataFrame(
            index=['ChiSquare'],
            columns=[label for label, _ in scores_by_column],
        )
        for position, row_label in enumerate(table.index):
            for label, scores in scores_by_column:
                table.loc[row_label, label] = scores[position]
        return table

In [13]:
# Read prep.csv; index_col=None means no CSV column is used as the row index.
dataset1 = pd.read_csv("prep.csv", index_col=None)

In [14]:
# df2 is a second name for the same DataFrame object (no copy is made).
df2 = dataset1

Data Preparation and Execution
-

pd.get_dummies(df2, drop_first=True) converts categorical variables into binary variables.

indep_X = df2.drop('classification_yes', axis=1) selects the independent variables (features), and dep_Y = df2['classification_yes'] selects the target variable (dependent variable).

In [15]:
# One-hot encode the categorical columns, dropping the first level of each.
# The input is `dataset1` (the frame `df2` aliased until now), so re-running
# this cell always starts from the loaded data rather than from its own output.
df2 = pd.get_dummies(dataset1, drop_first=True)
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [16]:
# Target is the encoded 'classification_yes' column; every other column is a feature.
dep_Y = df2['classification_yes']
indep_X = df2.drop(columns='classification_yes')

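The top 5 features are selected using kbest_features, kbest_columns = selectkbest(indep_X, dep_Y, 5), which returns both the reduced feature matrix and the names of the selected columns.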

Then, for each classifier (Logistic Regression, SVM, KNN, etc.), the accuracy is calculated and stored in the respective lists (acclog, accsvml, etc.).

In [17]:
# Keep the 5 features with the highest chi-square score.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed further down - confirm this is intended.
kbest_features, kbest_columns = selectkbest(indep_X, dep_Y, n=5)
print("Selected Columns:", kbest_columns)

Selected Columns: Index(['bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object')


In [18]:
# One independent, initially empty accuracy list per classifier.
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = ([] for _ in range(7))

In [19]:
# Split the selected features 75/25 and standardise them; `sc` is the fitted scaler.
# The global `y_test` bound here is the one cm_prediction scores against.
(X_train, X_test,
 y_train, y_test, sc) = split_scalar(kbest_features, dep_Y)

In [20]:
# Logistic regression: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = logistic(X_train, y_train, X_test)
acclog.append(Accuracy)

In [21]:
# Linear-kernel SVM: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = svm_linear(X_train, y_train, X_test)
accsvml.append(Accuracy)

In [22]:
# RBF-kernel SVM: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = svm_NL(X_train, y_train, X_test)
accsvmnl.append(Accuracy)

In [23]:
# k-nearest neighbours: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = knn(X_train, y_train, X_test)
accknn.append(Accuracy)

In [24]:
# Gaussian naive Bayes: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = Navie(X_train, y_train, X_test)
accnav.append(Accuracy)

In [25]:
# Decision tree: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = Decision(X_train, y_train, X_test)
accdes.append(Accuracy)

In [26]:
# Random forest: fit, evaluate, and record its test accuracy.
(classifier, Accuracy, report,
 X_test, y_test, cm) = random(X_train, y_train, X_test)
accrf.append(Accuracy)

In [27]:
# Gather the recorded accuracies into the one-row comparison table and display it.
result = selectk_Classification(
    acclog=acclog,
    accsvml=accsvml,
    accsvmnl=accsvmnl,
    accknn=accknn,
    accnav=accnav,
    accdes=accdes,
    accrf=accrf,
)
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.94,0.94,0.95,0.89,0.83,0.96,0.95
