# PYTHON FOR ML

# Individual contribution:
1) P Sri Vaishno - Gaussian Naive Bayes algorithm and Logistic regression algorithm

2) B Manjunath - KNN algorithm

3) K V Sai Krishna - Linear Discriminant analysis (LDA) and Model analysis 

4) P Shankar - Support Vector Machine algorithm (SVM)

# Logistic Regression

Importing libraries

In [1]:
import numpy as np
import pandas as pd

Function to summarize the class distribution of the dataset for better understanding

In [2]:
def dataClassification(dataset):
    """Summarize how many diagnoses in ``dataset`` are benign and malignant.

    The class code is read from the column at position 10. Code 2 is counted
    as benign and code 4 as malignant; rows carrying any other code are part
    of the reported total but of neither class count.

    Args:
        dataset: DataFrame whose column at position 10 holds the class codes.

    Returns:
        A one-sentence summary with the total row count and both class counts.
    """
    class_codes = dataset.iloc[:, 10]

    # Boolean-mask counting is positional, so it works with any row index, and
    # it yields 0 (instead of an unbound local) when a class does not occur.
    benign_count = int((class_codes == 2).sum())
    malignant_count = int((class_codes == 4).sum())
    test_patients = dataset.shape[0]

    return (
        f"The data consists of {test_patients} different diagnosis in which "
        f"{benign_count} are Benign and {malignant_count} are Malignant"
    )

Functions to split the dataset into training and testing data

In [3]:
def trainingDataset(dataset, splitRatio):
    """Return the leading share of ``dataset`` to be used for training.

    Args:
        dataset: DataFrame to split.
        splitRatio: Fraction of the rows that goes into the training set.

    Returns:
        DataFrame with the first ``int(splitRatio * number_of_rows)`` rows and
        the columns at positions 1 through 10.
    """
    row_count = dataset.shape[0]
    last_train_row = int(splitRatio * row_count)

    # Column 0 is left out; positions 1..10 are kept.
    return pd.DataFrame(dataset.iloc[:last_train_row, 1:11])

In [4]:
def testingData(dataset, splitRatio):
    
    total_data = dataset.shape[0]
    test_data = dataset.iloc[int(total_data - total_data*(1 - splitRatio)):total_data, 1:11]
    d1 = pd.DataFrame(test_data)
    return d1

Function to report how the data was split

In [5]:
def DataDivision(data1, data2):
    """Describe the sizes of the training and testing parts.

    Args:
        data1: The training data; only its length is used.
        data2: The testing data; only its length is used.

    Returns:
        A sentence stating how many samples each part contains.
    """
    train_count, test_count = len(data1), len(data2)
    return (
        f"The data is divided into {train_count} training data samples and "
        f"{test_count} testing data samples"
    )

In [35]:
def testdataclassification(test_data):
    
    Bdata, Mdata =  (test_data['Class'].value_counts())
    testshape = test_data.shape[0]
    
    l1 = str("The testing data consists of " + str(testshape) + " different diagnosis in which " + str(Bdata) + 
          " are Benign and " + str(Mdata) + " are Malignant")
    return l1

The sigmoid function, which maps any real-valued input into the range (0, 1)

In [6]:
def SigmoidFunction(x):
    """Evaluate the logistic sigmoid ``1 / (1 + exp(-x))``.

    Args:
        x: A number or array-like accepted by ``np.exp``; arrays are handled
            element-wise.

    Returns:
        The sigmoid of ``x``, with the same shape as the input.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

Function to optimize the parameters (weights and bias) using gradient descent

In [12]:
def Optimize(dataset, classLabel, LearningRate, No_Of_iterations):
    """Fit logistic-regression weights and bias with batch gradient descent.

    Both parameters start at zero and are updated ``No_Of_iterations`` times
    using the mean of ``(sigmoid(X.w + b) - y)`` over all rows.

    NOTE(review): this update rule is the log-loss gradient, which presumes
    ``classLabel`` is encoded as 0/1. main() in this file passes the raw class
    column (codes 2 and 4) -- confirm the intended encoding with the author.

    Args:
        dataset: 2-D feature table (DataFrame, ndarray or nested sequence),
            one row per sample.
        classLabel: One target value per row of ``dataset``.
        LearningRate: Step size applied to each gradient update.
        No_Of_iterations: Number of gradient-descent steps to run.

    Returns:
        Tuple ``(weights, bias)``: a 1-D float array with one weight per
        feature column, and the scalar bias.

    Raises:
        ValueError: If ``dataset`` is not 2-D, has no rows, or its row count
            differs from the number of labels.
    """
    # Convert once up front so the loop works on plain float arrays instead of
    # re-converting the pandas objects on every iteration.
    features = np.asarray(dataset, dtype=float)
    targets = np.asarray(classLabel, dtype=float).reshape(-1)

    if features.ndim != 2:
        raise ValueError("dataset must be two-dimensional (samples x features)")
    size, feature_count = features.shape
    if size == 0:
        raise ValueError("dataset must contain at least one sample")
    if targets.shape[0] != size:
        raise ValueError("classLabel must provide exactly one label per sample")

    weights = np.zeros(feature_count)
    bias = 0.0

    # gradient descent
    for _ in range(No_Of_iterations):
        # approximate y with linear combination of weights and x, plus bias
        sigma = SigmoidFunction(np.dot(features, weights) + bias)
        error = sigma - targets

        # compute gradients (averaged over all samples)
        dw = (1 / size) * np.dot(features.T, error)
        db = (1 / size) * np.sum(error)

        # update parameters
        weights -= LearningRate * dw
        bias -= LearningRate * db

    return weights, bias

Function to predict the class labels for the testing data

In [13]:
def predict(test_data, weights, bias):
    """Assign a class code (2 or 4) to each row of ``test_data``.

    Args:
        test_data: DataFrame indexed by position; columns 0 through 8 are read.
        weights: Indexable collection of weights; entries 0 through 8 are read.
        bias: Scalar added to the weighted feature before the sigmoid.

    Returns:
        A list with one entry per score in ``pred``: 2 when the score is below
        the decision boundary, otherwise 4.
    """
    
    y_predicted_class = []
    # Testing the data
    # NOTE(review): `pred` is reassigned on every pass instead of accumulated,
    # so after the loop it holds only sigmoid(column 8 * weights[8] + bias);
    # columns 0-7 do not influence the result. Presumably the full linear
    # combination np.dot(features, weights) + bias was intended -- confirm.
    for i in range(0, 9):
        pred = SigmoidFunction(np.dot(test_data.iloc[:, i], weights[i]) + bias)
    for i in pred:
        # The decision boundary
        # NOTE(review): this constant looks hand-tuned to the single-feature
        # score above; revisit it together with the scoring loop and with the
        # label encoding used for training, not in isolation.
        if i < 0.9965361244452257:
            y_predicted_class.append(2)
        else:
            y_predicted_class.append(4)
    
    return y_predicted_class

Function to evaluate the model performance using different metrics

In [14]:
def _percentage(numerator, denominator):
    """Return ``numerator / denominator`` in percent, or 0.0 if it is undefined."""
    if denominator == 0:
        return 0.0
    return numerator / denominator * 100.0


def _f_score(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 if both are 0."""
    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall) / (recall + precision)


def Classification_report(testData, y_predicted):
    """Evaluate predicted class codes against the true ones.

    Class code 2 is treated as the positive class and 4 as the negative class.
    The true codes are read from the column at position 9 of ``testData``.
    Rows whose predicted or true code is neither 2 nor 4 are not counted.
    A metric whose denominator is zero (for example the precision of a class
    that was never predicted) is reported as 0.0 instead of raising.

    Args:
        testData: DataFrame whose column at position 9 holds the true codes.
        y_predicted: Sequence of predicted codes, aligned by position with the
            rows of ``testData``; entries beyond the last row are ignored.

    Returns:
        Tuple ``(A, P1, P2, R1, R2, F1, F2, CM)``: accuracy, precision of
        class 2 and class 4, recall of class 2 and class 4, F1 score of class 2
        and class 4 (all in percent), and the matrix ``[[TP, FP], [FN, TN]]``.

    Raises:
        ValueError: If ``y_predicted`` has fewer entries than ``testData`` has
            rows.
    """
    actual = testData.iloc[:, 9].to_numpy()
    row_count = actual.shape[0]

    predicted = np.asarray(y_predicted)
    if predicted.shape[0] < row_count:
        raise ValueError("y_predicted must contain one prediction per test row")
    predicted = predicted[:row_count]

    predicted_2, predicted_4 = predicted == 2, predicted == 4
    actual_2, actual_4 = actual == 2, actual == 4

    # Compare the predicted labels with the actual ones to get the
    # true/false positives and negatives.
    TP = int(np.count_nonzero(predicted_2 & actual_2))
    TN = int(np.count_nonzero(predicted_4 & actual_4))
    FN = int(np.count_nonzero(predicted_4 & actual_2))
    FP = int(np.count_nonzero(predicted_2 & actual_4))

    # Accuracy is the share of predictions that were correct
    A = _percentage(TP + TN, TP + TN + FP + FN)

    # Precision: of the rows predicted as a class, how many really belong to it
    P1 = _percentage(TP, TP + FP)
    P2 = _percentage(TN, TN + FN)

    # Recall: of the rows that belong to a class, how many were found
    R1 = _percentage(TP, TP + FN)
    R2 = _percentage(TN, TN + FP)

    # Harmonic mean of recall and precision
    F1 = _f_score(P1, R1)
    F2 = _f_score(P2, R2)

    # A matrix to represent all types of predictions
    CM = np.array([[TP, FP], [FN, TN]])

    return A, P1, P2, R1, R2, F1, F2, CM

The main() function

In [36]:
def main(data_path=r"C:\Users\sriva\OneDrive\Desktop\Python_for_ML_project_dataset\breast-cancer-wisconsin.xlsx"):
    """Run the logistic-regression pipeline and print its evaluation.

    Reads the dataset, splits it into training and testing parts, fits the
    parameters, predicts the test classes and prints the data summaries
    followed by accuracy, macro-averaged precision/recall/F1 and the
    confusion matrix.

    Args:
        data_path: Location of the Excel file to read. Defaults to the path
            originally hard-coded here, so existing ``main()`` calls keep
            working; pass another path to run on a different machine.
    """
    # Reading the data file (read_excel already returns a DataFrame)
    df = pd.read_excel(data_path)

    # Giving the constants
    LearningRate = 0.001
    No_Of_iterations = 1000
    splitRatio = 0.75

    # Processing the data
    m = dataClassification(df)
    df1 = trainingDataset(df, splitRatio)
    df2 = testingData(df, splitRatio)
    l = DataDivision(df1, df2)
    t = testdataclassification(df2)

    # Parameter calculation: columns 0-8 are the features, column 9 the class.
    # NOTE(review): the class column is passed as-is (codes 2/4) -- confirm
    # that this is the encoding Optimize is meant to be trained on.
    weights, bias = Optimize(df1.iloc[:, 0:9], df1.iloc[:, 9], LearningRate, No_Of_iterations)

    # Predictions
    prediction_class = predict(df2, weights, bias)

    # Model evaluation
    Acc, PrecisionB, PrecisionM, RecallB, RecallM, F1_scoreB, F1_scoreM, cm = Classification_report(df2, prediction_class)

    print(m)
    print(l)
    print(t)
    # Precision, recall and F1 are reported as the mean of the two classes.
    print(f"\nThe Accuracy of the Logistic Regression model is: {Acc}%")
    print(f"\nThe Precision of the Logistic Regression model is: {(PrecisionB + PrecisionM)/2}%")
    print(f"\nThe Recall of the Logistic Regression model is: {(RecallB + RecallM)/2}%")
    print(f"\nThe F1 score of the Logistic Regression model is: {(F1_scoreB + F1_scoreM)/2}%")
    print(f"\nThe Confusion matrix of the Logistic Regression model is: \n{cm}")
In [37]:
import time

# Starting time. perf_counter() is a monotonic, high-resolution clock meant
# for measuring durations, so system clock adjustments cannot skew the result.
start = time.perf_counter()

main()

# Stopping time
end = time.perf_counter()

The data consists of 699 different diagnosis in which 458 are Benign and 241 are Malignant
The data is divided into 524 training data samples and 175 testing data samples
The testing data consists of 175 different diagnosis in which 137 are Benign and 38 are Malignant

The Accuracy of the Logistic Regression model is: 84.57142857142857%

The Precision of the Logistic Regression model is: 82.10188933873144%

The Recall of the Logistic Regression model is: 68.27698809066462%

The F1 score of the Logistic Regression model is: 71.708280941261%

The Confusion matrix of the Logistic Regression model is: 
[[133  23]
 [  4  15]]


In [38]:
# Report the elapsed time between the two timestamps taken around main()
print(f"The runtime of the Logistic Regression model is {end - start} sec")

The runtime of the Logistic Regression model is 0.6323411464691162 sec
