In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


In [2]:
df = pd.read_csv('dataset.csv')
cols = list(df.columns)
np.random.shuffle(cols)
df = df[cols]

In [3]:
df.head()

Unnamed: 0,texture_mean,diagnosis,concave points_worst,radius_se,perimeter_mean,concavity_mean,concavity_se,area_se,smoothness_se,radius_worst,...,area_mean,fractal_dimension_mean,area_worst,symmetry_se,id,symmetry_worst,concavity_worst,texture_se,symmetry_mean,compactness_worst
0,10.38,M,0.2654,1.095,122.8,0.3001,0.05373,153.4,0.006399,25.38,...,1001.0,0.07871,2019.0,0.03003,842302,0.4601,0.7119,0.9053,0.2419,0.6656
1,17.77,M,0.186,0.5435,132.9,0.0869,0.0186,74.08,0.005225,24.99,...,1326.0,0.05667,1956.0,0.01389,842517,0.275,0.2416,0.7339,0.1812,0.1866
2,21.25,M,0.243,0.7456,130.0,0.1974,0.03832,94.03,0.00615,23.57,...,1203.0,0.05999,1709.0,0.0225,84300903,0.3613,0.4504,0.7869,0.2069,0.4245
3,20.38,M,0.2575,0.4956,77.58,0.2414,0.05661,27.23,0.00911,14.91,...,386.1,0.09744,567.7,0.05963,84348301,0.6638,0.6869,1.156,0.2597,0.8663
4,14.34,M,0.1625,0.7572,135.1,0.198,0.05688,94.44,0.01149,22.54,...,1297.0,0.05883,1575.0,0.01756,84358402,0.2364,0.4,0.7813,0.1809,0.205


# Feature Engineering Task 1

In [4]:
for i in df.columns[df.isnull().any(axis=0)]:
    mean = df[i].mean()
    print(f'Filling\t{df[i].isna().sum()} null value(s) in {i} with mean value -->\t{mean}')
    df[i].fillna(mean , inplace = True)

Filling	1 null value(s) in perimeter_mean with mean value -->	92.02346830985917
Filling	1 null value(s) in concavity_mean with mean value -->	0.08892480757042255
Filling	1 null value(s) in radius_mean with mean value -->	14.116125000000011
Filling	1 null value(s) in compactness_se with mean value -->	0.02546582922535212
Filling	2 null value(s) in area_worst with mean value -->	881.4024691358021
Filling	1 null value(s) in concavity_worst with mean value -->	0.27245536443661955


# SPLITTING INTO TRAIN-TEST SPLIT

In [5]:
df = df.sample(frac = 1)
train_size=int(0.67*len(df))
X = df.drop(columns = ['diagnosis', 'id']).values
#X = np.c_[np.ones(len(X)), X]
y = df['diagnosis'].values

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
#if you do not want to shuffle, simply do not run this again

# NORMALIZING -- FEATURE ENGINEERING TASK 2

In [6]:
X_train = (X_train - X_train.mean(axis=0))/(X_train.std(axis=0))
X_test = (X_test - X_test.mean(axis=0))/(X_test.std(axis=0)) 

# FISCHER's LINEAR DISCRIMINANT

In [7]:
class LinearDisc:

    def _init_(self):
        self.pt = 0

    def solve(self , m1 , m2 , std1 , std2 , p1 , p2):
        a = 1/(std1**2) - 1/(std2**2)
        b = (-2*m1)/(std1**2) + (2*m2)/(std2**2)
        c = (m1**2)/(std1**2) + (m2**2)/(2*std2**2) - (2)*np.log(std1*p1/std2*p2)
        
        arr = np.roots([a,b,c])

        if(arr[0] > 0):
            print('The discriminant point is ',arr[0])
            return arr[0]
        else:
            print('The discriminant point is ',arr[1])
            return arr[1]

    def p1(x , mew , sigma):
        exp = ((-1) * (x - mew)**2)/(2 * (sigma)**2)
        val = ((np.exp ** exp)/(sigma * (2 * np.pi)**(0.5))[0])
        return val

    def find_point(self , X_train_malignant , X_train_benign , y_train):

        prior_malignant = (np.sum(y_train == 'M'))/((np.sum(y_train == 'M')) + (np.sum(y_train == 'B')))
        prior_benign = (np.sum(y_train == 'B'))/((np.sum(y_train == 'M')) + (np.sum(y_train == 'B')))

        self.pt = self.solve(np.mean(X_train_malignant) , np.mean(X_train_benign) , np.std(X_train_malignant) , np.std(X_train_benign) , prior_malignant , prior_benign)
    
    def predict(self , x):
        if(x > self.pt):
            return 'M'
        else:
            return 'B'

    def accuracy(self , y_pred , y_actual):
        acc = np.sum(y_actual == y_pred)/len(y_actual)
        return acc
    
    def metricscore(self, y_actual, y_out):
        truepos = 0;
        falsepos = 0;
        trueneg = 0;
        falseneg = 0;
        for i in range(len(y_actual)):
            if(y_actual[i] == y_out[i]):
                if(y_actual[i] == 'M'):
                    trueneg = trueneg + 1;
                else:
                    truepos = truepos + 1;
            else:
                if(y_out[i] == 'B'):
                    falseneg = falseneg + 1;
                else:
                    falsepos = falsepos + 1;
        return truepos, falsepos, trueneg, falseneg
    
    def precision(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        prec = tp/(tp + fp)
        return prec
    
    def recall(self, y_actual, y_out):
        tp, fp, tn, fn = self.metricscore(y_actual, y_out)
        rec = tp/(tp + fn)
        return rec

In [8]:
lda = LDA(n_components=1)

X_train_1 = lda.fit_transform(X_train, y_train)
X_test_1 = lda.transform(X_test)

X_train_malignant = X_train_1[y_train == 'M']
X_train_benign = X_train_1[y_train == 'B']

ld = LinearDisc()

ld.find_point(X_train_malignant , X_train_benign , y_train)

y_predicted_train = np.array([ld.predict(i) for i in X_train_1])
y_predicted_test = np.array([ld.predict(i) for i in X_test_1])

#print('Training accuracy: ', ld.accuracy(y_predicted_train , y_train))
print('Testing accuracy: ', ld.accuracy(y_predicted_test , y_test))
#print('Training precision ',ld.precision(y_predicted_train, y_train))
print('Testing precision ',ld.precision(y_predicted_test, y_test))
#print('Training recall ', ld.recall(y_predicted_train, y_train))
print('Testing recall ',ld.recall(y_predicted_test, y_test))

Testing accuracy:  0.9521276595744681
Testing precision  0.9291338582677166
Testing recall  1.0
