In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

##Data

In [2]:
df=pd.read_csv("cancer_clean.csv")
df.head()

Unnamed: 0,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_point1,symmetry1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


##Implementation

In [17]:
def compute_parameters(X,y):
  X = np.array(X)
  y = np.array(y)

  n = len(y)       # number of exemple
  n_1 = sum(y)     # number of exemple with y = 0
  n_0 = n - n_1    # numbers of exemple with y = 0
  d = X.shape[1]   # numbers of features

  phi = n_1/n           #proba of y == 1
  mu_0 = (1/n_0)*(sum(X[y == 0]))    # mean of the parameters of y == 0
  mu_1 = (1/n_1)*(sum(X[y == 1]))    # mean of the parameters of y == 1
  sigma = np.zeros((d,d))

  for i in range(n):
    if y[i] == 1:
      temp = X[i] - mu_1
    else:
      temp = X[i] - mu_0
    sigma += np.outer(temp,temp)  #covariance matrix

  return phi, mu_0, mu_1, sigma

def classify(X, phi, mu_0, mu_1, sigma):
  X = np.array(X)
  y_preds = np.zeros(len(X))

  d = X.shape[1]

  temp_01 = 1/( ((2*np.pi)**(d/2)) * np.linalg.det(sigma) )
  temp_11 = temp_01

  for i in range(len(X)):
    temp_02 = np.exp((-1/2) * (X[i] - mu_0) @ (np.linalg.inv(sigma) @ ((X[i] - mu_0).T)))
    temp_12 = np.exp((-1/2) * (X[i] - mu_1) @ (np.linalg.inv(sigma) @ ((X[i] - mu_1).T)))

    p_y_0_given_x = temp_01 * temp_02
    p_y_1_given_x = temp_11 * temp_12

    if p_y_1_given_x > p_y_0_given_x:
      y_preds[i] = 1
    else:
      y_preds[i] = 0

  return y_preds


def scale_features(X):    #normalize features to avoid the gradient to diverge
    X = np.array(X)
    means = X.mean(axis=0)
    stds = X.std(axis=0)
    return (X - means) / stds

def split_data(df,target_name): # take the whole df and target's name and return the split data in an adapted form for the gradient_descent function
  df=copy.deepcopy(df)

  train, val = np.split(df.sample(frac=1),[int(0.8*len(df))]) #split at 0.6 of the df lenght and 0.8 of the df lenght

  df_1=train.drop([target_name],axis=1)
  X_train = df_1.values.tolist()         #list of list of the features for train
  y_train = train[target_name].tolist()   #list of target for train

  df_2=val.drop([target_name],axis=1)
  X_val = df_2.values.tolist()          #list of list of the features for val
  y_val = val[target_name].tolist()    #list of target for val

  #df_3=test.drop([target_name],axis=1)
  #X_test = df_3.values.tolist()          #list of list of the features for test
  #y_test = test[target_name].tolist()    #list of target for test

  return X_train,y_train,X_val,y_val


def GDA(df,target_name):      #return the coeff of the function
  X_train_1,y_train_1,X_val_1,y_val_1 = split_data(df,target_name)
  X_train,y_train,X_val,y_val = scale_features(X_train_1),y_train_1,scale_features(X_val_1),y_val_1

  phi, mu_0, mu_1, sigma = compute_parameters(X_train,y_train)

  X_val = np.array(X_val)
  y_val = np.array(y_val)
  y_preds = classify(X_val, phi, mu_0, mu_1, sigma)

  p = 0
  for i in range(len(y_val)):
    if y_val[i] == y_preds[i]:
      p=p+1
  accuracy=p/(len(y_val))

  return phi,mu_0,mu_1,sigma,accuracy






##Result

In [37]:
phi,mu_0,mu_1,sigma,accuracy = GDA(df,"Diagnosis")
print(accuracy)


0.9473684210526315


  return bound(*args, **kwds)


##Implementation with sklearn library

In [38]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [49]:
X_train_1,y_train_1,X_val_1,y_val_1 = split_data(df,"Diagnosis")
X_train,y_train,X_val,y_val = scale_features(X_train_1),y_train_1,scale_features(X_val_1),y_val_1

model = LinearDiscriminantAnalysis()
model = model.fit(X_train,y_train)

  return bound(*args, **kwds)


In [50]:
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))

Accuracy: 0.9649122807017544
