In [1]:
#Importing libraries
import pandas as pd
import numpy as np
from scipy.io import arff
pd.set_option("display.max_columns",None)
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### Loading Data

In [2]:
#Importing dataset
data = arff.loadarff(r'ressources/dataset/chronic_kidney_disease.arff')
df = pd.DataFrame(data[0])


df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,b'1.020',b'1',b'0',b'?',b'normal',b'notpresent',b'notpresent',121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,b'yes',b'yes',b'no',b'good',b'no',b'no',b'ckd'
1,7.0,50.0,b'1.020',b'4',b'0',b'?',b'normal',b'notpresent',b'notpresent',,18.0,0.8,,,11.3,38.0,6000.0,,b'no',b'no',b'no',b'good',b'no',b'no',b'ckd'
2,62.0,80.0,b'1.010',b'2',b'3',b'normal',b'normal',b'notpresent',b'notpresent',423.0,53.0,1.8,,,9.6,31.0,7500.0,,b'no',b'yes',b'no',b'poor',b'no',b'yes',b'ckd'
3,48.0,70.0,b'1.005',b'4',b'0',b'normal',b'abnormal',b'present',b'notpresent',117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,b'yes',b'no',b'no',b'poor',b'yes',b'yes',b'ckd'
4,51.0,80.0,b'1.010',b'2',b'0',b'normal',b'normal',b'notpresent',b'notpresent',106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,b'no',b'no',b'no',b'good',b'no',b'no',b'ckd'


In [3]:
#we must remove the b' in present in the sg,al,su,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,class

In [4]:
#run only once
columns_to_decode=['sg','al','su','rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane','class']
def fun_decode(columns,df):
    for col in columns :
        df[col]=df[col].str.decode('utf-8')
fun_decode(columns_to_decode,df)

In [5]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1,0,?,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4,0,?,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2,3,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2,0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [6]:
df=df.replace('?', np.nan)

In [7]:
df["sg"] = df["sg"].astype(float)
df["al"] = df["al"].astype(float)
df["su"] = df["su"].astype(float)


In [8]:
df.dtypes

age      float64
bp       float64
sg       float64
al       float64
su       float64
rbc       object
pc        object
pcc       object
ba        object
bgr      float64
bu       float64
sc       float64
sod      float64
pot      float64
hemo     float64
pcv      float64
wbcc     float64
rbcc     float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
class     object
dtype: object

In [9]:
#replacing the missing values with the mean for the numerical features
numeric_cols = df.columns[df.dtypes=="float64"].tolist()
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

In [10]:
object_cols=df.columns[df.dtypes=="object"].tolist()

In [11]:
#encoding categorical features
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
for col in object_cols:
    df[col]= label_encoder.fit_transform(df[col])

In [12]:
#replacing the missing values with the KNN imputer for the categorical encoded features
imputer = KNNImputer(n_neighbors=3)
imputer.fit(df)

In [13]:
imputed_data = imputer.transform(df)

In [14]:
df = pd.DataFrame(imputed_data, index=df.index, columns=df.columns)

In [15]:
df.isnull().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

## Dimensionality Reduction

In [16]:
#input split
X=df.drop(columns=["class"],axis=1)
y=df["class"]

#### Data Normalization

In [17]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [18]:
X

array([[-0.20546364,  0.26233836,  0.48335471, ..., -0.50798193,
        -0.48451878, -0.4203414 ],
       [-2.62380991, -1.96658024,  0.48335471, ..., -0.50798193,
        -0.48451878, -0.4203414 ],
       [ 0.62031314,  0.26233836, -1.38139079, ...,  1.91097964,
        -0.48451878,  2.29153859],
       ...,
       [-2.32888964,  0.26233836,  0.48335471, ..., -0.50798193,
        -0.48451878, -0.4203414 ],
       [-2.03396936, -1.22360737,  1.41572747, ..., -0.50798193,
        -0.48451878, -0.4203414 ],
       [ 0.38437691,  0.26233836,  1.41572747, ..., -0.50798193,
        -0.48451878, -0.4203414 ]])

#### Dimensionality reduction with pca

In [19]:
pca = PCA(svd_solver="full")
pc = pca.fit_transform(X)

#####  explained_variance_ratio_ représente le pourcentage de variance expliquée par chacune des composantes principales

In [20]:
pca.explained_variance_ratio_*100

array([26.92066282,  8.38287913,  7.74020506,  5.84336492,  5.32259613,
        4.79450681,  4.42233907,  4.03316101,  3.68833243,  3.59092459,
        3.14772835,  3.12774567,  2.65433309,  2.25847306,  2.20685549,
        1.75882974,  1.73251939,  1.68438103,  1.47417901,  1.37176176,
        1.21512035,  1.15964635,  0.90887833,  0.56057641])

#### We clearly see  :
####  PC1 represents 28,58 % of the total variance (information).
#### the other components represnet  71,42 % of the total variance.
#### So  what's the number of components to retain ?
#### With the Kaiser rule , it is recommanded to retain the pca with eigen values superior to 1.
#### We can also use the albow rule
#### However, as this is a classification problem, the number of principal components to retain should be based on the performance of the classification.

# Classification

##### We define a pipeline Gridsearch to determine the best combinisation of  PCA n_components and the hyperparamètres of the classifier .

## Logistic Regression

In [21]:
pca = PCA()
logistic_regression = LogisticRegression(max_iter=10000)
pipe = Pipeline(steps=[("pca", pca), ("logistic", logistic_regression)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    "logistic__C": np.logspace(-3, 3, 7)
}
cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=2)
cv.fit(X, y)
print(cv.best_params_)

{'logistic__C': 0.1, 'pca__n_components': 17}


In [22]:
pca = PCA(n_components=18)
pc = pca.fit_transform(X)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [24]:
logistic_regression = LogisticRegression(C = 10.0)
logistic_regression.fit(X_train, y_train)

In [25]:
y_pred_lr = logistic_regression.predict(X_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98        74
         1.0       0.96      0.98      0.97        46

    accuracy                           0.97       120
   macro avg       0.97      0.98      0.97       120
weighted avg       0.98      0.97      0.98       120



## KNN

In [26]:
pca = PCA()
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[("pca", pca), ("knn", knn)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    'knn__leaf_size' : list(range(1,30)),
    'knn__n_neighbors' : list(range(1,25)),
    'knn__p' : [1,2]
}
cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=-1)
cv.fit(X, y)
print(cv.best_params_)

{'knn__leaf_size': 1, 'knn__n_neighbors': 1, 'knn__p': 2, 'pca__n_components': 3}


In [27]:
pca = PCA(n_components=5)
pc = pca.fit_transform(X)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [29]:
knn = KNeighborsClassifier(leaf_size=1, p=1, n_neighbors=6)
knn.fit(X_train, y_train)

In [30]:
y_pred_knn = knn.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98        74
         1.0       0.98      0.96      0.97        46

    accuracy                           0.97       120
   macro avg       0.98      0.97      0.97       120
weighted avg       0.98      0.97      0.97       120



## SVM

In [31]:
pca = PCA()
svm = SVC()
pipe = Pipeline(steps=[("pca", pca), ("svm", svm)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    'svm__C': [1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'svm__kernel': ['linear', 'poly', 'sigmoid', 'rbf']
}

cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=2)
cv.fit(X, y)
print(cv.best_params_)

{'pca__n_components': 23, 'svm__C': 10, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}


In [32]:
pca = PCA(n_components=19)
pc = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [33]:
svm = SVC(C=10, gamma=0.01, kernel='sigmoid')
svm.fit(X_train, y_train)

In [34]:
y_pred_svm = svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99        74
         1.0       0.98      0.98      0.98        46

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



## Decision Tree

In [35]:
pca = PCA()
decision_tree = DecisionTreeClassifier()
pipe = Pipeline(steps=[("pca", pca), ("dct", decision_tree)])

hyperparameters = {
    "pca__n_components": list(range(1,25)),
    'dct__max_features': ['sqrt', 'log2'],
    'dct__ccp_alpha': [0.1, 0.01, 0.001],
    'dct__max_depth' : [5, 6, 7, 8, 9],
    'dct__criterion' :['gini', 'entropy']
}

cv = GridSearchCV(pipe, hyperparameters, cv=10, n_jobs=-1)
cv.fit(X, y)
print(cv.best_params_)

{'dct__ccp_alpha': 0.01, 'dct__criterion': 'entropy', 'dct__max_depth': 7, 'dct__max_features': 'sqrt', 'pca__n_components': 6}


In [36]:
pca = PCA(n_components=2)
pc = pca.fit_transform(X)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(pc, y, test_size=0.3, random_state=1024)

In [38]:
decision_tree = DecisionTreeClassifier(ccp_alpha=0.001, max_depth=5, max_features='sqrt', criterion="entropy")
decision_tree.fit(X_train, y_train)

In [39]:
y_pred_dt = decision_tree.predict(X_test)
print(classification_report(y_pred_dt, y_test))

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97        74
         1.0       0.96      0.96      0.96        46

    accuracy                           0.97       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.97      0.97      0.97       120



#### We can clearly see that dimensionality reduction using PCA gives really good results for different classification algorithms achieving 100% precision in logistic regression.