# Load Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek
from sklearn.model_selection import KFold
from sklearn.semi_supervised import LabelPropagation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelSpreading
import statistics
from sklearn.metrics import roc_curve,auc,precision_recall_curve,confusion_matrix,accuracy_score,classification_report

# Load Data

In [2]:
diabetes=pd.read_csv(r'C:\Users\ADMIN\Desktop\Diabetes_copy.csv')

diabetes = pd.DataFrame(diabetes)

print(diabetes.shape)

(69972, 22)


# Preprocessing

In [3]:
label_encoder = LabelEncoder()
diabetes['readmitted']= label_encoder.fit_transform(diabetes['readmitted']) 
print(diabetes['readmitted'].shape)
print(diabetes['readmitted'].head(5))

(69972,)
0    1
1    2
2    2
3    2
4    1
Name: readmitted, dtype: int64


In [4]:
#Split dataset in features and target variable
feature_cols = ['race','gender','age','admission_type_id','discharge_disposition_id','admission_source_id','time_in_hospital','medical_specialty','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient', 'diag_1','number_diagnoses','max_glu_serum','insulin','A1Cresult','change','diabetesMed']
X = diabetes[feature_cols] # Features
y = diabetes.readmitted # Target variable

In [5]:
X = pd.get_dummies(X,drop_first=True)
print(X.shape)
X = X
Y = y

(69972, 53)


In [6]:
# user-defined unlabelling function

import random

def maskfunc(true_target, percentage):
    if (percentage >=0 and percentage <= 100):
       
        n_total_samples = len(true_target)
        unlabel = (100-percentage)
        n_labeled_points = int((unlabel * len(y))/100)
        indices = np.arange(n_total_samples)
        unlabeled_set = indices[n_labeled_points:]
        
        labels = true_target.copy()
        labels[unlabeled_set] = -1
        
        return labels,unlabeled_set
    else: 
        return 'Percentage not in range of (0,100)'
    

In [7]:
def class_report(unlabeled_set_value, percen):
    unlabeled_set = unlabeled_set_value
    predicted_labels = label_prop_model.transduction_[unlabeled_set]
    true_labels = Y[unlabeled_set]
    
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_prop_model.classes_)
    print('Classifcation report \n')
    print(classification_report(true_labels, predicted_labels))

 

    print('Confusion matrix \n',cm)

In [8]:
def class_report_spread(unlabeled_set_value, percen):
    unlabeled_set = unlabeled_set_value
    predicted_labels = label_prop_model.transduction_[unlabeled_set]
    true_labels = Y[unlabeled_set]
    
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_prop_model.classes_)
    print('Classifcation report \n')
    print(classification_report(true_labels, predicted_labels))

 

    print('Confusion matrix \n',cm)

# Label Propagation

In [9]:
warnings.filterwarnings('ignore')
label_prop_model = LabelPropagation(kernel='knn', n_neighbors=8, max_iter=100, gamma=20,n_jobs=None)

In [10]:
# 100% labelled data
label_prop_model.fit(X, Y)
score_zero = label_prop_model.score(X, Y)
print('Accuracy with 100 % labelled data\t',round(score_zero,4))

Accuracy with 100 % labelled data	 0.6453


In [11]:
# 90% labelled data
ul,unlabeled_set = maskfunc(Y, 10)
label_prop_model.fit(X, ul)
score_ten = label_prop_model.score(X, ul)
class_report(unlabeled_set,10)

Classifcation report 

              precision    recall  f1-score   support

           0       0.11      0.05      0.07       384
           1       0.22      0.34      0.27      1323
           2       0.78      0.70      0.74      5291

    accuracy                           0.60      6998
   macro avg       0.37      0.36      0.36      6998
weighted avg       0.63      0.60      0.61      6998

Confusion matrix 
 [[  18  126  240]
 [  44  451  828]
 [ 103 1471 3717]]


In [12]:
# 80% labelled data
ul,unlabeled_set = maskfunc(Y, 20)
label_prop_model.fit(X, ul)
score_twenty = label_prop_model.score(X, ul)
class_report(unlabeled_set,20)

Classifcation report 

              precision    recall  f1-score   support

           0       0.08      0.18      0.11       935
           1       0.25      0.28      0.26      3187
           2       0.72      0.60      0.66      9873

    accuracy                           0.50     13995
   macro avg       0.35      0.35      0.34     13995
weighted avg       0.57      0.50      0.53     13995

Confusion matrix 
 [[ 167  257  511]
 [ 520  906 1761]
 [1439 2515 5919]]


In [13]:
# 50% labelled data
ul,unlabeled_set = maskfunc(Y, 50)
label_prop_model.fit(X, ul)
score_fifty = label_prop_model.score(X, ul)
class_report(unlabeled_set,50)

Classifcation report 

              precision    recall  f1-score   support

           0       0.08      0.97      0.15      2799
           1       0.30      0.00      0.01     10669
           2       0.70      0.03      0.06     21518

    accuracy                           0.10     34986
   macro avg       0.36      0.34      0.07     34986
weighted avg       0.53      0.10      0.05     34986

Confusion matrix 
 [[ 2719    15    65]
 [10390    53   226]
 [20721   108   689]]


In [14]:
# 10% labelled data
ul,unlabeled_set = maskfunc(Y, 90)
label_prop_model.fit(X, ul)
score_ninety = label_prop_model.score(X, ul)
class_report(unlabeled_set,90)

Classifcation report 

              precision    recall  f1-score   support

           0       0.09      1.00      0.16      5600
           1       0.00      0.00      0.00     19598
           2       0.84      0.00      0.00     37777

    accuracy                           0.09     62975
   macro avg       0.31      0.33      0.06     62975
weighted avg       0.51      0.09      0.02     62975

Confusion matrix 
 [[ 5599     0     1]
 [19591     0     7]
 [37734     1    42]]


In [15]:
# 5% labelled data
ul,unlabeled_set = maskfunc(Y, 95)
label_prop_model.fit(X, ul)
score_ninetyfive = label_prop_model.score(X, ul)
class_report(unlabeled_set,95)

Classifcation report 

              precision    recall  f1-score   support

           0       0.09      1.00      0.16      5937
           1       0.00      0.00      0.00     20855
           2       0.71      0.00      0.00     39682

    accuracy                           0.09     66474
   macro avg       0.27      0.33      0.05     66474
weighted avg       0.43      0.09      0.01     66474

Confusion matrix 
 [[ 5937     0     0]
 [20853     0     2]
 [39677     0     5]]


# Label Spreading

In [16]:
label_spread_model = LabelSpreading(kernel='knn',gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=0.001, n_jobs=None)

In [17]:
label_spread_model.fit(X, Y)
score_hundred = label_spread_model.score(X, Y)
print('Accuracy with 100 % labelled data\t',round(score_ten,4))

Accuracy with 100 % labelled data	 0.5794


In [18]:
#90% Labelled Data
ul,unlabeled_set = maskfunc(Y, 10)
label_spread_model.fit(X, ul)
score_ninety = label_spread_model.score(X, ul)
class_report_spread(unlabeled_set,10)

Classifcation report 

              precision    recall  f1-score   support

           0       0.05      1.00      0.10       384
           1       0.00      0.00      0.00      1323
           2       0.00      0.00      0.00      5291

    accuracy                           0.05      6998
   macro avg       0.02      0.33      0.03      6998
weighted avg       0.00      0.05      0.01      6998

Confusion matrix 
 [[ 384    0    0]
 [1323    0    0]
 [5291    0    0]]


In [19]:
#80% Labelled Data
ul,unlabeled_set = maskfunc(Y, 20)
label_spread_model.fit(X, ul)
score_eighty = label_spread_model.score(X, ul)
class_report_spread(unlabeled_set,20)

Classifcation report 

              precision    recall  f1-score   support

           0       0.07      1.00      0.13       935
           1       0.00      0.00      0.00      3187
           2       0.00      0.00      0.00      9873

    accuracy                           0.07     13995
   macro avg       0.02      0.33      0.04     13995
weighted avg       0.00      0.07      0.01     13995

Confusion matrix 
 [[ 935    0    0]
 [3187    0    0]
 [9873    0    0]]


In [20]:
#50% Labelled Data
ul,unlabeled_set = maskfunc(Y, 50)
label_spread_model.fit(X, ul)
score_fifty = label_spread_model.score(X, ul)
class_report_spread(unlabeled_set,50)

Classifcation report 

              precision    recall  f1-score   support

           0       0.08      1.00      0.15      2799
           1       0.00      0.00      0.00     10669
           2       1.00      0.00      0.00     21518

    accuracy                           0.08     34986
   macro avg       0.36      0.33      0.05     34986
weighted avg       0.62      0.08      0.01     34986

Confusion matrix 
 [[ 2799     0     0]
 [10669     0     0]
 [21517     0     1]]


In [21]:
#10% Labelled Data
ul,unlabeled_set = maskfunc(Y, 90)
label_spread_model.fit(X, ul)
score_ten = label_spread_model.score(X, ul)
class_report_spread(unlabeled_set,90)

Classifcation report 

              precision    recall  f1-score   support

           0       0.09      1.00      0.16      5600
           1       0.00      0.00      0.00     19598
           2       1.00      0.00      0.00     37777

    accuracy                           0.09     62975
   macro avg       0.36      0.33      0.05     62975
weighted avg       0.61      0.09      0.01     62975

Confusion matrix 
 [[ 5600     0     0]
 [19598     0     0]
 [37773     0     4]]


In [22]:
#5% Labelled Data
ul,unlabeled_set = maskfunc(Y, 95)
label_spread_model.fit(X, ul)
score_five = label_spread_model.score(X, ul)
class_report_spread(unlabeled_set,95)

Classifcation report 

              precision    recall  f1-score   support

           0       0.09      1.00      0.16      5937
           1       0.00      0.00      0.00     20855
           2       0.71      0.00      0.00     39682

    accuracy                           0.09     66474
   macro avg       0.27      0.33      0.05     66474
weighted avg       0.43      0.09      0.01     66474

Confusion matrix 
 [[ 5937     0     0]
 [20853     0     2]
 [39677     0     5]]


# KNN

In [23]:
knn_model = KNeighborsClassifier(n_neighbors=7)

In [24]:
#100% Labelled Data
knn_model.fit(X, Y)
score_0 = knn_model.score(X, Y)
print('Accuracy with 100 % labelled data\t',round(score_0,2))

Accuracy with 100 % labelled data	 0.65


In [25]:
#90% Labelled Data
ul,unlabeled_set = maskfunc(Y, 10)
knn_model.fit(X, ul)
score_10 = knn_model.score(X, ul)
print('Accuracy with 90 % labelled data\t',round(score_10,4))

Accuracy with 90 % labelled data	 0.5946


In [26]:
#80% Labelled Data
ul,unlabeled_set = maskfunc(Y, 20)
knn_model.fit(X, ul)
score_20 = knn_model.score(X, ul)
print('Accuracy with 80 % labelled data\t',round(score_20,4))

Accuracy with 80 % labelled data	 0.5637


In [27]:
#50% Labelled Data
ul,unlabeled_set = maskfunc(Y, 50)
knn_model.fit(X, ul)
score_50 = knn_model.score(X, ul)
print('Accuracy with 50 % labelled data\t',round(score_50,4))

Accuracy with 50 % labelled data	 0.6341


In [28]:
#10% Labelled Data
ul,unlabeled_set = maskfunc(Y, 90)
knn_model.fit(X, ul)
score_90 = knn_model.score(X, ul)
print('Accuracy with 10 % labelled data\t',round(score_90,4))

Accuracy with 10 % labelled data	 0.9013


In [29]:
#5% Labelled Data
ul,unlabeled_set = maskfunc(Y, 95)
knn_model.fit(X, ul)
score_95 = knn_model.score(X, ul)
print('Accuracy with 5 % labelled data\t',round(score_95,4))

Accuracy with 5 % labelled data	 0.9501
