In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw CSV; cells holding "?" are parsed as missing values (NaN)
DATA_PATH = "dataset_diabetes/diabetic_data.csv"
medical_data = pd.read_csv(DATA_PATH, na_values=["?"], low_memory=False)
medical_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
# Remove the columns that have a high percentage of missing values
sparse_columns = ["weight", "payer_code", "medical_specialty"]
medical_data.drop(columns=sparse_columns, inplace=True)

In [4]:
# Compare the number of distinct encounter_id values with the number of rows,
# report both, then drop the column
encounter_ids = medical_data["encounter_id"]
n_unique_ids = len(encounter_ids.unique())
n_rows = len(encounter_ids)
report = "Unique values in column encounter_id: {}\nNumber of values in column encounter_id: {}"
print(report.format(n_unique_ids, n_rows))
print("Hence being dropped")
medical_data.drop(columns=["encounter_id"], inplace=True)

Unique values in column encounter_id: 101766
Number of values in column encounter_id: 101766
Hence being dropped


In [34]:
# Splitting the dataset into features and target

# .copy() gives X its own data: later cells assign into X, and a bare slice of
# medical_data would raise SettingWithCopyWarning or write through to the source frame.
X = medical_data.iloc[:,:-1].copy() # Features (every column except the last)
Y = medical_data.iloc[:,-1] # Target variable (last column)

# Binary target: 0 when the value is "NO", 1 for every other value
Y=Y.apply(lambda x: 1 if x != "NO" else 0)

In [35]:
# Filling in all the missing values remaining

# Using SimpleImputer to replace each NaN with the most frequent value of its column.
# (This code was commented out while `filled_X` was still used below, so the cell
# raised NameError on a fresh kernel.)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy= "most_frequent")
filled_X= imputer.fit_transform(X)

# Build a new frame from the imputed array instead of writing into X in place;
# infer_objects() restores numeric dtypes for columns whose values are all numeric.
X = pd.DataFrame(filled_X, columns=X.columns, index=X.index).infer_objects()


In [36]:
# Converting categorical variables into numerical variables 

from sklearn.preprocessing import OrdinalEncoder

def gender_encoding (a):
    """
    Translate a gender label into a signed integer code.

    Input: a - gender value
    Output: 1 for "Male", -1 for "Female", 0 for any other value
    """
    # Compare against each known label in turn; anything unmatched maps to 0
    for label, code in (("Male", 1), ("Female", -1)):
        if a == label:
            return code
    return 0

def medicine_status_encoding(b):
    """
    Translate a medicine status label into an integer code.

    Input: b - medicine status
    Output: 2 for "Up", 1 for "Down", 0 for "Steady", -1 for "No";
            None for any other value
    """
    status_codes = (("Up", 2), ("Down", 1), ("Steady", 0), ("No", -1))
    for status, code in status_codes:
        if b == status:
            return code
    # Unrecognised statuses are not encoded
    return None
    
def cat_to_num(X, to_ordinal=(), medicines=()):
    '''
    Input:  X - original dataset (a pandas DataFrame)
            to_ordinal - names of the columns that need to be ordinal encoded
            medicines - names of the columns containing medicine status

    Output: a new DataFrame with the selected columns encoded; the frame
            passed in is left unmodified

    Function to change the categorical variables present in
    the dataset to encoded numericals:
      * columns listed in to_ordinal are encoded with OrdinalEncoder,
      * the "gender" column is encoded with gender_encoding,
      * columns listed in medicines are encoded with medicine_status_encoding,
      * every other column is kept as it is.
    '''
    ordinal_names = set(to_ordinal)
    medicine_names = set(medicines)

    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is never written to.
    encoded = X.copy()
    encoder = OrdinalEncoder(dtype=np.int64)

    # Looping through the columns and encoding the categorical ones
    for name in encoded.columns:

        # Ordinal encoding for ordinal columns
        if name in ordinal_names:
            # fit_transform returns an (n_rows, 1) array; flatten it and replace
            # the whole column so it takes the encoder's integer dtype instead of
            # being written element-wise into the existing column.
            encoded[name] = encoder.fit_transform(encoded[[name]]).ravel()

        # Gender encoding the gender column
        elif name == "gender":
            encoded[name] = encoded[name].apply(gender_encoding)

        # Encoding the medicine status
        elif name in medicine_names:
            encoded[name] = encoded[name].apply(medicine_status_encoding)

    return encoded



# Columns holding a medicine status: positions 20 up to (not including) the last two
medicine_columns = X.columns[20:-2]

# Columns to be ordinal encoded
ordinal_columns = [
    "patient_nbr",
    "race",
    "age",
    "diag_1",
    "diag_2",
    "diag_3",
    "max_glu_serum",
    "A1Cresult",
    "change",
    "diabetesMed",
]

# Replace every categorical column with its numeric encoding
X = cat_to_num(X, to_ordinal=ordinal_columns, medicines=medicine_columns)


### Semi-supervised learning

In [42]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column with StandardScaler
normalizer = StandardScaler()

# Build a new float frame rather than assigning through X.iloc[:,:]: that form
# writes the float results into the existing (integer) columns in place, which
# recent pandas versions flag as an incompatible-dtype assignment.
X = pd.DataFrame(normalizer.fit_transform(X), columns=X.columns, index=X.index)

In [43]:
from sklearn.model_selection import train_test_split
def SSL_load_data(data,target,percentage,random_state=123):
    """
    Split a dataset into a labelled part and an unlabelled part.

    Input: data - the features for the semi-supervised learning task
           target - the target variable
           percentage - the share of the data that should not be labelled
                        (passed to train_test_split as test_size)
           random_state - seed for the shuffle; defaults to 123, the value
                          that was previously hard-coded
    Output:
           data_labelled - data for the targets that are labelled
           data_unlabelled - data for the targets that are unlabelled
           target_labelled - targets that are labelled
           target_unlabelled - targets that are unlabelled
    """
    # The "test" part of the split plays the role of the unlabelled samples
    data_labelled,data_unlabelled,target_labelled,target_unlabelled = train_test_split(
        data,
        target,
        test_size = percentage,
        random_state = random_state,
    )

    return  data_labelled,data_unlabelled,target_labelled,target_unlabelled

In [44]:
from sklearn.semi_supervised import LabelPropagation,LabelSpreading

In [47]:
# 10 percent of the samples are left unlabelled
X_labelled,X_unlabelled,Y_labelled,Y_unlabelled = SSL_load_data(X,Y,.1)

# A semi-supervised estimator has to see the unlabelled rows during fit;
# scikit-learn identifies an unlabelled row by the target value -1.
X_ssl = pd.concat([X_labelled, X_unlabelled])
Y_ssl = np.concatenate([np.asarray(Y_labelled), np.full(len(Y_unlabelled), -1)])

lp_model_1 = LabelPropagation(kernel="knn",n_jobs=-1)
ls_model_1 = LabelSpreading(kernel="knn",n_jobs=-1)

lp_model_1.fit(X_ssl,Y_ssl)

ls_model_1.fit(X_ssl,Y_ssl)

LabelPropagation(kernel='knn', n_jobs=-1)

In [55]:
# Predict once per model and derive the accuracy from those predictions;
# calling score() as well would repeat the same prediction pass.
true_labels = np.asarray(Y_unlabelled)
lp_1_pred = lp_model_1.predict(X_unlabelled)
lp_1_accuracy = float(np.mean(lp_1_pred == true_labels))
ls_1_pred = ls_model_1.predict(X_unlabelled)
ls_1_accuracy = float(np.mean(ls_1_pred == true_labels))

In [49]:
# 20 percent of the samples are left unlabelled
X_labelled,X_unlabelled,Y_labelled,Y_unlabelled = SSL_load_data(X,Y,.2)

# A semi-supervised estimator has to see the unlabelled rows during fit;
# scikit-learn identifies an unlabelled row by the target value -1.
X_ssl = pd.concat([X_labelled, X_unlabelled])
Y_ssl = np.concatenate([np.asarray(Y_labelled), np.full(len(Y_unlabelled), -1)])

lp_model_2 = LabelPropagation(kernel="knn",n_jobs=-1)
ls_model_2 = LabelSpreading(kernel="knn",n_jobs=-1)

lp_model_2.fit(X_ssl,Y_ssl)
ls_model_2.fit(X_ssl,Y_ssl)

LabelSpreading(kernel='knn', n_jobs=-1)

In [56]:
# Predict once per model and derive the accuracy from those predictions;
# calling score() as well would repeat the same prediction pass.
true_labels = np.asarray(Y_unlabelled)
lp_2_pred = lp_model_2.predict(X_unlabelled)
lp_2_accuracy = float(np.mean(lp_2_pred == true_labels))
ls_2_pred = ls_model_2.predict(X_unlabelled)
ls_2_accuracy = float(np.mean(ls_2_pred == true_labels))

In [50]:
# 50 percent of the samples are left unlabelled
X_labelled,X_unlabelled,Y_labelled,Y_unlabelled = SSL_load_data(X,Y,.5)

# A semi-supervised estimator has to see the unlabelled rows during fit;
# scikit-learn identifies an unlabelled row by the target value -1.
X_ssl = pd.concat([X_labelled, X_unlabelled])
Y_ssl = np.concatenate([np.asarray(Y_labelled), np.full(len(Y_unlabelled), -1)])

lp_model_3 = LabelPropagation(kernel="knn",n_jobs=-1)
ls_model_3 = LabelSpreading(kernel="knn",n_jobs=-1)

lp_model_3.fit(X_ssl,Y_ssl)
ls_model_3.fit(X_ssl,Y_ssl)

LabelSpreading(kernel='knn', n_jobs=-1)

In [57]:
# Predict once per model and derive the accuracy from those predictions;
# calling score() as well would repeat the same prediction pass.
true_labels = np.asarray(Y_unlabelled)
lp_3_pred = lp_model_3.predict(X_unlabelled)
lp_3_accuracy = float(np.mean(lp_3_pred == true_labels))
ls_3_pred = ls_model_3.predict(X_unlabelled)
ls_3_accuracy = float(np.mean(ls_3_pred == true_labels))

In [51]:
# 90 percent of the samples are left unlabelled
X_labelled,X_unlabelled,Y_labelled,Y_unlabelled = SSL_load_data(X,Y,.9)

# A semi-supervised estimator has to see the unlabelled rows during fit;
# scikit-learn identifies an unlabelled row by the target value -1.
X_ssl = pd.concat([X_labelled, X_unlabelled])
Y_ssl = np.concatenate([np.asarray(Y_labelled), np.full(len(Y_unlabelled), -1)])

lp_model_4 = LabelPropagation(kernel="knn",n_jobs=-1)
ls_model_4 = LabelSpreading(kernel="knn",n_jobs=-1)

lp_model_4.fit(X_ssl,Y_ssl)
ls_model_4.fit(X_ssl,Y_ssl)

LabelSpreading(kernel='knn', n_jobs=-1)

In [None]:
# Predict once per model and derive the accuracy from those predictions;
# calling score() as well would repeat the same prediction pass.
true_labels = np.asarray(Y_unlabelled)
lp_4_pred = lp_model_4.predict(X_unlabelled)
lp_4_accuracy = float(np.mean(lp_4_pred == true_labels))
ls_4_pred = ls_model_4.predict(X_unlabelled)
ls_4_accuracy = float(np.mean(ls_4_pred == true_labels))

In [52]:
# 95 percent of the samples are left unlabelled
X_labelled,X_unlabelled,Y_labelled,Y_unlabelled = SSL_load_data(X,Y,.95)

# A semi-supervised estimator has to see the unlabelled rows during fit;
# scikit-learn identifies an unlabelled row by the target value -1.
X_ssl = pd.concat([X_labelled, X_unlabelled])
Y_ssl = np.concatenate([np.asarray(Y_labelled), np.full(len(Y_unlabelled), -1)])

lp_model_5 = LabelPropagation(kernel="knn",n_jobs=-1)
ls_model_5 = LabelSpreading(kernel="knn",n_jobs=-1)

lp_model_5.fit(X_ssl,Y_ssl)
ls_model_5.fit(X_ssl,Y_ssl)

LabelSpreading(kernel='knn', n_jobs=-1)

In [None]:
# Predict once per model and derive the accuracy from those predictions;
# calling score() as well would repeat the same prediction pass.
true_labels = np.asarray(Y_unlabelled)
lp_5_pred = lp_model_5.predict(X_unlabelled)
lp_5_accuracy = float(np.mean(lp_5_pred == true_labels))
ls_5_pred = ls_model_5.predict(X_unlabelled)
ls_5_accuracy = float(np.mean(ls_5_pred == true_labels))