In [35]:
# importing the required packages
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.fftpack import fft
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 
from sklearn.cluster import DBSCAN
from sklearn import metrics
import statistics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate,train_test_split,StratifiedKFold,KFold
import pickle

def DataPreProcessing(CGM_Data):
    """Drop sparse rows/columns of ``CGM_Data`` and fill the remaining gaps.

    The frame is modified **in place** and the same object is returned.

    Steps, in order:
      1. Drop every row that holds fewer non-NaN values than a quarter of the
         column count.
      2. Drop every column that holds fewer non-NaN values than a quarter of
         the *original* row count (the count taken before step 1).
      3. Linearly interpolate down each column, filling in the forward
         direction only.
      4. Back-fill along each row to cover whatever is still missing.

    Parameters
    ----------
    CGM_Data : pandas.DataFrame
        Frame to clean; mutated by this call.

    Returns
    -------
    pandas.DataFrame
        The same ``CGM_Data`` object, after cleaning.
    """
    no_of_rows = CGM_Data.shape[0]
    no_of_columns = CGM_Data.shape[1]

    # Integer ceiling of n/4. Non-NaN counts are integers, so
    # `count >= n/4` and `count >= ceil(n/4)` select exactly the same labels,
    # and `thresh` receives the int it is documented to take.
    min_values_per_row = -(-no_of_columns // 4)
    min_values_per_column = -(-no_of_rows // 4)

    # Only `thresh` is passed: current pandas raises TypeError when `how` and
    # `thresh` are both supplied (older releases let `thresh` win, so the
    # rows/columns kept are unchanged).
    CGM_Data.dropna(axis=0, thresh=min_values_per_row, inplace=True)
    CGM_Data.dropna(axis=1, thresh=min_values_per_column, inplace=True)
    CGM_Data.interpolate(axis=0, method='linear', limit_direction='forward', inplace=True)
    CGM_Data.bfill(axis=1, inplace=True)
    return CGM_Data

def ExtractFeatures(CGM_Data):
    
    Feature_Matrix = pd.DataFrame() 

    # Feature 1 - Fast Fourier Transform
    FFT = pd.DataFrame()
    def calculate_fft_vals(series):
        FFT_abs = abs(fft(series))
        FFT_abs.sort()
        return np.flip(FFT_abs)[0:8]

    FFT['FFT_vals'] = CGM_Data.apply(lambda series: calculate_fft_vals(series), axis=1)
    FFT_Vals= pd.DataFrame(FFT.FFT_vals.tolist(), columns=['FFT1', 'FFT2', 'FFT3', 'FFT4', 'FFT5', 'FFT6', 'FFT7','FFT8'],index=FFT.FFT_vals.index)
    Feature_Matrix = pd.concat([Feature_Matrix,FFT_Vals],axis=1)
    
    
    # Feature 2 - Max of CGM Velocity 
    
    Velocity_Data = pd.DataFrame()
    win_size=6
    total_vals=CGM_Data.shape[1]-win_size

    for index in range(0, total_vals):
        dv = CGM_Data.iloc[:, index + win_size] - CGM_Data.iloc[:, index]
        Velocity_Data['vel'+str(index)] = dv

    Feature_Matrix['Max CGM Vel']=Velocity_Data.max(axis = 1,skipna=True)
    
        
    # Feature 3 - polyfit   
    def calculate_polyfit(series,degree=3):
        row_arr = np.array(series.index)
        return np.polyfit(row_arr, series, degree)
    
    Polyfit_vals = CGM_Data.apply(calculate_polyfit,axis=1,result_type='expand')
    Feature_Matrix = pd.concat([Feature_Matrix,Polyfit_vals],axis=1)
     
 
    return Feature_Matrix


In [36]:
def train_Kmeans_model(X_principal, ground_truth=None):
    """Cluster with K-means, relabel clusters by majority vote, save a KNN model.

    The function
      1. fits a 6-cluster K-means on ``X_principal`` and prints the raw
         labels and the SSE (``inertia_``);
      2. replaces every cluster id with the most frequent ground-truth label
         among that cluster's rows;
      3. prints the accuracy of the relabelled assignment against the ground
         truth;
      4. fits a 20-neighbour KNN classifier on ``X_principal`` with the
         relabelled targets and pickles it to ``KMeans.model`` in the current
         working directory.

    K-means is created without a ``random_state``, so labels (and therefore
    the printed numbers) can differ from run to run.

    Parameters
    ----------
    X_principal : array-like of shape (n_samples, n_features)
        Data to cluster.
    ground_truth : sequence, optional
        True label of every row of ``X_principal``. When omitted, the
        module-level ``Ground_Truth_list`` is used, which keeps the original
        one-argument call working.

    Raises
    ------
    ValueError
        If the number of ground-truth labels differs from the number of rows.
    """
    if ground_truth is None:
        ground_truth = Ground_Truth_list
    truth = list(ground_truth)
    if len(truth) != len(X_principal):
        raise ValueError(
            "ground truth has %d labels but X_principal has %d rows"
            % (len(truth), len(X_principal))
        )

    print("----------------- K-means---------------")

    clusterNum = 6
    k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
    k_means.fit(X_principal)
    kmeans_labels = k_means.labels_
    print("K-means labels", kmeans_labels)

    sse = k_means.inertia_
    print("-----------------------------------------")
    print("SSE of K_means clustering is : ", sse)
    print("-----------------------------------------")

    def most_frequent(List):
        return max(set(List), key = List.count)

    # Relabel each cluster with the majority ground-truth label of its rows.
    # Iterating over the labels that actually occur (instead of a hard-coded
    # range) means an empty cluster can never reach max() with no candidates.
    Updated_kmeans_labels = kmeans_labels.copy()
    for cluster_id in np.unique(kmeans_labels):
        member_rows = np.flatnonzero(kmeans_labels == cluster_id)
        votes = [truth[row] for row in member_rows]
        Updated_kmeans_labels[member_rows] = most_frequent(votes)

    print("Updated K-means labels", Updated_kmeans_labels)

    accuracy = accuracy_score(truth, Updated_kmeans_labels)*100
    print("-----------------------------------------")
    print("Accuracy of K-Means clustering ",accuracy)
    print("-----------------------------------------")

    # Single fit of the classifier that is actually persisted.
    knn = KNeighborsClassifier(n_neighbors=20)
    knn.fit(X_principal,Updated_kmeans_labels)
    # `with` closes the file even if pickling fails.
    with open("KMeans.model","wb") as handler:
        pickle.dump(knn,handler)
    
    

In [37]:
def train_DBSCAN_model(X_principal, ground_truth=None):
    """Cluster with DBSCAN, relabel clusters by majority vote, save a KNN model.

    The function
      1. fits ``DBSCAN(eps=0.157, min_samples=7)`` on ``X_principal`` and
         prints the raw labels;
      2. replaces every label DBSCAN produced (including ``-1``, if present)
         with the most frequent ground-truth label among the rows carrying it;
      3. prints the accuracy of the relabelled assignment against the ground
         truth;
      4. fits a 20-neighbour KNN classifier on ``X_principal`` with the
         relabelled targets and pickles it to ``DBSCAN.model`` in the current
         working directory.

    Parameters
    ----------
    X_principal : array-like of shape (n_samples, n_features)
        Data to cluster.
    ground_truth : sequence, optional
        True label of every row of ``X_principal``. When omitted, the
        module-level ``Ground_Truth_list`` is used, which keeps the original
        one-argument call working.

    Raises
    ------
    ValueError
        If the number of ground-truth labels differs from the number of rows.
    """
    if ground_truth is None:
        ground_truth = Ground_Truth_list
    truth = list(ground_truth)
    if len(truth) != len(X_principal):
        raise ValueError(
            "ground truth has %d labels but X_principal has %d rows"
            % (len(truth), len(X_principal))
        )

    print("----------------- DBSCAN---------------")
    db = DBSCAN(eps=0.157, min_samples=7)
    db.fit(X_principal)
    db_labels = db.labels_
    print("DBSCAN labels", db_labels)

    def most_frequent(List):
        return max(set(List), key = List.count)

    # DBSCAN decides the number of clusters itself, so walk the labels that
    # are really present. The previous fixed scan over -1..5 raised
    # "max() arg is an empty sequence" whenever one of those labels was
    # absent and left any cluster id >= 6 un-mapped.
    Updated_dbscan_labels = db_labels.copy()
    for cluster_id in np.unique(db_labels):
        member_rows = np.flatnonzero(db_labels == cluster_id)
        votes = [truth[row] for row in member_rows]
        Updated_dbscan_labels[member_rows] = most_frequent(votes)

    print("Updated DBSCAN labels", Updated_dbscan_labels)

    accuracy = accuracy_score(truth, Updated_dbscan_labels)*100
    print("-----------------------------------------")
    print("Accuracy of DBSCAN clustering ",accuracy)
    print("-----------------------------------------")

    # Single fit of the classifier that is actually persisted.
    knn = KNeighborsClassifier(n_neighbors=20)
    knn.fit(X_principal,Updated_dbscan_labels)
    # `with` closes the file even if pickling fails.
    with open("DBSCAN.model","wb") as handler:
        pickle.dump(knn,handler)
    
                


In [38]:
if __name__=="__main__":
    # End-to-end training script: load the five meal CGM files and their meal
    # amounts, derive ground-truth bins, extract features, reduce them to two
    # principal components and train/save the K-means and DBSCAN based models.

    # Read Meal Data (every file is read with the same 31 integer column names)
    column_names = [i for i in range(0,31)]
    meal_data_1 = pd.read_csv("mealData1.csv",names=column_names)
    meal_data_2 = pd.read_csv("mealData2.csv",names=column_names)
    meal_data_3 = pd.read_csv("mealData3.csv",names=column_names)
    meal_data_4 = pd.read_csv("mealData4.csv",names=column_names)
    meal_data_5 = pd.read_csv("mealData5.csv",names=column_names)
    CGM_Meal_Data = pd.concat([meal_data_1,meal_data_2,meal_data_3,meal_data_4,meal_data_5],axis=0,ignore_index=True)

    # Data Pre-Processing (drops sparse rows/columns, fills gaps)
    CGM_Meal_Data = DataPreProcessing(CGM_Meal_Data)

    # Read meal amounts; nrows limits each file to the number of rows in the
    # matching meal-data file so both concatenations share the same row index.
    meal_amt_1 = pd.read_csv("mealAmountData1.csv",names= ["Meal Amt"],nrows=len(meal_data_1))
    meal_amt_2 = pd.read_csv("mealAmountData2.csv",names= ["Meal Amt"],nrows=len(meal_data_2))
    meal_amt_3 = pd.read_csv("mealAmountData3.csv",names= ["Meal Amt"],nrows=len(meal_data_3))
    meal_amt_4 = pd.read_csv("mealAmountData4.csv",names= ["Meal Amt"],nrows=len(meal_data_4))
    meal_amt_5 = pd.read_csv("mealAmountData5.csv",names= ["Meal Amt"],nrows=len(meal_data_5))
    CGM_Meal_Amt = pd.concat([meal_amt_1,meal_amt_2,meal_amt_3,meal_amt_4,meal_amt_5],axis=0, ignore_index=True)

    # Assign ground truth labels to meal amounts
    Meal_Amt_label = pd.DataFrame()

    def label_val(x):
        """Map a meal amount to its bin: 0 -> 1, (0, 20] -> 2, (20, 40] -> 3, ..."""
        if(x==0):
            return int(x)+1
        elif(x%20==0):
            return int(x/20)+1
        else:
            return int(x/20)+2

    Meal_Amt_label['BINS'] = CGM_Meal_Amt.apply(lambda row: label_val(row['Meal Amt']), axis=1)

    # Join Meal Data and Meal Amount. The join is on the row index, so rows
    # removed by pre-processing simply do not pick up a label.
    Meal_Data_and_Amt = CGM_Meal_Data.join(Meal_Amt_label)
    Meal_Data_and_Amt = Meal_Data_and_Amt.reset_index(drop=True)

    # Categorize all the ground truth labels into clusters
    Ground_Truth_list = Meal_Data_and_Amt["BINS"].tolist()

    # label_val() starts numbering at 1, so amounts up to 100 land in bins
    # 1..6. Scanning 0..6 keeps Ground_Truth_Clusters[b] == rows of bin b
    # (index 0 stays empty) while no longer dropping bin 6, which the previous
    # range(0, 6) never collected.
    Ground_Truth_Clusters = []
    for bin in range (0, 7):
        new = []
        for i in range (0,len(Ground_Truth_list)):
            if(Ground_Truth_list[i]==bin):
                new.append(i)
        Ground_Truth_Clusters.append(new)
    print("Ground Truth labels", Ground_Truth_list)

    #Extract Features
    Feature_Matrix = ExtractFeatures(CGM_Meal_Data)
    Feature_Matrix = Feature_Matrix.reset_index(drop=True)

    # Standardize feature matrix. The plain array is passed because the frame
    # carries both string and integer column labels, which recent scikit-learn
    # releases reject with a TypeError during feature-name validation.
    Feature_Matrix_std = StandardScaler().fit_transform(Feature_Matrix.to_numpy())

    # Rescale every sample (row) to unit norm. This does not make the data
    # Gaussian; it only removes per-row magnitude before PCA.
    Feature_Matrix_norm = normalize(Feature_Matrix_std)
    Feature_Matrix_norm = pd.DataFrame(Feature_Matrix_norm)

    # Do PCA: keep the first two principal components
    pca=PCA(n_components=2)
    X_principal = pca.fit_transform(Feature_Matrix_norm)
    X_principal = pd.DataFrame(X_principal)
    X_principal.columns = ['PCA1','PCA2']

    # Train KMeans model (reads the module-level Ground_Truth_list built above)
    train_Kmeans_model(X_principal)

    # Train DBSCAN model (reads the module-level Ground_Truth_list built above)
    train_DBSCAN_model(X_principal)

    

Ground Truth labels [4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 6, 3, 5, 4, 6, 3, 5, 4, 6, 3, 5, 4, 6, 1, 5, 2, 6, 1, 5, 2, 6, 1, 5, 2, 6, 1, 5, 3, 6, 4, 3, 2, 1, 3, 2, 1, 1, 5, 1, 2, 3, 2, 3, 2, 1, 5, 1, 2, 3, 2, 3, 2, 1, 5, 1, 2, 3, 2, 3, 2, 1, 5, 1, 3, 2, 3, 2, 1, 5, 1, 2, 3, 2, 3, 2, 1, 5, 1, 1, 1, 1, 1, 4, 1, 3, 4, 1, 4, 1, 3, 4, 1, 4, 1, 3, 4, 1, 1, 3, 4, 1, 4, 1, 3, 4, 1, 1, 1, 4, 4, 2, 1, 1, 1, 1, 1, 4, 4, 2, 1, 1, 1, 1, 4, 4, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 3, 4, 2, 2, 2, 4, 2, 3, 1, 3, 4, 2, 2, 2, 4, 2, 3, 1, 3, 4, 2, 2, 2, 4, 2, 3, 1, 3, 4, 2, 2, 2, 4, 2, 3, 1, 3, 4, 2, 2, 2, 1, 4, 4, 1, 4, 4, 4, 1, 4, 4, 4, 1, 4, 4, 4, 1, 4, 4, 2, 2, 1, 4, 4, 2, 1, 4, 4, 2, 2, 1, 4, 4, 2, 2, 1, 4, 4, 2, 2, 1, 4, 3, 1, 1, 4, 1, 4, 3, 1, 1]
----------------- K-means---------------
K-means labels [4 0 1 0 0 5 1 3 4 5 3 1 2 0 5 3 3 1 5 2 3 3 4 0 5 1 2 2 0 3 1 3 3 4 1 5 3
 3 3 0 5 4 5 1 4 1 1 4 0 3 3 2 4 2 3 2 0 0 4 1 1 3 3 1 1 5 3 0 3 3 3 1 2 5
 0 3 2 0 2 2 2 4 0 4 5 4 2 0 0 4 4 4 2