In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC, SVC, OneClassSVM

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

In [3]:
diabetes_data = pd.read_csv('data/diabetes/diabetic_data.csv')

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [6]:
diabetes_data.shape

(101766, 50)

In [7]:
diabetes_target = diabetes_data['readmitted']
diabetes_attributes = diabetes_data.drop('readmitted', axis = 1)

In [8]:
diabetes_attributes = pd.get_dummies(diabetes_attributes)

In [9]:
diabetes_attributes.shape

(101766, 2472)

In [10]:
diabetes_attributes_scaled = MinMaxScaler().fit_transform(diabetes_attributes)

In [11]:
all_data, _, all_targets,_ = train_test_split(diabetes_attributes_scaled, diabetes_target, train_size = 0.1)

In [12]:
len(all_data)

10176

In [13]:
attributes_train, attributes_test, targets_train, targets_test = train_test_split(all_data, all_targets, test_size = 0.2, stratify = all_targets)

In [14]:
attributes_train.shape

(8140, 2472)

In [15]:
svm = LinearSVC(C = 10, max_iter = 1e3)

In [16]:
svm.fit(attributes_train, targets_train)



LinearSVC(C=10, max_iter=1000.0)

In [17]:
svm.coef_

array([[-0.14537085,  0.16782875, -0.10371594, ..., -0.02733191,
        -0.02276941, -0.06151356],
       [-0.47481036,  0.54290679,  0.06912833, ...,  0.00697716,
        -0.04554885,  0.04499636],
       [ 0.57841854, -0.54910028, -0.05182861, ..., -0.07869577,
         0.0902217 , -0.12181672]])

In [18]:
gaussian_svm = SVC(kernel = 'rbf', C = 10)

In [19]:
gaussian_svm.fit(attributes_train, targets_train)

SVC(C=10)

In [20]:
print(classification_report(targets_train, svm.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.77      0.16      0.26       909
         >30       0.65      0.48      0.55      2823
          NO       0.67      0.89      0.76      4408

    accuracy                           0.66      8140
   macro avg       0.69      0.51      0.52      8140
weighted avg       0.67      0.66      0.63      8140



In [21]:
print(classification_report(targets_train, gaussian_svm.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.99      0.42      0.59       909
         >30       0.85      0.78      0.81      2823
          NO       0.82      0.96      0.88      4408

    accuracy                           0.84      8140
   macro avg       0.89      0.72      0.76      8140
weighted avg       0.85      0.84      0.83      8140



In [22]:
print(classification_report(targets_test, svm.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.12      0.03      0.04       228
         >30       0.44      0.32      0.37       706
          NO       0.60      0.80      0.69      1102

    accuracy                           0.55      2036
   macro avg       0.39      0.38      0.37      2036
weighted avg       0.49      0.55      0.51      2036



In [23]:
print(classification_report(targets_test, gaussian_svm.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.16      0.02      0.04       228
         >30       0.47      0.39      0.42       706
          NO       0.61      0.78      0.69      1102

    accuracy                           0.56      2036
   macro avg       0.41      0.40      0.38      2036
weighted avg       0.51      0.56      0.52      2036



In [24]:
knn = KNeighborsClassifier(n_neighbors = 5)

In [25]:
knn.fit(attributes_train, targets_train)

KNeighborsClassifier()

In [26]:
knn.predict(attributes_train)

array(['NO', 'NO', 'NO', ..., 'NO', '<30', 'NO'], dtype=object)

In [27]:
print(classification_report(targets_train, knn.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.45      0.31      0.37       909
         >30       0.58      0.62      0.60      2823
          NO       0.74      0.75      0.74      4408

    accuracy                           0.66      8140
   macro avg       0.59      0.56      0.57      8140
weighted avg       0.65      0.66      0.65      8140



In [28]:
print(classification_report(targets_test, knn.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.16      0.11      0.13       228
         >30       0.37      0.40      0.39       706
          NO       0.56      0.57      0.57      1102

    accuracy                           0.46      2036
   macro avg       0.37      0.36      0.36      2036
weighted avg       0.45      0.46      0.46      2036



In [29]:
anomaly_detector = OneClassSVM(nu = 0.02)

In [30]:
anomaly_detector.fit(attributes_train)

OneClassSVM(nu=0.02)

In [31]:
predictions = anomaly_detector.predict(attributes_train)

In [32]:
predictions.sum() / len(predictions)

0.9601965601965602