### Data modeling imports

In [24]:
# Natural Language Processing libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Machine learning libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

# PyTorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, vocab

# General-purpose libraries
import os
import random
import re
import string
import time
from datetime import timedelta, datetime
from tqdm import tqdm

### Constants and paths

In [2]:
DATA_PATH = "/Users/sg/dev/dl4hc_proj/data/data_source"

In [3]:
data_ns = pd.read_csv(os.path.join(DATA_PATH, 'data_ns.csv'))
data = pd.read_csv(os.path.join(DATA_PATH, 'data.csv'))
encoder = LabelEncoder()
vectorizer = TfidfVectorizer(max_features=600)

In [6]:
def run_svm(df, encoder, vectorizer):
    for morbidity in df["class"].unique():
        data = df[df['class'] == morbidity]
        X_train, X_test, y_train, y_test = train_test_split(data['data'], data['judgment'], test_size=0.20, shuffle=True)
        Train_Y  = encoder.fit_transform(y_train)
        Test_Y  = encoder.fit_transform(y_test)
        Train_X_Tfidf = vectorizer.fit_transform(X_train)
        Test_X_Tfidf = vectorizer.fit_transform(X_test)

        SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
        SVM.fit(Train_X_Tfidf, y_train)
        predictions_SVM = SVM.predict(Test_X_Tfidf)
        f1_macro = f1_score(y_test, predictions_SVM, average='macro')
        f1_micro = f1_score(y_test, predictions_SVM, average='micro')

        print(morbidity)
        print("f1-macro", f1_macro)
        print("f1-micro", f1_micro)
    
run_svm(data, encoder, vectorizer)

Asthma
f1-macro 0.4564102564102564
f1-micro 0.839622641509434
CHF
f1-macro 0.5376811594202899
f1-micro 0.696551724137931
Depression
f1-macro 0.4427083333333333
f1-micro 0.794392523364486
Diabetes
f1-macro 0.5860119047619048
f1-micro 0.6355140186915887
Gallstones
f1-macro 0.45137157107231923
f1-micro 0.8227272727272728
Gout
f1-macro 0.4674698795180723
f1-micro 0.8778280542986425
Hypercholesterolemia
f1-macro 0.3014509174456299
f1-micro 0.40414507772020725
Hypertriglyceridemia
f1-macro 0.4807692307692307
f1-micro 0.9259259259259259
OA
f1-macro 0.43699731903485256
f1-micro 0.7761904761904762
OSA
f1-macro 0.45112781954887216
f1-micro 0.821917808219178
Obesity
f1-macro 0.49447197865040027
f1-micro 0.5096153846153846
CAD
f1-macro 0.4771856495994427
f1-micro 0.6220095693779905
Hypertension
f1-macro 0.4482758620689655
f1-micro 0.8125
PVD
f1-macro 0.4535809018567639
f1-micro 0.8300970873786409
Venous Insufficiency
f1-macro 0.47814207650273227
f1-micro 0.9162303664921466
GERD
f1-macro 0.43768996

### SVM performs poorly on some classes. The data is scewed. Training SVM with parameters from the paper and ExtraTreesClassifier.

In [28]:
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['judgment'], test_size=0.20, shuffle=True)
Test_X_Tfidf = vectorizer.fit_transform(X_test)
Train_X_Tfidf = vectorizer.fit_transform(X_train)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)
predictions_SVM = SVM.predict(Test_X_Tfidf)

print("accuracy_score", accuracy_score(predictions_SVM, y_test))
f1_macro = f1_score(y_test, predictions_SVM, average='macro')
f1_micro = f1_score(y_test, predictions_SVM, average='micro')
print("f1-macro", f1_macro)
print("f1-micro", f1_micro)

accuracy_score 0.6802450229709035
f1-macro 0.40484870579657306
f1-micro 0.6802450229709035


### Trying out different hyperparameters

In [29]:
SVM = svm.SVC(verbose=True)
feature_selection_model = ExtraTreesClassifier(n_estimators=100)
transformer = SelectFromModel(feature_selection_model)
Train_X_Tfidf_selected = transformer.fit_transform(Train_X_Tfidf, y_train)
param_grid = {'kernel': ['linear', 'poly', 'rbf'], 'gamma': [0.1, 1, 10]}
grid_search = GridSearchCV(SVM, param_grid, cv=5, verbose = 2)
grid_search.fit(Train_X_Tfidf_selected, y_train)
print("Best Hyperparameters: ", grid_search.best_params_)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[LibSVM]....
*...
*
optimization finished, #iter = 7630
obj = -6829.997904, rho = -0.999859
nSV = 7174, nBSV = 6429
Total nSV = 7174
[CV] END ...........................gamma=0.1, kernel=linear; total time=  45.3s
[LibSVM]....
*...
*
optimization finished, #iter = 7393
obj = -6829.997120, rho = -0.999673
nSV = 7166, nBSV = 6395
Total nSV = 7166
[CV] END ...........................gamma=0.1, kernel=linear; total time=  45.4s
[LibSVM]....
*...
*
optimization finished, #iter = 7678
obj = -6827.997642, rho = -0.999843
nSV = 7185, nBSV = 6428
Total nSV = 7185
[CV] END ...........................gamma=0.1, kernel=linear; total time=  45.4s
[LibSVM]....
*...
*
optimization finished, #iter = 7724
obj = -6827.996331, rho = -0.999627
nSV = 7174, nBSV = 6422
Total nSV = 7174
[CV] END ...........................gamma=0.1, kernel=linear; total time=  46.1s
[LibSVM]....
*....
*
optimization finished, #iter = 8515
obj = -6827.996861, rho = -

[CV] END ................................gamma=1, kernel=rbf; total time=  48.0s
[LibSVM]....
*...
*
optimization finished, #iter = 7630
obj = -6829.997904, rho = -0.999859
nSV = 7174, nBSV = 6429
Total nSV = 7174
[CV] END ............................gamma=10, kernel=linear; total time=  45.4s
[LibSVM]....
*...
*
optimization finished, #iter = 7393
obj = -6829.997120, rho = -0.999673
nSV = 7166, nBSV = 6395
Total nSV = 7166
[CV] END ............................gamma=10, kernel=linear; total time=  45.5s
[LibSVM]....
*...
*
optimization finished, #iter = 7678
obj = -6827.997642, rho = -0.999843
nSV = 7185, nBSV = 6428
Total nSV = 7185
[CV] END ............................gamma=10, kernel=linear; total time=  45.4s
[LibSVM]....
*...
*
optimization finished, #iter = 7724
obj = -6827.996331, rho = -0.999627
nSV = 7174, nBSV = 6422
Total nSV = 7174
[CV] END ............................gamma=10, kernel=linear; total time=  45.0s
[LibSVM]....
*....
*
optimization finished, #iter = 8515
obj = 

In [30]:
best_svm_model = svm.SVC(kernel=grid_search.best_params_['kernel'], gamma=grid_search.best_params_['gamma'])
best_svm_model.fit(Train_X_Tfidf_selected, y_train)
print("accuracy_score", accuracy_score(predictions_SVM, y_test))
f1_macro = f1_score(y_test, predictions_SVM, average='macro')
f1_micro = f1_score(y_test, predictions_SVM, average='micro')
print("f1-macro", f1_macro)
print("f1-micro", f1_micro)

accuracy_score 0.6802450229709035
f1-macro 0.40484870579657306
f1-micro 0.6802450229709035
