<a href="https://colab.research.google.com/github/Saumye13/Disambiguates-Medical-Abbreviations/blob/main/medical_disambig_data_processing_models_and_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocess the data

In [None]:
import shutil
import string
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import ast
from sklearn import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, classification_report
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
# Read the three pre-split CSV files (train / validation / test), in that order.
train, valid, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_500.csv', 'valid_500.csv', 'test_500.csv')
)

In [None]:
def createFeature(df):
    """Return, for every row, the token of ``TEXT`` found at index ``LOCATION``.

    ``TEXT`` is split on single spaces; the result is a plain list with one
    token per row, in row order.
    """
    picked_tokens = []
    for text, position in zip(df['TEXT'], df['LOCATION']):
        tokens = text.split(' ')
        picked_tokens.append(tokens[position])
    return picked_tokens

In [None]:
# Store the token picked out by createFeature as column 'ABV' on every split.
for split_df in (train, valid, test):
    split_df['ABV'] = createFeature(split_df)

In [None]:
# Count training rows per (abbreviation, label) pair, most frequent pair first.
grouped = (
    train
    .groupby(['ABV', 'LABEL'], as_index=False, sort=False)
    .count()
    .sort_values('TEXT', ascending=False)
)

In [None]:
# Abbreviations of the 20 most frequent (abbreviation, label) pairs.
topAbv = grouped['ABV'].head(20)

In [None]:
# Restrict every split to rows whose abbreviation is in topAbv.
train, valid, test = (
    split_df[split_df['ABV'].isin(topAbv)]
    for split_df in (train, valid, test)
)

In [None]:
def removePunctuation(df):
    """Strip every ``string.punctuation`` character from each ``TEXT`` value.

    Parameters
    ----------
    df : DataFrame with a string column ``TEXT``.

    Returns
    -------
    list of str, one cleaned text per row, in row order.
    """
    # Build the translation table once instead of once per row.
    strip_table = str.maketrans('', '', string.punctuation)
    return [text.translate(strip_table) for text in df['TEXT']]

In [None]:
def createTokens(df):
    """Split each ``TEXT`` value on single spaces.

    Returns a Series (same index as ``df``) whose values are token lists.
    """
    def _split_on_space(text):
        return text.split(' ')

    return df['TEXT'].apply(_split_on_space)

In [None]:
def dropCols(df):
    """Return a copy of ``df`` without ABSTRACT_ID, LOCATION and TEXT."""
    unwanted = ['ABSTRACT_ID', 'LOCATION', 'TEXT']
    return df.drop(unwanted, axis=1)

In [None]:
def tolower(df):
    """Return the ``TEXT`` values lower-cased, as a list in row order."""
    lowered = []
    for text in df['TEXT']:
        lowered.append(text.lower())
    return lowered

In [None]:
def preProcessData(df):
    """Lower-case, de-punctuate and tokenise ``TEXT``, then drop raw columns.

    The input frame is left untouched: all work happens on a copy. The callers
    in this notebook pass filtered slices of the raw splits, so assigning
    columns on them directly would mutate shared data and raise pandas'
    SettingWithCopyWarning.

    Parameters
    ----------
    df : DataFrame with at least ``ABSTRACT_ID``, ``LOCATION`` and ``TEXT``.

    Returns
    -------
    DataFrame with a new ``TOKEN`` column (list of tokens per row) and without
    ``ABSTRACT_ID``, ``LOCATION`` and ``TEXT``.
    """
    df = df.copy()
    df['TEXT'] = tolower(df)
    df['TEXT'] = removePunctuation(df)
    df['TOKEN'] = createTokens(df)
    return dropCols(df)

In [None]:
# Run the same preprocessing pipeline over all three splits.
train, valid, test = map(preProcessData, (train, valid, test))

In [None]:
# Show the first three preprocessed training rows.
train.head(3)

Unnamed: 0,LABEL,ABV,TOKEN
12,tolerance test,ATT,"[the, antiobesity, and, antidiabetic, effects,..."
36,core biopsy,CB,"[the, wirelocalized, extirpation, is, the, gol..."
46,human promyelocytic,HL-60,"[apoptosis, has, been, investigated, in, nb, a..."


### Let's keep only the relevant records in the validation and test sets.

In [None]:
    # NOTE(review): every line of this cell carries a leading indent; presumably
    # the notebook front end strips it — confirm before exporting to a .py script.
    # Keep only validation/test rows whose abbreviation occurs in the training data.
    abbrev = list(train['ABV'].unique())
    valid = valid[valid['ABV'].isin(abbrev)]
    test = test[test['ABV'].isin(abbrev)]
    # Likewise keep only rows whose label occurs in the training data.
    labels = list(train['LABEL'].unique())
    valid = valid[valid['LABEL'].isin(labels)]
    test = test[test['LABEL'].isin(labels)]


In [None]:
def _as_tagged_document(row):
    """Wrap one row as a TaggedDocument: tokens as words, the label as its only tag."""
    return TaggedDocument(words=row['TOKEN'], tags=[row['LABEL']])


train_tagged = train.apply(_as_tagged_document, axis=1)
valid_tagged = valid.apply(_as_tagged_document, axis=1)
test_tagged = test.apply(_as_tagged_document, axis=1)

In [None]:
# Inspect the first five tagged training documents.
train_tagged.values[:5]

array([TaggedDocument(words=['the', 'antiobesity', 'and', 'antidiabetic', 'effects', 'of', 'a', 'highly', 'tps', 'beta', 'ar', 'agonist', 'cl', 'cl', 'beta', 'beta', 'beta', 'were', 'investigated', 'in', 'otsuka', 'longevans', 'tokushima', 'fatty', 'fatty', 'and', 'leto', 'control', 'rats', 'daily', 'injection', 'of', 'cl', 'mgkg', 'sc', 'to', 'these', 'rats', 'weeks', 'old', 'for', 'weeks', 'caused', 'a', 'significant', 'reduction', 'in', 'body', 'weight', 'fatty', 'control', 'associated', 'with', 'a', 'marked', 'decrease', 'in', 'fat', 'pad', 'weight', 'inguinal', 'fatty', 'control', 'retroperitoneal', 'fatty', 'control', 'without', 'affecting', 'food', 'ni', 'the', 'c2', 'of', 'uncoupling', 'protein', 'mrna', 'and', 'protein', 'c2', 'of', 'uncoupling', 'protein', 'ucp', 'as', 'well', 'as', 'guanosine', 'diphosphatebinding', 'a', 'reliable', 'index', 'of', 'thermogenesis', 'in', 'brown', 'at', 'were', 'lower', 'in', 'the', 'fatty', 'than', 'in', 'the', 'control', 'rats', 'however', '

## Apply Doc2vec vectorizer on the Dataset

In [None]:
# Configure a Doc2Vec model producing 100-dimensional vectors.
# NOTE(review): dm=0 presumably selects gensim's PV-DBOW training mode — confirm
# against the installed gensim version.
# NOTE(review): no seed is passed, so results are presumably not reproducible
# between runs — confirm.
vectorize = Doc2Vec(dm=0, vector_size=100, min_count=2, window = 2)
# Build the vocabulary from the tagged training documents only.
vectorize.build_vocab(train_tagged.values)

In [None]:
# Train for 30 epochs over the tagged training documents.
train_docs = train_tagged.values
vectorize.train(train_docs, total_examples=len(train_docs), epochs=30)

In [None]:
def vec_for_learning(model, tagged_docs, epochs=30):
    """Turn tagged documents into (targets, feature vectors) for a classifier.

    Parameters
    ----------
    model : object exposing ``infer_vector(words, epochs=...)``.
    tagged_docs : object whose ``.values`` iterates over documents that have
        ``tags`` and ``words`` attributes.
    epochs : int, default 30
        Number of inference epochs passed to ``model.infer_vector``.

    Returns
    -------
    (targets, regressors) : two tuples of equal length. ``targets`` holds the
    first tag of each document, ``regressors`` the inferred vectors. Both are
    empty tuples when there are no documents.
    """
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, epochs=epochs))
        for doc in tagged_docs.values
    ]
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return (), ()
    targets, regressors = zip(*pairs)
    return targets, regressors

In [None]:
# y_train: first tag of each training document; X_train: the vectors the
# Doc2Vec model infers for them.
y_train, X_train = vec_for_learning(vectorize, train_tagged)

In [None]:
def abbrev_predict(model, x_vec, x_abv, abv_dict):
    """Predict one label per sample, restricted to the sample's abbreviation.

    For every sample the classes are ranked by predicted probability and the
    most probable class whose abbreviation (looked up in ``abv_dict``) equals
    the sample's abbreviation is returned.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    x_vec : feature vectors, one per sample.
    x_abv : iterable with the abbreviation of each sample, aligned with ``x_vec``.
    abv_dict : mapping from class label to its abbreviation.

    Returns
    -------
    list with exactly one entry per sample. If no class is registered for a
    sample's abbreviation, the overall most probable class is used, so the
    output always stays aligned with the input (previously such samples were
    silently dropped, shifting every later prediction).
    """
    pred_probs = model.predict_proba(x_vec)
    classes = list(model.classes_)

    returned_preds = []
    for row_probs, sample_abv in zip(pred_probs, x_abv):
        # sorted() is stable, so equally probable classes keep classes_ order.
        ranked = sorted(zip(classes, row_probs), key=lambda pair: pair[1], reverse=True)
        chosen = None
        for label, _ in ranked:
            # Labels missing from abv_dict simply never match (no KeyError).
            if label in abv_dict and abv_dict[label] == sample_abv:
                chosen = label
                break
        else:
            if ranked:
                chosen = ranked[0][0]
        returned_preds.append(chosen)

    return returned_preds

## Model #1: Logistic Regression Classifier

### Let's perform a grid search to find the best combination of hyperparameters for the Logistic Regression model

In [None]:
# Values of C to try for logistic regression.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100])
logreg_estimator = LogisticRegression(n_jobs=-1)
grid_model = GridSearchCV(logreg_estimator, param_grid)

In [None]:
# Run the grid search over param_grid on the training vectors and labels.
grid_model.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the logistic-regression grid search.
grid_model.best_params_

In [None]:
# Score reported by the grid search for its best parameter combination.
grid_model.best_score_

### Apply the best parameters to Logistic Regression and train the model.

In [None]:
# Use the hyper-parameters the grid search actually selected rather than a
# hard-coded C, so this cell stays correct when the search result changes.
logreg = LogisticRegression(n_jobs=-1, **grid_model.best_params_)
logreg.fit(X_train, y_train)

In [None]:
# Infer vectors for the validation documents.
y_valid, X_valid = vec_for_learning(vectorize, valid_tagged)

# Lookup from label to abbreviation, built from the training pairs.
abvs = train[["LABEL", "ABV"]].drop_duplicates()
abvs_dict = {label: abv for label, abv in zip(abvs.LABEL, abvs.ABV)}

# Predict, restricting each sample to the labels of its own abbreviation.
y_pred_valid = abbrev_predict(logreg, X_valid, valid.ABV, abvs_dict)

In [None]:
# Write predictions next to the targets without mutating `valid`, which later
# cells reuse (in-place column assignment on the filtered frame also triggers
# pandas' SettingWithCopyWarning).
logreg_results = valid.assign(Prediction=y_pred_valid, Target=y_valid)
logreg_results.to_csv("Logistic_Regression_Results.csv")

In [None]:
# Accuracy and weighted F1 of the logistic-regression validation predictions.
valid_accuracy = accuracy_score(y_valid, y_pred_valid)
valid_f1 = f1_score(y_valid, y_pred_valid, average='weighted')
print('Validation Accuracy:', valid_accuracy)
print('Validation F1-Score:', valid_f1)

Validation Accuracy: 1.0
Validation F1-Score: 1.0


In [None]:
y_test, X_test = vec_for_learning(vectorize, test_tagged)
# Decode the test set the same way as the validation set (restricted to the
# labels of each sample's abbreviation) so the two sets of metrics are comparable.
y_pred_test = abbrev_predict(logreg, X_test, test.ABV, abvs_dict)

### Let's calculate some performance metrics on the test predictions.

In [None]:
# Accuracy and weighted F1 of the logistic-regression test predictions.
accuracy, f1_scr = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)
print('Test Accuracy:', accuracy)
print('Test F1-Score:', f1_scr)

## Model #2: SVM

### Let's perform a grid search to find the best combination of hyperparameters for the SVM

In [None]:
# Search space for the RBF-kernel SVM.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)
svm_estimator = SVC()
grid_svm = GridSearchCV(svm_estimator, param_grid)

In [None]:
# Run the SVM grid search on the training vectors and labels.
grid_svm.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the SVM grid search.
grid_svm.best_params_

In [None]:
# Score reported by the SVM grid search for its best parameter combination.
grid_svm.best_score_

### Apply the best parameters to SVC and train the model

In [None]:
# Use the hyper-parameters the grid search selected instead of hard-coded
# values; probability=True is required because abbrev_predict calls predict_proba.
svcModel = SVC(probability=True, **grid_svm.best_params_)
svcModel.fit(X_train, y_train)

In [None]:
# Re-infer the validation vectors (vec_for_learning calls infer_vector again),
# then predict with the SVM, restricting each sample to its abbreviation's labels.
y_valid, X_valid = vec_for_learning(vectorize, valid_tagged)
y_pred_valid = abbrev_predict(svcModel, X_valid, valid.ABV, abvs_dict)

In [None]:
# Accuracy and weighted F1 of the SVM validation predictions.
svm_valid_accuracy = accuracy_score(y_valid, y_pred_valid)
svm_valid_f1 = f1_score(y_valid, y_pred_valid, average='weighted')
print('SVM Validation Accuracy:', svm_valid_accuracy)
print('SVM Validation F1-Score:', svm_valid_f1)

SVM Validation Accuracy: 1.0
SVM Validation F1-Score: 1.0


In [None]:
y_test, X_test = vec_for_learning(vectorize, test_tagged)
# Decode the test set the same way as the validation set (restricted to the
# labels of each sample's abbreviation) so the two sets of metrics are comparable.
y_pred_test = abbrev_predict(svcModel, X_test, test.ABV, abvs_dict)

### Let's calculate some performance metrics on the test predictions.

In [None]:
# Accuracy and weighted F1 of the SVM test predictions.
accuracy, f1_scr = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)
print('SVM Test Accuracy:', accuracy)
print('SVM Test F1-Score:', f1_scr)

SVM Test Accuracy: 0.6270270270270271
SVM Test F1-Score: 0.6343832143832143


In [None]:
# Write the SVM validation predictions next to the targets without mutating
# `valid`, which other cells reuse (in-place column assignment on the filtered
# frame also triggers pandas' SettingWithCopyWarning).
svm_results = valid.assign(Prediction=y_pred_valid, Target=y_valid)
svm_results.to_csv("SVM_results.csv")

## Model #3: XGBoost

### Let's create a parameter grid for the XGBoost model

In [None]:
# Search space for the XGBoost grid search.
param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[5, 6, 7],
    min_child_weight=[3, 5, 8],
)

In [None]:
# Distinct training labels (set order is arbitrary).
unique = list(set(y_train))
# Convert the training vectors to a DataFrame and the labels to a NumPy array.
X_train = pd.DataFrame(X_train)
y_train = np.asarray(y_train)

In [None]:
# `num_classes` is not an XGBoost parameter (the native name is `num_class`,
# and XGBClassifier derives it from the targets during fit), so it is omitted.
xgb_base = XGBClassifier(learning_rate=0.1, gamma=0, objective='multi:softmax', seed=27)
XGBgrid = GridSearchCV(xgb_base, param_grid)

In [None]:
# XGBClassifier expects targets encoded as integers 0..n_classes-1, but y_train
# holds string labels. np.unique's inverse indices give exactly that encoding.
_, y_train_codes = np.unique(y_train, return_inverse=True)
XGBgrid.fit(X_train, y_train_codes)

In [None]:
# Parameter combination selected by the XGBoost grid search.
XGBgrid.best_params_

In [None]:
# Score of the XGBoost grid search (this cell previously showed the SVM
# search's score by mistake).
XGBgrid.best_score_

### Train an XGBoost Classifier

In [None]:
class _StringLabelClassifier:
    """Let a classifier that needs integer targets be used with string labels.

    ``fit`` encodes the labels as 0..n_classes-1 before delegating to the
    wrapped estimator; ``predict`` decodes the estimator's output back to the
    original labels. Fitting XGBClassifier directly on the string labels is
    what raised "'bool' object has no attribute 'all'" in this cell.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y):
        """Fit the wrapped estimator on integer-encoded ``y``; returns self."""
        # np.unique returns the sorted labels and each sample's index into them.
        self.classes_, codes = np.unique(y, return_inverse=True)
        self.estimator.fit(X, codes)
        return self

    def predict(self, X):
        """Predict and map the integer codes back to the original labels."""
        codes = np.asarray(self.estimator.predict(X)).astype(int)
        return self.classes_[codes]

    def predict_proba(self, X):
        """Class probabilities, columns ordered like ``classes_``."""
        return self.estimator.predict_proba(X)


# Downstream cells only call XGBModel.predict, which keeps returning string
# labels comparable with y_valid / y_test.
XGBModel = _StringLabelClassifier(XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=7,
    min_child_weight=4,
    gamma=0,
    objective='multi:softmax',
    seed=27))
XGBModel.fit(X_train, y_train)

AttributeError: 'bool' object has no attribute 'all'

In [None]:
### Apply the above Model on Validation Set
y_valid, X_valid = vec_for_learning(vectorize, valid_tagged)
X_valid = pd.DataFrame(X_valid)
y_valid = np.asarray(y_valid)
y_pred_valid = XGBModel.predict(X_valid)

In [None]:
# Accuracy and weighted F1 of the XGBoost validation predictions.
xgb_valid_accuracy = accuracy_score(y_valid, y_pred_valid)
xgb_valid_f1 = f1_score(y_valid, y_pred_valid, average='weighted')
print('XGBoost Validation Accuracy:', xgb_valid_accuracy)
print('XGBoost Validation F1-Score:', xgb_valid_f1)

In [None]:
### Apply the above Model on Test Set
y_test, X_test = vec_for_learning(vectorize, test_tagged)
X_test = pd.DataFrame(X_test)
y_test = np.asarray(y_test)
y_pred_test = XGBModel.predict(X_test)

### Let's calculate some performance metrics on the test predictions.

In [None]:
# Accuracy and weighted F1 of the XGBoost test predictions.
accuracy, f1_scr = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)
print('XGBoost Test Accuracy:', accuracy)
print('XGBoost Test F1-Score:', f1_scr)