# Symptom Detection by NLP

This is a typical task: identifying whether a specific symptom is mentioned in a given text.

In [1]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

import pandas as pd
import numpy as np

# Load data

In [11]:
# Paths to the two CSV extracts, relative to the notebook's working directory
sample_data_url = 'm_sample.csv'
large_data_url = 'm_large.csv'

# Load each CSV with pandas and forward-fill missing values: every NaN takes
# the value from the nearest preceding row in the same column.
# `DataFrame.ffill()` replaces `fillna(method='ffill', inplace=True)`, whose
# `method` argument is deprecated in recent pandas releases. Returning a new
# frame instead of mutating in place also keeps this cell safe to re-run.
# more details: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ffill.html
# NOTE(review): forward-filling applies to every column, so a row with a
# missing SYMPTOM_TEXT / SYMPTOMS inherits the previous report's value --
# confirm that is intended.
df_sample = pd.read_csv(sample_data_url).ffill()
df_large = pd.read_csv(large_data_url).ffill()

print('* loaded %s sample' % len(df_sample))
print('* loaded %s large' % len(df_large))

# preview the first rows of the sample set
df_sample.head()

* loaded 500 sample
* loaded 59271 large


Unnamed: 0,VAERS_ID,AGE_YRS,SEX,VAX_DATE,SYMPTOM_TEXT,ALLERGIES,VAX_MANU,SYMPTOMS,NUM_SYMS
0,1196106,45.0,M,2021-04-08,Approximately 2.5 minutes post vaccination- pa...,NKDA,JANSSEN,"['Hyperhidrosis', 'Dyskinesia', 'Pallor', 'Foa...",5.0
1,935155,61.0,F,2021-01-09,Lymph node swelling under left arm and aching ...,Nkda,PFIZER\BIONTECH,"['Lymphadenopathy', 'Pain_in_extremity', 'Lymp...",4.0
2,1077407,30.0,F,2021-02-26,Exactly a week later plus a few hours my arm f...,"Amoxicillin, Dairy, and then nickel jewelry al...",MODERNA,"['Injection_site_pruritus', 'Injection_site_sw...",6.0
3,988294,50.0,F,2021-01-18,Moderna COVID-19 Vaccine EUA After receiving ...,None known,MODERNA,"['Pain_in_extremity', 'Injection_site_warmth',...",4.0
4,1096749,68.0,F,2021-03-08,"On MON evening after the shot, at 11:00pm I ex...",Adverse reaction to Levoquin in 2021 - tendoni...,JANSSEN,"['Chills', 'Fatigue', 'Headache', 'Skin_warm',...",13.0


# Model 1: Fatigue Detection

We found the top 10 symptoms: 'Headache', 'Chills', 'Fatigue', 'Pyrexia', 'Pain', 'Nausea', 'Dizziness', 'Injection_site_pain', 'Pain_in_extremity', 'Injection_site_erythema'.

So we can build a simple model to detect whether a given text describes a specific symptom.

## Create the y label

In [35]:
# Build the binary label column `has_fatigue` (1 = the report lists Fatigue,
# 0 = it does not).
# SYMPTOMS is read from the CSV as the *text* of a Python list, e.g.
# "['Chills', 'Fatigue', 'Headache']", so a bare `'Fatigue' in syms` is a
# substring test and would also fire on any longer term that merely contains
# that word. Matching the quoted token "'Fatigue'" only hits the exact term.
# `na=False` labels a row with a missing symptom list as 0 instead of failing.
df_sample['has_fatigue'] = (
    df_sample['SYMPTOMS']
    .str.contains("'Fatigue'", regex=False, na=False)
    .astype(int)
)

# preview the text, the symptom list and the new label side by side
df_sample[['SYMPTOM_TEXT', 'SYMPTOMS', 'has_fatigue']].head(10)

Unnamed: 0,SYMPTOM_TEXT,SYMPTOMS,has_fatigue
0,Approximately 2.5 minutes post vaccination- pa...,"['Hyperhidrosis', 'Dyskinesia', 'Pallor', 'Foa...",0
1,Lymph node swelling under left arm and aching ...,"['Lymphadenopathy', 'Pain_in_extremity', 'Lymp...",0
2,Exactly a week later plus a few hours my arm f...,"['Injection_site_pruritus', 'Injection_site_sw...",0
3,Moderna COVID-19 Vaccine EUA After receiving ...,"['Pain_in_extremity', 'Injection_site_warmth',...",0
4,"On MON evening after the shot, at 11:00pm I ex...","['Chills', 'Fatigue', 'Headache', 'Skin_warm',...",1
5,"Woke up with terrible headache, body aches, ch...","['Chills', 'Headache', 'Hypersomnia', 'Pain', ...",0
6,reported onset of mild headache and bilateral ...,"['Headache', 'Visual_field_defect']",0
7,Started out with a rash behind my knees in ear...,"['Pruritus', 'Rash']",0
8,"I had a very sore arm which is normal, however...","['Fatigue', 'Hypersomnia', 'Migraine', 'Pain_i...",1
9,~36 hours after vaccine had expected reaction ...,"['Injection_site_inflammation', 'Fatigue', 'My...",1


In [43]:
# The only input feature is the free-text symptom description;
# `has_fatigue` is the label.
y = df_sample['has_fatigue']

# The raw text itself can't be used as a feature: it has to be converted
# into a vector of numbers first. TF-IDF is a very popular way to do that.
# more details about this method could be found here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# first, get a vectorizer (English stop words are dropped)
vcer = TfidfVectorizer(stop_words='english')

# then convert: learn the vocabulary and build the TF-IDF matrix
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# IDF weights also see the test rows. Fit on the training rows only if an
# unbiased test estimate matters.
X_sym = vcer.fit_transform(df_sample['SYMPTOM_TEXT'])
print('* X_sym:', X_sym.shape)

# X is just X_sym, no other features are added
X = X_sym

# Split the train/test sets, keeping 20% of the records for testing.
# `random_state` makes the split (and so the reported scores) reproducible
# on a fresh run; `stratify=y` keeps the share of positive labels the same
# in both parts, which matters because the label is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
print('* get train set', X_train.shape)
print('* get test set', X_test.shape)

* X_sym: (500, 3234)
* get train set (400, 3234)
* get test set (100, 3234)


## Train a classifier

In [37]:
# Random forest with 200 trees; the fixed seed keeps the fitted forest
# reproducible for a given training set.
clf = RandomForestClassifier(random_state=0, n_estimators=200)

# Fit on the training split. fit() returns the estimator itself, so `model`
# is simply another name for the fitted `clf`.
clf.fit(X_train, y_train)
model = clf

# Predict the held-out test rows; their true labels are known, so this is
# an actual test of the model.
y_pred = model.predict(X_test)

# per-class precision / recall / F1 on the test split
result = classification_report(y_test, y_pred)

# The exact numbers depend on which rows ended up in the training set.
# Compare the recall for class 1 against class 0 when reading the report.
print(result)

              precision    recall  f1-score   support

           0       0.84      1.00      0.91        82
           1       1.00      0.11      0.20        18

    accuracy                           0.84       100
   macro avg       0.92      0.56      0.56       100
weighted avg       0.87      0.84      0.78       100



# Large Evaluation

## Create the label

In [38]:
# Build the binary label column `has_fatigue` on the large set
# (1 = the report lists Fatigue, 0 = it does not).
# SYMPTOMS is read from the CSV as the *text* of a Python list, e.g.
# "['Chills', 'Fatigue', 'Headache']", so a bare `'Fatigue' in syms` is a
# substring test and would also fire on any longer term that merely contains
# that word. Matching the quoted token "'Fatigue'" only hits the exact term.
# `na=False` labels a row with a missing symptom list as 0 instead of failing.
df_large['has_fatigue'] = (
    df_large['SYMPTOMS']
    .str.contains("'Fatigue'", regex=False, na=False)
    .astype(int)
)

# preview the text, the symptom list and the new label side by side
df_large[['SYMPTOM_TEXT', 'SYMPTOMS', 'has_fatigue']].head(10)

Unnamed: 0,SYMPTOM_TEXT,SYMPTOMS,has_fatigue
0,"Pt. symptoms included: swelling in the eyes, c...","['Eye_swelling', 'Chest_discomfort', 'Wheezing...",0
1,Anaphylactic like reaction started with dizzin...,"['Consciousness_fluctuating', 'Hyperhidrosis',...",0
2,Vomited suddenly without knowledge of being na...,"['Vomiting', 'Headache', 'Myalgia', 'Nausea']",0
3,"Began to feel tired and sore around 1530, chil...","['Chills', 'Fatigue', 'Mobility_decreased', 'A...",1
4,within 5 minutes of injection patient complain...,"['Muscle_spasms', 'Back_pain', 'Electrocardiog...",0
5,"Terrible dizziness, lightheaded, overall feeli...","['Nausea', 'Nervousness', 'Dizziness', 'Paraes...",0
6,Patient with history of fear of needles and an...,"['Paraesthesia', 'Dizziness', 'Nausea']",0
7,Recipient reports itching to arms and neck app...,['Pruritus'],0
8,"At injection site, at 1202, redness, rash, itc...","['Pruritus', 'Injection_site_pruritus', 'Injec...",0
9,Appeared to have a vasovagal reaction with nau...,"['Dizziness', 'Presyncope', 'Nausea']",0


## Prepare the text features

In [44]:
# label for the large set: the `has_fatigue` column
y = df_large['has_fatigue']

# Reuse the vectorizer fitted earlier (transform only, no re-fit) so the
# columns line up with the features the model was trained on.
X_sym = vcer.transform(df_large['SYMPTOM_TEXT'])
print('* X_sym:', X_sym.shape)

# the TF-IDF matrix is the whole feature set, nothing else is added
X = X_sym

print('* get evaluation set', X.shape)

* X_sym: (59271, 3234)
* get evaluation set (59271, 3234)


## Evaluate!

In [45]:
# run the already-fitted `model` over every row of the large set
y_pred = model.predict(X)

# compare the predictions with the true labels, class by class
result_large = classification_report(y_true=y, y_pred=y_pred)

# oops! check the recall for class 1 in the printed report
print(result_large)

              precision    recall  f1-score   support

           0       0.83      1.00      0.91     47236
           1       0.98      0.19      0.32     12035

    accuracy                           0.83     59271
   macro avg       0.90      0.59      0.61     59271
weighted avg       0.86      0.83      0.79     59271

