# Rule-based classification

In [22]:
import nltk
import re

import pandas as pd
import numpy as np
from joblib import Parallel, delayed

from sklearn.metrics import classification_report

# Define a RegExp Classifier

To evaluate how our classifier works on our dataset, we create a simple RegExp classifier

In [41]:
class RegExpClassifier(object):
    """
    Predict class for X.
        The predicted class of an input sample is a result by the given
        regular expression
        
        Parameters
        ----------
        X : {array-like} or {list}
            
        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
    """
    def __init__(self, rule, mode='binary'):
        super().__init__()
        self.mode = mode
        self.rule = re.compile(rule, re.MULTILINE | re.IGNORECASE)
        
    def predict(self, X):
        if self.mode == 'binary':
            def f(ptn, txt):
                r = ptn.findall(txt)
                return 0 if len(r)==0 else 1
            
        else:
            def f(ptn, txt):
                r = ptn.findall(txt)
                return len(r)
        
        p = self.rule
            
        r = Parallel(n_jobs=1)(
            delayed(f)(p, x) for x in X
        )
        
        r = np.array(r)
        return r

array([1, 1, 0])

## Classifier demo

In [58]:
# the rules created manually
rule = r"""(chill|coldness)"""

# create a very small dataset
df = pd.DataFrame({'TEXT':[
    'I got chill',
    'I felt coldness',
    'What I was feeling is not good',
    'OK, not bad'
]})

# create a classifier based on this rule
model = RegExpClassifier(rule)
df['y_pred'] = model.predict(df['TEXT'])

# let's show how the results are
df

Unnamed: 0,TEXT,y_pred
0,I got chill,1
1,I felt coldness,1
2,What I was feeling is not good,0
3,"OK, not bad",0


# Load sample data

In [24]:
# define the data URL
sample_data_url = 'm_sample.csv'
large_data_url = 'm_large.csv'

# load data by Python Pandas
df_sample = pd.read_csv(sample_data_url)
df_large = pd.read_csv(large_data_url)

print('* loaded %s sample' % len(df_sample))
print('* loaded %s large' % len(df_large))

# preprocessing the data
# for those NaN values, fill with forward method
# more details: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
df_sample.fillna(method='ffill', inplace=True)
df_large.fillna(method='ffill', inplace=True)

# show it looks
df_sample.head()

* loaded 500 sample
* loaded 16597 large


Unnamed: 0,VAERS_ID,AGE_YRS,SEX,VAX_DATE,SYMPTOM_TEXT,ALLERGIES,VAX_MANU,SYMPTOMS,NUM_SYMS
0,1120756,66.0,M,2021-03-18,"On Friday night, 3/19/21, patient spiked a fev...",Unknown,MODERNA,"['Epistaxis', 'Mouth_haemorrhage', 'Dyspnoea',...",5.0
1,1228750,54.0,F,2021-03-04,"Migrane headache, dizziness, motion sickness, ...",,JANSSEN,"['Chills', 'Fatigue', 'Influenza_A_virus_test'...",9.0
2,1117649,40.0,F,2021-03-19,After 15 minutes of sitting in the waiting are...,I have asthma and year round allergies. NKDA ...,JANSSEN,"['Chills', 'Malaise', 'Pyrexia', 'Pain', 'Dizz...",9.0
3,1112586,39.0,F,2021-03-18,Complaint of left sided deficit( numbness and ...,,JANSSEN,"['Blood_glucose', 'Road_traffic_accident', 'Pa...",5.0
4,1215454,70.0,F,2021-04-10,Pt had Johnsons and Johnsons vaccine Pt prese...,none,JANSSEN,"['Dyspnoea', 'Ultrasound_scan_abnormal', 'Comp...",5.0


# Model: Use RegExp

## Create RegExp

We could define the regular expression for each adverse event here

In [54]:
regexps = [
    ['Pyrexia', r"""(fever|pyrexia|ague|elevated temperature|feverishness|frenzy|hyperpyrexia)"""],
    ['Chill', r"""(chill|coldness)"""],
    ['Headache', r"""(headache|migraine)"""],
    # add more 
]

## Run the classifier

In [55]:
for i in range(len(regexps)):
    symptom = regexps[i][0]
    rule = regexps[i][1]

    # the features is the text
    X = df_sample['SYMPTOM_TEXT']
    
    # for evaluation, we check the symptoms to see whether the target symptom exists
    y = df_sample['SYMPTOMS'].apply(lambda syms: (symptom in syms) + 0)

    # create a model based the rule of current symptom
    model = RegExpClassifier(rule)
    y_pred = model.predict(X)

    # get the test results
    result = classification_report(y, y_pred)

    # and depends on the training set, the result may vary each time.
    print('*'*60)
    print('* %s result' % symptom)
    print(result)

************************************************************
* Pyrexia result
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       352
           1       0.92      0.99      0.95       148

    accuracy                           0.97       500
   macro avg       0.96      0.98      0.97       500
weighted avg       0.97      0.97      0.97       500

************************************************************
* Chill result
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       360
           1       0.99      0.96      0.97       140

    accuracy                           0.98       500
   macro avg       0.98      0.98      0.98       500
weighted avg       0.98      0.98      0.98       500

************************************************************
* Headache result
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       333
    

As we could see, the human expert's experience is very helpful in getting great performance for classification

# Large Evaluation

In [59]:
for i in range(len(regexps)):
    symptom = regexps[i][0]
    rule = regexps[i][1]

    X = df_large['SYMPTOM_TEXT']
    y = df_large['SYMPTOMS'].apply(lambda syms: (symptom in syms) + 0)

    y_pred = model.predict(X)

    # get the test results
    result = classification_report(y, y_pred)

    # and depends on the training set, the result may vary each time.
    print('*'*60)
    print('* %s result' % symptom)
    print(result)

************************************************************
* Pyrexia result
              precision    recall  f1-score   support

           0       0.86      0.86      0.86     12244
           1       0.61      0.62      0.61      4353

    accuracy                           0.80     16597
   macro avg       0.74      0.74      0.74     16597
weighted avg       0.80      0.80      0.80     16597

************************************************************
* Chill result
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     12239
           1       0.95      0.95      0.95      4358

    accuracy                           0.97     16597
   macro avg       0.97      0.97      0.97     16597
weighted avg       0.97      0.97      0.97     16597

************************************************************
* Headache result
              precision    recall  f1-score   support

           0       0.77      0.84      0.80     11209
    