# NLP Classification



In [54]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

import pandas as pd
import numpy as np

# Load data

We could load data from the GitHub repo or other data sources.
When using Google Colab, we could also upload the data file manually.

In [27]:
# define the data URL
sample_data_url = 'https://raw.githubusercontent.com/OHNLP/covid19vaxae/main/sample.csv'
large_data_url = 'https://raw.githubusercontent.com/OHNLP/covid19vaxae/main/large.csv'

# load data by Python Pandas
df_sample = pd.read_csv(sample_data_url)
df_large = pd.read_csv(large_data_url)

print('* loaded %s sample' % len(df_sample))
print('* loaded %s large' % len(df_large))

# preprocessing the data
# for those NaN values, fill with forward method
# more details: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
df_sample.fillna(method='ffill', inplace=True)
df_large.fillna(method='ffill', inplace=True)

# show it looks
df_sample.head()

* loaded 500 sample
* loaded 15436 large


Unnamed: 0,VAERS_ID,AGE_YRS,SEX,VAX_DATE,SYMPTOM_TEXT,VAX_MANU,SYMPTOM
0,1125968,53.0,F,2021-03-22,Janssen Covid -19 Vaccine EUA Fever for about...,JANSSEN,Pyrexia
1,932320,41.0,F,2021-01-08,fever of 101F that began approximately 12 hour...,PFIZER\BIONTECH,Pyrexia
2,1130037,45.0,F,2021-03-22,Low grade 100.5 fever about 10 hours after vac...,JANSSEN,Pyrexia
3,933191,27.0,F,2021-01-08,"Temperature of 99 by 11:00 PM on 1/8, had a fe...",MODERNA,Pyrexia
4,1138026,33.0,M,2021-03-05,For 24 hours straight he ran a fever over 103 ...,JANSSEN,Pyrexia


# Model 1: Very simple model

Before we start something fancy and complex, let's try a very simple model.
It only uses the age, sex, and the vaccine name to predict the adverse event.
Although we could imagine how poor the performance is, let's give a try.

## Prepare 3 features

In [121]:
# using this dictionary to convert the vaccine name to a number
dict_vax2num = dict(zip(
    df_sample.VAX_MANU.unique().tolist(), 
    np.arange(df_sample.VAX_MANU.nunique())
))
print('* dict_vax2num:', dict_vax2num)

# In this toy model, we use age, sex, the vaccine name as features
X = df_sample[['AGE_YRS', 'SEX', 'VAX_MANU']]
y = df_sample['SYMPTOM']

# convert the sex from text to number
X['SEX'] = X['SEX'].apply(lambda v: 1 if v == 'M' else 0)

# convert the vaccine name to number
X['VAX_MANU'] = X['VAX_MANU'].apply(lambda v: dict_vax2num[v])

# split the train/test sets, we use 20% of records for test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)
print('* get train set', X_train.shape)
print(X_train.head(5))

print('* get test set', X_test.shape)
print(X_test.head(5))

* dict_vax2num: {'JANSSEN': 0, 'PFIZER\\BIONTECH': 1, 'MODERNA': 2, 'UNKNOWN MANUFACTURER': 3}
* get train set (400, 3)
     AGE_YRS  SEX  VAX_MANU
381     25.0    0         1
211     78.0    1         2
260     35.0    0         1
276     20.0    0         0
409     43.0    0         0
* get test set (100, 3)
     AGE_YRS  SEX  VAX_MANU
354     71.0    0         1
372     33.0    1         0
452     36.0    1         1
346     21.0    0         2
58      49.0    0         2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


## Train a classifier

In [122]:
# we use Random Forest Classifier
# since we only have 3 features for each records, 40 trees are enough
clf = RandomForestClassifier(n_estimators=40, random_state=0)

# train the model using our training set
model1 = clf.fit(X_train, y_train)

# use the trained model to predict the test set
# since we already know the labels for the test set
# it's a test in fact
y_pred = model1.predict(X_test)

# get the test results
result1 = classification_report(y_test, y_pred)

# OK, we know it won't be not good at all.
# and ... yes, it's not good :p
print(result1)

                     precision    recall  f1-score   support

              Chill       0.00      0.00      0.00        10
          Dizziness       0.25      0.20      0.22        10
            Fatigue       0.00      0.00      0.00         4
           Headache       0.00      0.00      0.00        14
Injection_site_pain       0.12      0.18      0.15        11
            Myalgia       0.09      0.12      0.11         8
             Nausea       0.00      0.00      0.00         8
               Pain       0.22      0.18      0.20        11
  Pain_in_extremity       0.18      0.17      0.17        12
            Pyrexia       0.09      0.08      0.09        12

           accuracy                           0.10       100
          macro avg       0.10      0.09      0.09       100
       weighted avg       0.10      0.10      0.10       100



# Model 2: Better model

Now, let's try a better model by using the text information. 

There are many ways to extract the text features.
In this demo model, we use the basic TF-IDF

## Prepare symptom text features

In [135]:
# this time, we only use the symptom_text to get features.
X = df_sample['SYMPTOM_TEXT']
# still use symptom as the label
y = df_sample['SYMPTOM']

# but the long text it self couldn't be used as feature
# we need to convert the text into a list of numbers (or vector)
# let's use a very popular tool called TF-IDF
# more details about this method could be found here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# first, let's get a vectorizer
vcer = TfidfVectorizer(stop_words='english')

# then convert!
X = vcer.fit_transform(X)


# split the train/test sets, we use 20% of records for test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

# as we can see, now we have a very large feature vector
# which contains more than 2000 numbers to represent a report
print('* get train set', X_train.shape)
print('* get test set', X_test.shape)


* get train set (400, 2647)
* get test set (100, 2647)


## Train a classifier

In [136]:
# we use Random Forest Classifier
# now, this time since we have more features (2647 feature!),
# we could use more trees to improve the performance.
clf = RandomForestClassifier(n_estimators=200, random_state=0)

# train the model using our training set
model2 = clf.fit(X_train, y_train)

# use the trained model to predict the test set
# since we already know the labels for the test set
# it's a test in fact
y_pred = model2.predict(X_test)

# get the test results
result2 = classification_report(y_test, y_pred)

# yes! the performance is much better than previous one!
# the overall F1 is not bad
print(result2)

                     precision    recall  f1-score   support

              Chill       0.92      1.00      0.96        11
          Dizziness       0.80      0.67      0.73         6
            Fatigue       1.00      0.62      0.77         8
           Headache       0.83      0.42      0.56        12
Injection_site_pain       0.60      0.90      0.72        10
            Myalgia       0.71      0.56      0.63         9
             Nausea       0.30      0.75      0.43         4
               Pain       0.56      0.36      0.43        14
  Pain_in_extremity       0.50      0.53      0.52        15
            Pyrexia       0.73      1.00      0.85        11

           accuracy                           0.66       100
          macro avg       0.70      0.68      0.66       100
       weighted avg       0.70      0.66      0.65       100



# Model 3: Next model

Now we have text features and other features, how about use all of them?

## Prepare more features

In [147]:
# this time, we use both symptom_text and ages and sex for features.
X = df_sample[['SYMPTOM_TEXT', 'AGE_YRS', 'SEX']]
# still use symptom as the label
y = df_sample['SYMPTOM']

# but the long text it self couldn't be used as feature
# we need to convert the text into a list of numbers (or vector)
# let's use a very popular tool called TF-IDF
# more details about this method could be found here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# first, let's get a vectorizer
vcer = TfidfVectorizer(stop_words='english')

# then convert!
X_sym = vcer.fit_transform(X['SYMPTOM_TEXT'])
print('* X_sym:', X_sym.shape)

# but an issue is that the X_sym is too sparse,
# we don't need too many zero features
# so, we could do a feature selection here
# there are a lot of feature selection methods, could be found here:
# https://scikit-learn.org/stable/modules/feature_selection.html
# we use a simple one, and select only 50 features
selector = SelectKBest(chi2, k=50)
X_sym = selector.fit_transform(X_sym, y)

# also convert the sex feature
X_sex = X['SEX'].apply(lambda v: 1 if v == 'M' else 0)

# since the symptom text feature is a sparse matrix,
# we need to convert it to numpy format
# and put age and sex feature in.
# then the final number of features are 52
X = np.concatenate((
    X_sym.toarray(), 
    X['AGE_YRS'].values[:, None],
    X_sex.values[:, None]
), axis=1)

# split the train/test sets, we use 20% of records for test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)
print('* get train set', X_train.shape)
print('* get test set', X_test.shape)

* X_sym: (500, 2647)
* get train set (400, 52)
* get test set (100, 52)


## Train a classifier

In [148]:
# we use Random Forest Classifier
# now, this time since we have less features (52 features),
# we could use more trees to improve the performance.
clf = RandomForestClassifier(n_estimators=200, random_state=0)

# train the model using our training set
model3 = clf.fit(X_train, y_train)

# use the trained model to predict the test set
# since we already know the labels for the test set
# it's a test in fact
y_pred = model3.predict(X_test)

# get the test results
result3 = classification_report(y_test, y_pred)

# yes! the performance is much better than previous one!
# the overall F1 is getting better, which is better than the second model!
# and depends on the training set, the result may vary each time.
print(result3)

                     precision    recall  f1-score   support

              Chill       1.00      1.00      1.00        13
          Dizziness       1.00      0.78      0.88         9
            Fatigue       0.86      0.67      0.75         9
           Headache       0.70      0.64      0.67        11
Injection_site_pain       0.64      1.00      0.78         7
            Myalgia       1.00      0.75      0.86        12
             Nausea       0.71      0.71      0.71         7
               Pain       0.75      0.75      0.75        12
  Pain_in_extremity       0.55      0.86      0.67         7
            Pyrexia       0.92      0.92      0.92        13

           accuracy                           0.81       100
          macro avg       0.81      0.81      0.80       100
       weighted avg       0.84      0.81      0.81       100



# Evaluate on the large with model 3

Now let's see how the performance is on the large dataset.

The code is the same, just change `df_sample` to `df_large`

## Prepare the features

This time, we don't need to split the dataset into train and test.
We will use all of them for test.

In [149]:
# this time, we use both symptom_text and ages and sex for features.
X = df_large[['SYMPTOM_TEXT', 'AGE_YRS', 'SEX']]
# use symptom as the label
y = df_large['SYMPTOM']

# then convert!
X_sym = vcer.transform(X['SYMPTOM_TEXT'])
print('* X_sym:', X_sym.shape)

# but an issue is that the X_sym is too sparse,
# we don't need too many zero features
# so, we could do a feature selection here
# there are a lot of feature selection methods, could be found here:
# https://scikit-learn.org/stable/modules/feature_selection.html
# we use a simple one, and select only 50 features
selector = SelectKBest(chi2, k=50)
X_sym = selector.fit_transform(X_sym, y)

# also convert the sex feature
X_sex = X['SEX'].apply(lambda v: 1 if v == 'M' else 0)

# since the symptom text feature is a sparse matrix,
# we need to convert it to numpy format
# and put age and sex feature in
X = np.concatenate((
    X_sym.toarray(), 
    X['AGE_YRS'].values[:, None],
    X_sex.values[:, None]
), axis=1)

# we don't need to split the dataset, just run the test
print('* get large test set', X.shape)

* X_sym: (15436, 14161)
* get large test set (15436, 52)


## Evaluate

In [150]:
# use the trained model3 to predict the test set
y_pred = model3.predict(X)

# get the test results
result_large = classification_report(y, y_pred)

# oops! 
print(result_large)

                     precision    recall  f1-score   support

              Chill       0.71      0.43      0.53      7315
          Dizziness       0.78      0.62      0.69      3447
            Fatigue       0.00      0.00      0.00      1985
           Headache       0.14      0.32      0.19      1576
Injection_site_pain       0.04      0.04      0.04       472
            Myalgia       0.03      0.13      0.05        55
             Nausea       0.03      0.16      0.05       135
               Pain       0.02      0.26      0.03       126
  Pain_in_extremity       0.04      0.06      0.05       258
            Pyrexia       0.00      0.00      0.00        67

           accuracy                           0.38     15436
          macro avg       0.18      0.20      0.16     15436
       weighted avg       0.53      0.38      0.43     15436



# Summary

As shown in the three models, there are mainly two tasks:

1. Extract features from raw data. This reflects how our model abstracts the data.
2. Train a classifier based on features. This reflects how we inteprete the relationship between these data and the target (label).

Even if we use the same classifier (but with different hyperparameters), the better the quality of the features, the better the overall performance of the model. Advances in these two tasks can improve the performance of our model. 