# Task 0: Preprocessing of data

The [data](https://github.com/Franck-Dernoncourt/pubmed-rct) consists of 200k abstracts from PubMed. Each sentence of an abstract is labeled with its role in the abstract (background, objective, method, result, or conclusion). In this notebook we preprocess the data for training our embeddings and models for classification tasks 1 and 2. Preprocessing consists of the following steps:
- lower casing, tokenization, remove punctuation
- optional: remove placeholder "@" for numbers
- stop word removal
- lemmatization or stemming or neither

In addition, we want to use the structural information provided by the abstracts. Therefore we construct an additional feature called *relative_linenumber* that measures the relative position of a sentence within its abstract:

$\text{relative_linenumber} = \frac{\text{index of the sentence in an abstract}}{\text{# of sentences in the abstract}}$

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

import project2Lib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/noraschneider/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/noraschneider/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/noraschneider/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noraschneider/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/noraschneider/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def print_nicely(n, df):
    """Print the leading rows of ``df``, one plain Python list per row.

    Every printed row is preceded by a dashed separator line. Rows at
    positions ``0`` through ``n`` (inclusive) are printed, i.e. at most
    ``n + 1`` rows; fewer if the frame is shorter.

    Parameters
    ----------
    n : int
        Position of the last row to print.
    df : pandas.DataFrame
        Frame whose rows are printed.
    """
    n_cols = len(df.columns)

    # Count rows by position instead of comparing the index label with ``n``:
    # the label comparison raises for a non-integer index and stops at the
    # wrong row for a filtered or shuffled one.
    for position, (_, row) in enumerate(df.iterrows()):
        print("----------------------")
        # ``row`` is labelled by column name, so ``row[i]`` with an integer is
        # a label lookup (deprecated positional fallback); ``.iloc`` is the
        # explicit positional accessor.
        print([row.iloc[i] for i in range(n_cols)])

        if position >= n:
            break

In [3]:
# Read the three splits into dataframes via the project helper.
# linenumber=True is forwarded to the loader; presumably it adds the
# `linenumber` column visible in the preview below (confirm in project2Lib).
loaded_splits = project2Lib.load_data_as_dataframe(
    data_dir=Path("./data"),
    linenumber=True,
)
train_data, dev_data, test_data = loaded_splits
train_data.head()

Unnamed: 0,label,sentence,abstract_id,linenumber
0,0,The emergence of HIV as a chronic condition me...,24491034,0.0
1,0,This paper describes the design and evaluation...,24491034,0.1
2,2,This study is designed as a randomised control...,24491034,0.2
3,2,The intervention group will participate in the...,24491034,0.3
4,2,The program is based on self-efficacy theory a...,24491034,0.4


In [17]:
# Removing @ placeholder for numbers
# Each split is preprocessed once per mode and written to
# ./PreprocessedData/<split>_<mode>_noph.csv. The empty mode string
# presumably selects neither lemmatization nor stemming (confirm in
# project2Lib) and yields names such as "train__noph.csv".
output_dir = Path("./PreprocessedData")
# DataFrame.to_csv does not create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)

mode = ["", "lemmatization", "stemming"]
frames_by_split = {"train": train_data, "dev": dev_data, "test": test_data}
for m in tqdm(mode):
    for split_name, split_df in frames_by_split.items():
        # The 'preprocess' column is overwritten on every pass; the CSV
        # written right after captures the result for the current mode.
        split_df['preprocess'] = split_df['sentence'].apply(
            lambda x: project2Lib.preprocess_text(x, mode=m, remove_numplaceholder=True)
        )
        split_df.to_csv(output_dir / f"{split_name}_{m}_noph.csv")

100%|██████████| 1/1 [09:14<00:00, 554.32s/it]


In [None]:
# Keeping @ placeholder for numbers
# Each split is preprocessed once per mode and written to
# ./PreprocessedData/<split>_<mode>.csv.
output_dir = Path("./PreprocessedData")
# DataFrame.to_csv does not create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)

mode = ["lemmatization"]
frames_by_split = {"train": train_data, "dev": dev_data, "test": test_data}
for m in tqdm(mode):
    for split_name, split_df in frames_by_split.items():
        # The 'preprocess' column is overwritten on every pass; the CSV
        # written right after captures the result for the current mode.
        split_df['preprocess'] = split_df['sentence'].apply(
            lambda x: project2Lib.preprocess_text(x, mode=m, remove_numplaceholder=False)
        )
        split_df.to_csv(output_dir / f"{split_name}_{m}.csv")

In [6]:
# preprocessing of small dataset
# Load data as dataframe
train_data_small, dev_data_small, test_data_small = project2Lib.load_data_as_dataframe(
    data_dir=Path("./data_small"),
    linenumber=True,
)

# Keeping @ placeholder for numbers
# Each split is written to ./PreprocessedData/<split>_<mode>_small.csv.
output_dir = Path("./PreprocessedData")
# DataFrame.to_csv does not create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)

mode = ["lemmatization"]
small_frames_by_split = {
    "train": train_data_small,
    "dev": dev_data_small,
    "test": test_data_small,
}
for m in tqdm(mode):
    for split_name, split_df in small_frames_by_split.items():
        # The 'preprocess' column is overwritten on every pass; the CSV
        # written right after captures the result for the current mode.
        split_df['preprocess'] = split_df['sentence'].apply(
            lambda x: project2Lib.preprocess_text(x, mode=m, remove_numplaceholder=False)
        )
        split_df.to_csv(output_dir / f"{split_name}_{m}_small.csv")

100%|██████████| 1/1 [14:37<00:00, 877.34s/it]


## Relevance of relative position

We suspect that a lot of information for the classification task is carried in the relative position of a sentence in the abstract. To demonstrate the power of this feature, we train a simple logistic regression, just based on this feature.

In [11]:
# Reload the lemmatized, placeholder-free splits from disk.
# The first CSV column is used as the index and every missing value is
# replaced by an empty string.
datasets = ["train", "dev", "test"]
filepaths = [f"./PreprocessedData/{split}_lemmatization_noph.csv" for split in datasets]

train_data, dev_data, test_data = (
    pd.read_csv(csv_path, index_col=0).fillna('') for csv_path in filepaths
)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [28]:
# train and evaluate simple logistic regression
# The only feature is the relative position of the sentence in its abstract.
# That column is addressed as "line_relative" here, while the frame previewed
# after loading names it "linenumber"; accept either so the cell survives a
# fresh run regardless of which name the CSV export carries.
position_col = "line_relative" if "line_relative" in train_data.columns else "linenumber"

Y_train = train_data["label"]
Y_dev = dev_data["label"]
Y_test = test_data["label"]

# Kept 1-D; reshaped to a single-feature 2-D array where scikit-learn needs it.
X_train, X_dev, X_test = (
    frame[position_col].values for frame in (train_data, dev_data, test_data)
)

clf = LogisticRegression(random_state=123, max_iter = 500).fit(X_train.reshape(-1, 1), Y_train)

y_hat_test = clf.predict(X_test.reshape(-1, 1))
f1 = f1_score(Y_test, y_hat_test,average="weighted")
acc = accuracy_score(Y_test, y_hat_test)
print(f"F1 score: {f1}")
print(f"Accuracy: {acc}")
# Show the per-class coefficient of the position feature next to the class order.
clf.coef_, clf.classes_

F1 score: 0.6817444661972296
Accuracy: 0.7141016512392772


(array([[-11.01463154],
        [-18.04791756],
        [ -3.12187785],
        [  6.25828715],
        [ 25.9261398 ]]),
 array([0, 1, 2, 3, 4]))

F1 score: 0.6817444661972296

Accuracy: 0.7141016512392772

(array([[-11.01463154],

        [-18.04791756],
        
        [ -3.12187785],
        
        [  6.25828715],
        
        [ 25.9261398 ]]),
 array([0, 1, 2, 3, 4]))

In [27]:
# Compare to a feature-independent baseline.
# strategy="prior" ignores X and always predicts the most frequent training
# label, so this is a majority-class baseline rather than uniform random
# guessing.
dummy_clf = DummyClassifier(strategy="prior")
dummy_clf.fit(X_train.reshape(-1, 1), Y_train)

y_hat_test = dummy_clf.predict(X_test.reshape(-1, 1))
f1 = f1_score(Y_test, y_hat_test,average="weighted")
acc = accuracy_score(Y_test, y_hat_test)
print(f"F1 score: {f1}")
print(f"Accuracy: {acc}")

F1 score: 0.180058888669869
Accuracy: 0.34842165937680125
