In [1]:
import torch
import os
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
def get_training_data(base_dir="/home/ubuntu/training_data"):
    train_texts = []
    train_labels = []
    label_counts = {}
    for subdir, dirs, files in os.walk(base_dir):
        label = subdir.split(r"/")[-1]
        for f in files:
            if f.split('.')[-1] == 'txt' and '-description.txt' not in f and '-guidance.txt' not in f:
                path = os.path.join(subdir, f)
                train_texts.append(path)
                train_labels.append(label)
                if label not in label_counts:
                    label_counts[label] = 1
                else:
                    label_counts[label] += 1
    label_counts = {k: v for k, v in sorted(label_counts.items(), key=lambda item: -item[1])}

    return train_texts, train_labels, label_counts

train_files, train_labels, label_counts = get_training_data()

In [3]:
valid_labels = set([])
for k, v in label_counts.items():
    if v > 100:
        valid_labels.add(k)

In [4]:
len(valid_labels)

54

In [5]:
filtered_files = []
filtered_labels = []
for i in range(len(train_files)):
    if train_labels[i] in valid_labels:
        filtered_files.append(train_files[i])
        filtered_labels.append(train_labels[i])

In [6]:
from sklearn.model_selection import train_test_split
train_files, test_files, train_labels, test_labels = train_test_split(filtered_files, filtered_labels, test_size=.2, shuffle=True, random_state=11235)


In [8]:
import shutil
import os
shutil.rmtree('/home/ubuntu/train_split')
os.mkdir('/home/ubuntu/train_split/')
shutil.rmtree('/home/ubuntu/test_split')
os.mkdir('/home/ubuntu/test_split/')
for label in valid_labels:
    if not os.path.isdir('/home/ubuntu/train_split/' + label):
        os.mkdir('/home/ubuntu/train_split/' + label)
    if not os.path.isdir('/home/ubuntu/test_split/' + label):
        os.mkdir('/home/ubuntu/test_split/' + label)
    

In [9]:
for f in train_files:
    dest = '/home/ubuntu/train_split/' + r'/'.join(f.split(r'/')[-2:])
    shutil.copyfile(f, dest)
for f in test_files:
    dest = '/home/ubuntu/test_split/' + r'/'.join(f.split(r'/')[-2:])
    shutil.copyfile(f, dest)

In [10]:
def read_text(file, text_size=4000):
    t = ""
    with open(file, 'r', encoding="utf-8") as r:
        t = r.read()[:text_size]
    return t

train_texts = [read_text(f) for f in train_files]
test_texts = [read_text(f) for f in test_files]

In [11]:
from transformers import DistilBertTokenizerFast, DistilBertModel
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model.eval()
model.to('cuda')
print('done')

done


In [12]:
# not used, somehow slower to batch
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]
batched = list(batch(train_texts, 16))

In [14]:
train_tokens = [tokenizer(t, truncation=True, padding=True, return_tensors="pt") for t in train_texts]

In [15]:
test_tokens = [tokenizer(t, truncation=True, padding=True, return_tensors="pt") for t in test_texts]

In [16]:
def embed(tokenized_text):
    outputs = model(**(tokenized_text.to('cuda')))
    embedding = outputs[0][:,0,:].detach().cpu().numpy()
    return embedding

In [17]:
len(test_tokens)

10491

In [18]:
def get_embeddings(data):  
    embeddings = None
    count = 0
    for i in range(len(data)):
        count += 1
        embedding = embed(data[i])
        if embeddings is None:
            embeddings = embedding
        else:
            embeddings = np.append(embeddings, embedding, axis=0)
        if count % 1000 == 0:
            print(count)
    return embeddings

In [19]:
train_embeddings = get_embeddings(train_tokens)
test_embeddings = get_embeddings(test_tokens)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [20]:
train = pd.DataFrame(data=train_embeddings)
test = pd.DataFrame(data=test_embeddings)

In [21]:
train['label'] = train_labels
test['label'] = test_labels

In [22]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [23]:
train = pd.read_csv('train.csv')

In [24]:
test = pd.read_csv('test.csv')

In [25]:
clf = xgb.XGBClassifier(max_depth=3, n_estimators=200, tree_method='gpu_hist')

train_y = train['label']
train_x = train.drop(['label'], axis=1)
clf.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [27]:
test_y = test['label']
test_x = test.drop(['label'], axis=1)
test_probs = clf.predict_proba(test_x)

In [28]:
def top_k_accuracy(true_labels, predictions, model, k=3):
    ind = np.argpartition(predictions, -k, axis=1)[:,-k:]
    top_k_match = [(true_labels[i] in model.classes_[ind][i]) for i in range(len(true_labels))]
    return np.mean(top_k_match)

In [32]:
top_k_accuracy(test_y, test_probs, clf, k=1)

0.5841197216661901

In [33]:
top_k_accuracy(test_y, test_probs, clf, k=3)

0.8147936326374988

In [34]:
top_k_accuracy(test_y, test_probs, clf, k=5)

0.8866647602707082

In [35]:
train_probs = clf.predict_proba(train_x)
top_k_accuracy(train_y, train_probs, clf, k=1)

0.892183031458532

In [166]:
conf_matrix = sklearn.metrics.confusion_matrix(filtered_test_label, clf.predict(filtered_test_x), labels=list(label_counts.index))

In [167]:
conf_matrix = pd.DataFrame(conf_matrix, index=list(label_counts.index), columns=['predicted ' + x for x in list(label_counts.index)])