In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive: every later path in this notebook
# (Kaggle config dir, working directory, dataset files) is rooted there, so
# mounting anywhere else breaks a fresh "Restart & Run All".
drive.mount('/content/gdrive')

Mounted at /content/drive


In [None]:
import os
# Point the Kaggle CLI at a folder on the mounted Drive.
# NOTE(review): presumably this folder holds the kaggle.json API token — confirm.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

In [None]:
# Work from the Kaggle folder on Drive so the downloads below land there.
%cd /content/gdrive/My Drive/Kaggle
# Echo the working directory to confirm the change took effect.
%pwd

/content/gdrive/My Drive/Kaggle


'/content/gdrive/My Drive/Kaggle'

In [None]:
# Competition files for "coleridgeinitiative-show-us-the-data".
!kaggle competitions download -c coleridgeinitiative-show-us-the-data
# Auxiliary datasets installed or loaded further down: the apex source tree,
# pre-trained BERT checkpoints, and the simpletransformers wheels.
!kaggle datasets download -d xujingzhao/apexpytorch
!kaggle datasets download -d xhlulu/huggingface-bert
!kaggle datasets download -d jonathanbesomi/simple-transformers-pypi

100% 24.2G/24.2G [10:23<00:00, 21.8MB/s]
100% 24.2G/24.2G [10:23<00:00, 41.7MB/s]
Downloading simple-transformers-pypi.zip to /content/gdrive/My Drive/Kaggle
  0% 0.00/141k [00:00<?, ?B/s]
100% 141k/141k [00:00<00:00, 19.7MB/s]


In [None]:
# List the working directory to check which archives were downloaded.
!ls

In [None]:
# Extract every downloaded archive into the working directory; the archives are
# deleted only if unzip exits successfully (`&&`).
# NOTE(review): the recorded run stopped at a "write error (disk full?)" prompt,
# so confirm there is enough free space before re-running.
!unzip \*.zip  && rm *.zip


  inflating: bert-base-multilingual-cased/tokenizer.json  
  inflating: bert-base-multilingual-cased/vocab.txt  
  inflating: bert-base-multilingual-uncased/config.json  
  inflating: bert-base-multilingual-uncased/modelcard.json  
  inflating: bert-base-multilingual-uncased/pytorch_model.bin  
bert-base-multilingual-uncased/pytorch_model.bin:  write error (disk full?).  Continue? (y/n/^C) 

In [None]:
# Install the offline wheels and build apex with its C++/CUDA extensions.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): the `../input/...` paths are relative to the current working
# directory — confirm the extracted archives actually live there.
%pip install ../input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl
%pip install ../input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl
%pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/apexpytorch

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
from functools import partial

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed Python's and NumPy's global random number generators.
random.seed(123)
np.random.seed(456)

In [None]:
# Sentence-chunking and sampling knobs used when building the NER training set.
MAX_LENGTH = 80 # max no. of words kept in one sentence
OVERLAP = 20    # no. of words shared by consecutive chunks when a sentence longer than MAX_LENGTH is split
MAX_SAMPLE = 5  # no. of raw train.csv rows to keep: small for experimentation, None for production

In [None]:
# Locations of the training CSV and of the folder with one JSON file per paper.
train_path = '/content/gdrive/MyDrive/Data_Coleridge Initiative/train.csv'
paper_train_folder = '/content/gdrive/MyDrive/Data_Coleridge Initiative/train'

# Keep only the first MAX_SAMPLE raw rows (a limit of None keeps every row).
train = pd.read_csv(train_path).iloc[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')

In [None]:
# Collapse to one row per publication Id: keep the first title seen and
# pipe-join every dataset title / label / cleaned label recorded for it.
pipe_join = '|'.join
aggregations = {
    'pub_title': 'first',
    'dataset_title': pipe_join,
    'dataset_label': pipe_join,
    'cleaned_label': pipe_join,
}
train = train.groupby('Id').agg(aggregations).reset_index()

print(f'No. grouped training rows: {len(train)}')

In [None]:
# Parsed JSON content of every training paper, keyed by publication Id.
papers = {}
for paper_id in train['Id'].unique():
    paper_path = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_path, 'r') as f:
        paper = json.load(f)
    papers[paper_id] = paper

In [None]:
def clean_training_text(txt):
    """
    Replace each run of non-alphanumeric characters with one space and trim the ends.

    Like the default clean_text function, except that letter case is preserved.
    The argument is converted with str() first, so non-string values are accepted.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping chunks of words.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process; words are separated by whitespace.
    max_length : int, optional
        Maximum number of words per chunk. Defaults to the module-level MAX_LENGTH.
    overlap : int, optional
        Number of words shared by consecutive chunks. Defaults to the
        module-level OVERLAP.

    Returns
    -------
    list of str
        Sentences of at most `max_length` words are returned unchanged; longer
        ones are replaced by their chunks, in order.

    Raises
    ------
    ValueError
        If `overlap` is not smaller than `max_length` (the window could not advance).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue

        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # Stop once a chunk reaches the end of the sentence: any further
            # window would be fully contained in this chunk (a pure duplicate).
            if start + max_length >= len(words):
                break
    return short_sentences

def find_sublist(big_list, small_list):
    """
    Return every start index at which `small_list` occurs as a contiguous run
    inside `big_list`.

    An empty `small_list` yields no positions (rather than matching everywhere,
    including one past the last element).
    """
    width = len(small_list)
    if width == 0:
        return []
    return [start for start in range(len(big_list) - width + 1)
            if big_list[start:start + width] == small_list]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag each word of `sentence` with B / I / O according to `labels`.

    Parameters
    ----------
    sentence : str
        Cleaned sentence; words are separated by whitespace.
    labels : iterable of str or None
        Cleaned dataset labels to look for. None means "no labels".

    Returns
    -------
    (bool, list of (word, tag))
        The flag is True when at least one label occurrence was tagged. The
        first word of an occurrence gets 'B', its remaining words 'I', and
        every other word 'O'.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    label_word_lists = [label.split() for label in ([] if labels is None else labels)]
    # Longest labels first, and never re-tag an already tagged word: otherwise a
    # label nested inside a longer one would drop a stray 'B' into the middle of
    # the longer entity.
    label_word_lists.sort(key=len, reverse=True)

    is_positive = False
    for label_words in label_word_lists:
        for pos in find_sublist(sentence_words, label_words):
            end = pos + len(label_words)
            if any(tag != 'O' for tag in nes[pos:end]):
                continue
            nes[pos] = 'B'
            nes[pos + 1:end] = ['I'] * (len(label_words) - 1)
            is_positive = True

    return is_positive, list(zip(sentence_words, nes))

In [None]:
cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

# The context manager closes the progress bar even if a paper fails to process.
with tqdm(total=len(train)) as pbar:
    # `paper_id` rather than `id`, which would shadow the builtin.
    for paper_id, dataset_label in train[['Id', 'dataset_label']].itertuples(index=False):
        # paper
        paper = papers[paper_id]

        # labels
        labels = [clean_training_text(label) for label in dataset_label.split('|')]

        # sentences: split each section on '.', clean, and drop duplicates while
        # keeping first-seen order. Iterating a set of strings instead would give
        # an order that changes between kernel restarts (string hashing is
        # randomized per process), so the seeded shuffle below would not be
        # reproducible.
        seen_sentences = set()
        sentences = []
        for section in paper:
            for raw_sentence in section['text'].split('.'):
                cleaned = clean_training_text(raw_sentence)
                if cleaned not in seen_sentences:
                    seen_sentences.add(cleaned)
                    sentences.append(cleaned)
        sentences = shorten_sentences(sentences) # make sentences short
        sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

        # keep every positive sentence; keep a negative one only if it mentions
        # 'data' or 'study'
        for sentence in sentences:
            is_positive, tags = tag_sentence(sentence, labels)
            if is_positive:
                cnt_pos += 1
                ner_data.append(tags)
            elif any(word in sentence.lower() for word in ['data', 'study']):
                ner_data.append(tags)
                cnt_neg += 1

        # progress bar
        pbar.update(1)
        pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

# shuffling
random.shuffle(ner_data)

In [None]:
dict =[]
with open('train_ner.json', 'w') as f:
    for row in ner_data:
        words, nes = list(zip(*row))
        row_json = {'tokens' : words, 'tags' : nes}
        dict.append(row_json)
        json.dump(row_json, f)
        f.write('\n')

In [None]:
data = pd.DataFrame(dict)
data = data.apply(pd.Series.explode).reset_index()

In [None]:
# Forward-fill any missing values. `DataFrame.ffill()` replaces the deprecated
# `fillna(method="ffill")` spelling and does the same thing.
data = data.ffill()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Switch to the column names used by the rest of the notebook and make sure
# the tag values are upper-case.
column_names = {"index":"sentence_id","tokens":"words","tags":"labels"}
data = data.rename(columns=column_names)
data["labels"] = data["labels"].str.upper()

In [None]:
# Sample submission table (it has an 'Id' column, used below to load each paper)
# and the folders holding the test and training paper JSON files.
sample_sub = pd.read_csv('/content/gdrive/MyDrive/Data_Coleridge Initiative/sample_submission.csv')
test_files_path = '/content/gdrive/MyDrive/Data_Coleridge Initiative/test'
train_files_path = paper_train_folder

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read one paper's JSON file and return its content flattened into one string.

    Parameters
    ----------
    filename : str
        File name without the '.json' extension.
    train_files_path : str
        Folder that contains the JSON file.
    output : str
        'text' -> all section texts joined with a space,
        'head' -> all section titles joined with a space,
        anything else -> titles and texts interleaved, joined with '. '.

    Returns
    -------
    str
    """
    json_path = os.path.join(train_files_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    # A section may lack a key (or hold null); treat that as empty text so the
    # joins below cannot fail on None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave: title, text, title, text, ...
    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)
# Bind the test folder once so that .apply can call the reader with just the Id.
read_test_paper = partial(read_append_return, train_files_path=test_files_path)
sample_sub['text'] = sample_sub['Id'].apply(read_test_paper)

In [None]:
# Inspect the sample submission with the loaded paper text attached.
sample_sub

In [None]:
# Features: the sentence id and the token itself; target: the token's tag.
X= data[["sentence_id","words"]]
Y =data["labels"]

In [None]:
# Split by sentence, not by token row. Each row of X is a single token, so
# splitting rows directly would scatter the tokens of one sentence across both
# sets and leave only shuffled fragments of it in each. Whole sentences are
# assigned to one side and the original token order is kept.
train_sentence_ids, test_sentence_ids = train_test_split(X["sentence_id"].unique(), test_size =0.2)
in_train = X["sentence_id"].isin(train_sentence_ids)
x_train, x_test = X[in_train], X[~in_train]
y_train, y_test = Y[in_train], Y[~in_train]

In [None]:
# Assemble the train and test frames (sentence_id, words, labels) by attaching
# the matching labels to each feature split.
train_data = x_train[["sentence_id", "words"]].assign(labels=y_train)
test_data = x_test[["sentence_id", "words"]].assign(labels=y_test)

In [None]:
from simpletransformers.ner import NERModel

In [None]:
# Distinct tag values present in the data; passed to the model as its label set.
label = data["labels"].unique().tolist()
label

In [None]:
import torch
import torch.nn
# Whether PyTorch can see a CUDA device; forwarded to the model as `use_cuda`.
cuda_available = torch.cuda.is_available()
cuda_available

In [None]:
# Training / evaluation settings handed to the NER model.
model_args = {
    'num_train_epochs': 1,
    'learning_rate': 1e-4,
    'overwrite_output_dir': True,
    'train_batch_size': 32,
    'eval_batch_size': 32,
}

# BERT token classifier initialised from the local bert-base-cased checkpoint.
model = NERModel(
    'bert',
    '../input/huggingface-bert/bert-base-cased',
    labels=label,
    args=model_args,
    use_cuda=cuda_available,
)

In [None]:
# Fine-tune the model on the training split.
model.train_model(train_data)

In [None]:
# Evaluate on the held-out split and display the returned metrics.
result, model_outputs, preds_list = model.eval_model(test_data)
result

In [None]:
# Prediction strings for the sample-submission rows, filled in further down.
predicts = []

In [None]:
def predict(text, max_length=350):
    """
    Run the NER model over `text` and return the words it tagged 'B' or 'I'.

    The text is cut into chunks of at most `max_length` characters, preferably
    at a space. All chunks are sent to the module-level `model` in one call.

    Parameters
    ----------
    text : str
        Text to analyse.
    max_length : int
        Maximum chunk length in characters.

    Returns
    -------
    str
        The distinct tagged words, in order of first appearance, joined by a
        single space. Empty when nothing was tagged.
    """
    # --- split the text into chunks -------------------------------------
    list_of_lines = []
    remaining = text
    while len(remaining) > max_length:
        cut = remaining.rfind(' ', 0, max_length)
        if cut == -1:
            # No space inside the window: cut hard at max_length. Without this
            # the remainder would never shrink and the loop would not end.
            list_of_lines.append(remaining[:max_length])
            remaining = remaining[max_length:]
        else:
            list_of_lines.append(remaining[:cut])
            remaining = remaining[cut + 1:]
    list_of_lines.append(remaining)

    # Chunks without any word carry nothing to predict on.
    list_of_lines = [line for line in list_of_lines if line.strip()]
    if not list_of_lines:
        return ""

    # --- one batched model call instead of one call per chunk ------------
    predictions, _ = model.predict(list_of_lines)

    # --- collect tagged words, de-duplicated in first-seen order ---------
    # (a plain set would make the word order differ from run to run)
    seen_words = set()
    tagged_words = []
    for sentence_prediction in predictions:
        for token_prediction in sentence_prediction:
            for word, tag in token_prediction.items():
                if tag in ('B', 'I') and word not in seen_words:
                    seen_words.add(word)
                    tagged_words.append(word)

    return " ".join(tagged_words)

In [None]:
# One prediction string per sample-submission row. The list is rebuilt from
# scratch so that re-running this cell cannot append duplicate results.
predicts = [predict(text) for text in tqdm(sample_sub['text'])]

In [None]:
# Show the prediction strings.
predicts