In [4]:
import pandas as pd
import numpy as np

# Load the training set; lines=True means the file holds one JSON object per line.
df = pd.read_json('data/train.jsonl', lines=True)
# Preview the first rows (the rendered output shows the columns `ners`, `sentences`, `id`).
df.head()

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4


In [5]:
def split_ners(ners, sentences):
    """Distribute document-level entity spans over the document's sentences.

    The document is assumed to be ``'\\n'.join(sentences)``, so consecutive
    sentences are separated by exactly one character. Entity end offsets are
    treated as inclusive (the same convention ``get_better_dataset`` uses when
    it slices ``ne[0]:ne[1] + 1``).

    Args:
        ners: iterable of ``[start, end, label]`` spans with offsets relative
            to the whole document.
        sentences: list of sentence strings in document order.

    Returns:
        A list with one entry per sentence; each entry is a list of freshly
        built ``[start, end, label]`` spans whose offsets are relative to that
        sentence. Spans that do not lie entirely inside a single sentence
        (e.g. spans crossing a newline) are dropped. An empty ``sentences``
        list yields ``[]``.
    """
    splitted_ners = []
    start = 0  # document offset of the first character of the current sentence
    for sentence in sentences:
        end = start + len(sentence)  # exclusive document offset of the sentence end
        # Inclusive entity end must be strictly below the exclusive sentence end;
        # `<=` would admit spans that run onto the separating newline.
        splitted_ners.append([
            [ne[0] - start, ne[1] - start, ne[2]]
            for ne in ners
            if ne[0] >= start and ne[1] < end
        ])
        start = end + 1  # skip the '\n' separator
    return splitted_ners

def combine_ners(splitted_ners, sentences):
    """Merge per-sentence entity spans back into document-level spans.

    Inverse of ``split_ners``: offsets of sentence ``i`` are shifted by the
    total length of the preceding sentences plus one separator character per
    preceding sentence.

    Args:
        splitted_ners: list (one entry per sentence) of ``[start, end, label]``
            spans with sentence-relative offsets.
        sentences: list of sentence strings in document order.

    Returns:
        A new flat list of ``[start, end, label]`` spans with document-relative
        offsets. The input lists are not modified. Empty input yields ``[]``.
    """
    if not splitted_ners:
        return []
    # Copy the first sentence's list: extending it in place would silently
    # grow the caller's `splitted_ners[0]`.
    combined_ners = list(splitted_ners[0])
    offset = 0
    for i in range(1, len(sentences)):
        offset += len(sentences[i - 1]) + 1  # +1 for the '\n' separator
        combined_ners.extend(
            [ne[0] + offset, ne[1] + offset, ne[2]] for ne in splitted_ners[i]
        )
    return combined_ners

def split_dataframe(df):
    """Explode a document-level frame into one row per sentence.

    Each row's ``sentences`` text is split on newlines and its ``ners`` spans
    are re-based onto the individual sentences via ``split_ners``.

    Returns:
        A DataFrame with columns ``ners`` and ``sentences`` (in that order),
        one row per sentence across all documents.
    """
    all_ners, all_sentences = [], []
    for _, record in df.iterrows():
        parts = record['sentences'].split('\n')
        all_ners += split_ners(record['ners'], parts)
        all_sentences += parts
    return pd.DataFrame({'ners': all_ners, 'sentences': all_sentences})

def get_better_dataset(df):
    """Render each row's entity spans as human-readable ``text = LABEL`` lines.

    For every span ``[start, end, label]`` the covered text is taken as
    ``sentence[start:end + 1]`` (i.e. ``end`` is inclusive), and all spans of a
    row are joined with newlines. A row without spans gets an empty string.

    Returns:
        A DataFrame with columns ``ners`` (the rendered string) and
        ``sentences`` (unchanged text), indexed 0..n-1.
    """
    rendered_col = []
    sentence_col = []
    for _, record in df.iterrows():
        text = record['sentences']
        sentence_col.append(text)
        rendered = [
            f"{text[span[0]:span[1] + 1]} = {span[2]}" for span in record['ners']
        ]
        rendered_col.append('\n'.join(rendered))
    columns = {'ners': rendered_col, 'sentences': sentence_col}
    return pd.DataFrame(columns, index=range(len(rendered_col)))

def get_ners_from_string(ners_string, sentence):
    """Parse ``text = LABEL`` lines back into ``[start, end, label]`` spans.

    This is the inverse of the rendering done by ``get_better_dataset``.

    Args:
        ners_string: newline-separated ``text = LABEL`` lines. Blank lines
            (including a completely empty string, which is what an entity-free
            sentence is rendered as) are ignored.
        sentence: the text the spans refer to.

    Returns:
        A list of ``[start, end, label]`` with an inclusive ``end``. ``start``
        is the first occurrence of the text in ``sentence``.

    Raises:
        ValueError: if a non-blank line has no ``' = '`` separator or its text
            does not occur in ``sentence``.
    """
    ners = []
    for line in ners_string.split('\n'):
        if not line.strip():
            continue
        # Split on the LAST separator so surface text containing ' = ' survives.
        word, entity = line.rsplit(' = ', 1)
        start = sentence.index(word)
        ners.append([start, start + len(word) - 1, entity])
    return ners

def preprocess_df(df, train=True):
    """Turn a document-level frame into per-sentence rows with rendered labels.

    The frame is first exploded with ``split_dataframe``. When ``train`` is
    true, rows whose sentence is the empty string are removed and the index is
    renumbered; otherwise every row (including empty sentences) is kept. The
    result is then passed through ``get_better_dataset``.
    """
    per_sentence = split_dataframe(df)
    if train:
        # Mark empty sentences as missing so dropna() removes those rows.
        per_sentence['sentences'] = per_sentence['sentences'].replace('', np.nan)
        per_sentence = per_sentence.dropna().reset_index(drop=True)
    return get_better_dataset(per_sentence)

In [6]:
# Build the per-sentence training frame (train=True by default, so empty
# sentences are dropped) with entities rendered as "text = LABEL" lines.
new_df = preprocess_df(df)

In [10]:
# System prompt used both for the fine-tuning examples and at inference time.
# The value must stay byte-for-byte stable (including the space before each
# line break), otherwise inference no longer matches what the model saw.
# Written as explicit string pieces so editors that strip trailing whitespace
# cannot silently alter it; there are no placeholders, so no f-string is needed.
# NOTE(review): the prompt says sentences are "enclosed in square brackets",
# but the user messages in this notebook are passed without brackets — confirm
# whether that is intended before training a new model version.
system_prompt = (
    "Perform Nested NER for sentences from Russian newspaper enclosed in square brackets. \n"
    "For the given sentence you should provide the list of tokens = entity. \n"
    "There are a total of 29 named entity types: AGE, AWARD, CITY, COUNTRY, CRIME, DATE, DISEASE, \n"
    "DISTRICT, EVENT, FACILITY, FAMILY, IDEOLOGY, LANGUAGE, LAW, LOCATION, MONEY, NATIONALITY, NUMBER, ORDINAL, \n"
    "ORGANIZATION, PENALTY, PERCENT, PERSON, PRODUCT, PROFESSION, RELIGION, STATE_OR_PROVINCE, TIME, WORK_OF_ART. \n"
    "The maximum depth of nesting is 6."
)

In [13]:
# One chat-format training example per sentence: system prompt, the raw
# sentence as the user turn, and the rendered "text = LABEL" lines as the
# assistant turn.
dataset = [
    {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": record['sentences']},
            {"role": "assistant", "content": record['ners']},
        ]
    }
    for _, record in new_df.iterrows()
]

dataset[:1]

[{'messages': [{'role': 'system',
    'content': 'Perform Nested NER for sentences from Russian newspaper enclosed in square brackets. \nFor the given sentence you should provide the list of tokens = entity. \nThere are a total of 29 named entity types: AGE, AWARD, CITY, COUNTRY, CRIME, DATE, DISEASE, \nDISTRICT, EVENT, FACILITY, FAMILY, IDEOLOGY, LANGUAGE, LAW, LOCATION, MONEY, NATIONALITY, NUMBER, ORDINAL, \nORGANIZATION, PENALTY, PERCENT, PERSON, PRODUCT, PROFESSION, RELIGION, STATE_OR_PROVINCE, TIME, WORK_OF_ART. \nThe maximum depth of nesting is 6.'},
   {'role': 'user',
    'content': 'Бостон взорвали Тамерлан и Джохар Царнаевы из Северного Кавказа'},
   {'role': 'assistant',
    'content': 'Бостон = CITY\nТамерлан = PERSON\nЦарнаевы = PERSON\nСеверного Кавказа = LOCATION\nвзорвали = EVENT\nДжохар Царнаевы = PERSON\nТамерлан и Джохар Царнаевы = FAMILY\nКавказа = LOCATION'}]}]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for validation. A fixed random_state makes the
# shuffle (and therefore the saved train/val files) reproducible across
# Restart & Run All.
train_dataset, val_dataset = train_test_split(
    dataset, test_size=0.10, shuffle=True, random_state=42
)
len(train_dataset), len(val_dataset)

(4280, 476)

In [8]:
import json

def save_jsonl(filename, data):
    """Write ``data`` to ``<filename>.jsonl``, one JSON document per line.

    Args:
        filename: output path without the ``.jsonl`` extension.
        data: iterable of JSON-serialisable items.
    """
    target = f'{filename}.jsonl'
    with open(target, 'w') as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

In [16]:
# Persist both splits; save_jsonl appends the extension, so this writes
# runner_train.jsonl and runner_val.jsonl in the working directory.
save_jsonl('runner_train', train_dataset)
save_jsonl('runner_val', val_dataset)

In [18]:
# Load the test set (JSON Lines). Note: per the preview output, the text column
# in this file is spelled `senences`, and the code below relies on that spelling.
test_df = pd.read_json('data/test.jsonl', lines=True)
test_df.head()

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


In [19]:
def split_test_dataframe(df):
    """Explode test documents into one row per newline-separated sentence.

    The document text is read from the ``senences`` column (spelled as in the
    test file). Every resulting sentence row carries the ``id`` of the document
    it came from; empty sentences are kept.

    Returns:
        A DataFrame with columns ``id`` and ``sentences``.
    """
    ids, sentences = [], []
    for _, record in df.iterrows():
        parts = record['senences'].split('\n')
        ids += [record['id']] * len(parts)
        sentences += parts
    return pd.DataFrame({'id': ids, 'sentences': sentences})

In [20]:
# One row per sentence, each tagged with its source document id. Empty
# sentences are kept here (see row 3 of the preview output).
test_dataset = split_test_dataframe(test_df)
test_dataset.head()

Unnamed: 0,id,sentences
0,584,Владелец «Бирмингема» получил шесть лет тюрьмы
1,584,мини|слева|«Сент-Эндрюс» — домашний стадион фу...
2,584,"В пятницу, 7 марта суд Гонконга приговорил вла..."
3,584,
4,584,54-летний бизнесмен был признан виновным в отм...


In [26]:
from openai import OpenAI
from tqdm.notebook import tqdm

# No credentials are passed explicitly; presumably the client reads the API key
# from the environment — verify before running.
client = OpenAI()
# Raw model output per sentence row, parallel lists keyed by document id.
preds = {'id': [], 'result': []}

# One chat-completion request per sentence row of the test set.
# NOTE(review): no sampling parameters (e.g. temperature) are set, so the
# service defaults apply — outputs may differ between runs; confirm intended.
# NOTE(review): there is no retry/error handling; an exception aborts the loop,
# leaving `preds` with the rows collected so far.
for i, row in tqdm(test_dataset.iterrows(), total=len(test_dataset)):
    if len(row['sentences']) == 0:
        # Empty sentence: skip the API call and record an empty prediction so
        # rows stay aligned with test_dataset.
        res = ''
    else:
        completion = client.chat.completions.create(
            model="ft:gpt-3.5-turbo-1106:personal:runner-v3:9Ix5FMcY",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": row['sentences']}
            ]
        )
        # Only the first returned choice is used.
        res = completion.choices[0].message.content
    preds['id'].append(row['id'])
    preds['result'].append(res)

  0%|          | 0/65 [00:00<?, ?it/s]

In [27]:
# Snapshot the raw per-sentence model outputs to disk (preds_tmp.csv).
# NOTE(review): later cells use the in-memory frame, not the CSV; if the CSV is
# ever reloaded, empty `result` strings may come back as NaN — verify.
preds_tmp_df = pd.DataFrame(preds)
preds_tmp_df.to_csv('preds_tmp.csv')

In [50]:
# Closed inventory of the 29 entity labels; predictions with any other label
# are rejected during post-processing.
named_entities = set(
    """
    AGE AWARD CITY COUNTRY CRIME DATE DISEASE DISTRICT EVENT
    FACILITY FAMILY IDEOLOGY LANGUAGE LAW LOCATION MONEY NATIONALITY
    NUMBER ORDINAL ORGANIZATION PENALTY PERCENT PERSON PRODUCT PROFESSION
    RELIGION STATE_OR_PROVINCE TIME WORK_OF_ART
    """.split()
)

In [86]:
def postprocess_ners(ners, sentence):
    """Convert raw ``text = LABEL`` prediction lines into character spans.

    Malformed lines are skipped rather than raising, because the lines come
    from free-form model output. A line is skipped when it is not a string,
    has no ``' = '`` separator, has empty surface text, carries a label that is
    not in the module-level ``named_entities`` set (compared case-insensitively),
    or its text cannot be found in ``sentence``.

    Repeated predictions of the same (text, label) pair are mapped to
    successive occurrences of the text; once no further occurrence exists the
    repeat is dropped, so the result never contains duplicate spans.

    Args:
        ners: iterable of prediction lines.
        sentence: the text in which the spans are located.

    Returns:
        A list of ``[start, end, label]`` with an inclusive ``end``, in the
        order the predictions were given.
    """
    new_ners = []
    # (text, label) -> offset from which the next search for that pair starts
    resume_at = {}
    for ne in ners:
        if not isinstance(ne, str):
            continue
        # Split on the last separator so surface text containing ' = ' is kept.
        word, separator, entity = ne.rpartition(' = ')
        if not separator or not word:
            continue
        entity = entity.strip().upper()
        if entity not in named_entities:
            continue
        key = (word, entity)
        start = sentence.find(word, resume_at.get(key, 0))
        if start == -1:
            continue
        # Advance by one (not len(word)) so overlapping occurrences are reachable.
        resume_at[key] = start + 1
        new_ners.append([start, start + len(word) - 1, entity])
    return new_ners

In [87]:
# Regroup the per-sentence predictions by document: consecutive rows sharing
# an id are merged into one flat list of prediction lines.
preds = {'id': [], 'tmp_ners': []}
ners = []
for i, row in preds_tmp_df.iterrows():
    predicted_lines = row['result'].split('\n')
    continues_document = len(preds['id']) > 0 and row['id'] == preds['id'][-1]
    if not continues_document:
        # First sentence of a new document: open a fresh bucket for it.
        preds['id'].append(row['id'])
        preds['tmp_ners'].append([])
    preds['tmp_ners'][-1].extend(predicted_lines)
    

In [88]:
# Build the submission: for every test document, resolve its collected
# prediction lines into character spans over the full document text.
subm = [
    {
        'id': row['id'],
        'ners': postprocess_ners(preds['tmp_ners'][i], row['senences']),
    }
    for i, row in test_df.iterrows()
]

In [89]:
# Write the submission as JSON Lines; save_jsonl appends the extension, so the
# output file is test.jsonl.
save_jsonl('test', subm)