In [None]:
# Change to the working directory - local machine
# Run this cell only when working locally; the next cell points at the server path instead.
%cd '/home/abaric/TakeLab/projects/Retriever/Sentiment/'

In [None]:
# Change to the working directory - server
# Run this cell only on the server; the previous cell points at the local path instead.
%cd '/home/abaric/retriever-sentiment'

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

# NER detection
______________________________________________________________________________________________________________________________________________________________________________

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch

from tqdm import tqdm
import time
import pandas as pd

## Import data

In [None]:
# Load the Retriever article sample; later cells read its 'id' and 'title' columns.
df = pd.read_csv('data/retriever/sample_20220825.csv')

In [None]:
# Display the loaded sample for a quick visual check
df

## NER pipeline set-up

In [None]:
# Set up GPU
# Select the first CUDA device when one is available, otherwise fall back to
# the CPU. The diagnostics query the selected device itself rather than a
# hard-coded index, so they work on single-GPU machines and always describe
# the GPU that is actually used.

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(device))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(device)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(device)/1024**3,1), 'GB')

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

### Import tokenizer, model and pipeline

In [None]:
# Load BERTić tokenizer and model
# The model is moved to the `device` chosen in the GPU set-up cell instead of a
# hard-coded 'cuda', so the notebook also runs on CPU-only machines.
tokenizer = AutoTokenizer.from_pretrained("classla/bcms-bertic-ner")
model = AutoModelForTokenClassification.from_pretrained("classla/bcms-bertic-ner").to(device)

# Set up ner pipeline
# The pipeline takes a GPU ordinal, or -1 for CPU; derive it from `device`.
pipeline_device = (device.index or 0) if device.type == "cuda" else -1
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first", device=pipeline_device)

### NER detection function

In [None]:
def detect_ner(ner_pipeline, df):
    """Run NER over every title in ``df`` and return one row per detected entity.

    Args:
        ner_pipeline: Callable applied to a single title. It must return an
            iterable of dicts with the keys 'word', 'entity_group', 'start'
            and 'end' (an empty result means no entity was detected).
        df: DataFrame with an 'id' and a 'title' column. Its index is stored
            as 'title_id'.

    Returns:
        DataFrame with the columns 'title_id', 'article_id', 'title', 'ner',
        'ner_type', 'ner_begin' and 'ner_end'. The id and offset columns are
        cast to int. When nothing is detected an empty frame with the same
        columns is returned.
    """
    columns = ['title_id', 'article_id', 'title', 'ner', 'ner_type', 'ner_begin', 'ner_end']
    int_columns = ['title_id', 'article_id', 'ner_begin', 'ner_end']

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []

    # iterrows() has no length, so pass the total for a usable progress bar
    for idx, row in tqdm(df.iterrows(), total=len(df)):

        # A title without entities yields an empty result and adds no rows
        for ner in ner_pipeline(row['title']) or []:
            records.append({'title_id': idx,
                            'article_id': row['id'],
                            'title': row['title'],
                            'ner': ner['word'],
                            'ner_type': ner['entity_group'],
                            'ner_begin': ner['start'],
                            'ner_end': ner['end']})

    # Passing the columns explicitly keeps the schema even with no records
    ner_df = pd.DataFrame(records, columns=columns)

    # Set id and index type as int
    return ner_df.astype({column: int for column in int_columns})

## Detect NER

In [None]:
# Run the NER pipeline over every title of the loaded sample
ner_df = detect_ner(ner_pipeline, df)

In [None]:
# Display the detected entities
ner_df

## Emphasize NER

In [None]:
def emphasize_named_entity(text, begin_ner, end_ner, start_string, end_string):
    """Wrap the span ``text[begin_ner:end_ner]`` in the given markers.

    Args:
        text: Text containing the named entity.
        begin_ner: Offset of the first character of the entity.
        end_ner: Offset one past the last character of the entity.
        start_string: Marker inserted directly before the entity.
        end_string: Marker inserted directly after the entity.

    Returns:
        ``text`` with ``start_string`` and ``end_string`` placed around the entity.
    """
    before = text[:begin_ner]
    entity = text[begin_ner:end_ner]
    after = text[end_ner:]
    return before + start_string + entity + end_string + after

In [None]:
# Wrap every detected entity in <strong> tags inside its title.
# A comprehension over the three columns is used instead of a row-wise
# DataFrame.apply: it avoids building a Series per row, and it still yields
# a valid (empty) column when ner_df has no rows, where apply(axis=1)
# returns a DataFrame instead of a Series.
ner_df['ner_text'] = [
    emphasize_named_entity(title, begin, end, '<strong>', '</strong>')
    for title, begin, end in zip(ner_df['title'], ner_df['ner_begin'], ner_df['ner_end'])
]
ner_df

### Set document_id

In [None]:
# Use the current row index of ner_df as the document identifier
ner_df = ner_df.assign(document_id=ner_df.index)

### Rearrange columns

In [None]:
# The emphasized title is stored as 'ner_text' when it is created, but the
# column order below expects 'text'. Rename it first so the selection does not
# raise a KeyError (rename ignores a missing 'ner_text', so a re-run is safe).
ner_df = ner_df.rename(columns={'ner_text': 'text'})[
    ['article_id', 'title_id', 'document_id', 'title', 'text', 'ner', 'ner_begin', 'ner_end', 'ner_type']
]

In [None]:
# Display the frame with the rearranged columns
ner_df

### Discard inadequate NERs

In [None]:
# Length of every detected entity string, computed once for the checks below
ner_length = ner_df['ner'].apply(len)

# discard one-character NERs
is_single_char = ner_length == 1
# discard portal names as detected NERs
is_portal_name = ner_df['ner'].isin(['N1', 'narod', 'narod.hr', 'Novi list', 'Dnevno.hr'])
# discard titles that contain only NER
covers_whole_title = ner_length == ner_df['title'].apply(len)
# discard NERs that contain two characters and are lowercase
is_short_lowercase = (ner_length == 2) & (ner_df['ner'].str.islower())

# Keep only the rows that trigger none of the discard rules
ner_df = ner_df[~(is_single_char | is_portal_name | covers_whole_title | is_short_lowercase)]

In [None]:
# Display the entities that remain after filtering
ner_df


## Save to CSV

Merge `ner_df` with the article metadata from the Retriever sample.

In [None]:
# Read the article sample and expose its 'id' column as 'article_id'
# so that it shares a key column with ner_df.
articles_df = (
    pd.read_csv('data/retriever/sample_20220825.csv')
    .rename(columns={'id': 'article_id'})
)

In [None]:
# Attach portal, publication date, body and URL to every entity row via 'article_id'
metadata_columns = ['article_id', 'portal', 'date_published', 'body', 'url']
ner_df = ner_df.merge(articles_df[metadata_columns], on='article_id')

In [None]:
# Display the entities together with the merged article metadata
ner_df

In [None]:
# Write the full entity table to disk; index=False leaves the DataFrame index out of the file
ner_df.to_csv('data/headlines_ner.csv', index=False)

### Sample ner_df for Alanno
This sample will be used for annotation in the production rounds in Alanno.

In [None]:
# Draw the annotation sample.
# A fixed seed makes the sample reproducible on a re-run, and the size is
# capped at the number of available rows so that a frame with fewer than
# SAMPLE_SIZE rows does not raise.
SAMPLE_SIZE = 3000
SAMPLE_SEED = 42
sampled_ner_df = ner_df.sample(n=min(SAMPLE_SIZE, len(ner_df)), random_state=SAMPLE_SEED)

In [None]:
# Write the annotation sample to disk; index=False leaves the DataFrame index out of the file
sampled_ner_df.to_csv('data/sampled_headlines_ner.csv', index=False)