In [None]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd

import json
from tqdm.notebook import tqdm, trange
from pprint import pprint
import random
from collections import Counter, OrderedDict, defaultdict
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

In [None]:
# Category prefix -> integer class id. All physics-related prefixes share
# id 4, so the mapping covers eight distinct class ids (0-7).
TARGET_DICT = {
    'cs': 0,
    'econ': 1,
    'eess': 2,
    'math': 3,
    **dict.fromkeys(
        (
            'astro-ph', 'cond-mat', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph',
            'hep-th', 'math-ph', 'nlin', 'nucl-ex', 'nucl-th', 'physics',
            'quant-ph',
        ),
        4,
    ),
    'q-bio': 5,
    'q-fin': 6,
    'stat': 7,
}

def prefix2target(prefix: str) -> int:
    """Return the class id stored for ``prefix`` in ``TARGET_DICT``, or -1 when it is not a key."""
    if prefix in TARGET_DICT:
        return TARGET_DICT[prefix]
    return -1

# Class id -> human-readable label; each id is the label's position in the list.
TARGET_IND2LABEL = dict(enumerate([
    'Computer Science',
    'Economics',
    'Electrical Engineering and Systems Science',
    'Mathematics',
    'Physics',
    'Quantitative Biology',
    'Quantitative Finance',
    'Statistics',
]))

In [None]:
# Load the JSON records and discard the metadata columns listed below.
data_pd = pd.read_json("../arxivData.json")
data_pd.drop(
    columns=['author', 'day', 'id', 'link', 'month', 'year'],
    inplace=True,
)

In [None]:
def tag_preproc(tag: str) -> list:
    """Convert a serialized tag list into one or two target class ids.

    ``tag`` is a string holding a list of dicts, each with a ``'term'`` entry.
    The part of every term before the first ``'.'`` is looked up in
    ``TARGET_DICT``; terms whose prefix is not a key there are skipped.

    Args:
        tag: serialized list of tag dicts.

    Returns:
        ``[class_id]`` when a single class occurs, or when one class is
        strictly more frequent than the runner-up; otherwise the two most
        common class ids ``[first, second]`` (they are tied).

    Raises:
        ValueError: if none of the terms has a prefix known to ``TARGET_DICT``.
    """
    # Single quotes become double quotes and None becomes null so the string
    # parses as JSON (presumably the input is a Python-repr'd list -- confirm).
    # NOTE: this textual rewrite breaks on values that themselves contain quotes.
    entries = json.loads(tag.replace("'", '"').replace("None", "null"))
    prefixes = (entry['term'].split('.')[0] for entry in entries)
    target_cnt = Counter(
        prefix2target(prefix) for prefix in prefixes if prefix in TARGET_DICT
    )

    ranked = target_cnt.most_common(2)
    if not ranked:
        # Previously this case failed with an opaque tuple-unpacking error.
        raise ValueError(f"no known category prefix in tag: {tag!r}")
    if len(ranked) == 1 or ranked[0][1] > ranked[1][1]:
        return [ranked[0][0]]
    return [ranked[0][0], ranked[1][0]]

In [None]:
# Derive the class-id list of every record, then discard the raw tag column.
data_pd['target'] = data_pd['tag'].map(tag_preproc)
data_pd.drop(columns=["tag"], inplace=True)
# Line breaks inside titles are removed (nothing is inserted in their place).
data_pd['title'] = data_pd['title'].map(lambda title: title.replace("\n", ""))
data_pd.rename(columns={'summary': 'abstract'}, inplace=True)
# Keep exactly these three columns, in this order.
data_pd = data_pd.loc[:, ['title', 'abstract', 'target']]

In [None]:
# Second data source, read from a CSV file one directory up.
new_train = pd.read_csv(filepath_or_buffer="../train.csv")

In [None]:
def new_data_target_preproc(row, rng=None):
    """Collect the class ids whose label column is set in ``row``.

    For every ``(class_id, label)`` pair of ``TARGET_IND2LABEL`` the id is
    kept when ``label`` is present in ``row`` and the value stored under it is
    truthy. When more than two ids qualify, two of them are picked at random.

    Args:
        row: record supporting ``in`` and ``[]`` by label (a DataFrame row
            when used through ``DataFrame.apply(..., axis=1)``).
        rng: optional object exposing ``shuffle`` (e.g. ``random.Random``).
            Defaults to the global ``random`` module, i.e. unseeded behaviour.

    Returns:
        List with at most two class ids.
    """
    shuffler = random if rng is None else rng
    target = [
        key for key, val in TARGET_IND2LABEL.items()
        if val in row and row[val]
    ]
    if len(target) > 2:
        shuffler.shuffle(target)
        target = target[:2]
    return target


# A dedicated seeded generator makes the random truncation reproducible
# between runs without touching the global ``random`` state; DataFrame.apply
# forwards the extra keyword argument to the function.
new_train['target'] = new_train.apply(
    new_data_target_preproc, axis=1, rng=random.Random(42)
)

In [None]:
# Remove the ID column and the per-label columns.
new_train.drop(
    columns=[
        "ID",
        "Computer Science",
        "Physics",
        "Mathematics",
        "Statistics",
        "Quantitative Biology",
        "Quantitative Finance",
    ],
    inplace=True,
)

In [None]:
# Lower-case the text column names.
new_train.rename(
    columns=dict(TITLE='title', ABSTRACT='abstract'),
    inplace=True,
)

In [None]:
# Title arrays of both sources.
old_titles, new_titles = data_pd['title'].values, new_train['title'].values
# Displayed: size of each array and the number of distinct titles they share.
(
    len(old_titles),
    len(new_titles),
    len(set(new_titles) & set(old_titles)),
)

(41000, 20972, 2007)

In [None]:
# Positions of the new_train rows whose title does not occur in old_titles.
# Series.isin builds its lookup from old_titles once, so this is a single
# vectorized pass instead of scanning the whole array for every row.
is_new_title = ~new_train['title'].isin(old_titles)
# Plain Python ints, suitable for positional indexing with .iloc.
inds_to_stay = np.flatnonzero(is_new_title.to_numpy()).tolist()

0it [00:00, ?it/s]

In [None]:
# Stack the retained new_train rows on top of data_pd and renumber the index.
data = pd.concat(
    objs=(new_train.iloc[inds_to_stay], data_pd),
    axis='index',
    ignore_index=True,
)

In [None]:
# Rows carrying more than one target id are set aside in two identical
# copies and removed from `data`.
multi_target_inds = data['target'].apply(len) > 1
data_mult_0 = data.loc[multi_target_inds].copy()
data_mult_1 = data.loc[multi_target_inds].copy()
# Filter with the boolean mask itself: the previous drop() passed positional
# offsets as index labels, which is only correct for a 0..n-1 index.
data = data.loc[~multi_target_inds].copy()

In [None]:
# The first copy keeps only the first id, the second copy only the second id;
# both are then put back in front of the remaining rows.
data_mult_0['target'] = data_mult_0['target'].map(lambda ids: ids[:1])
data_mult_1['target'] = data_mult_1['target'].map(lambda ids: ids[1:2])
data = pd.concat(
    (data_mult_0, data_mult_1, data),
    axis='index',
    ignore_index=True,
)

In [None]:
# Number of rows that still hold more than one target id.
data['target'].apply(len).gt(1).values.sum()

0

In [None]:
# Unwrap each target list into its first element.
data['target'] = data['target'].map(lambda ids: ids[0])
# Line breaks inside abstracts become single spaces.
data['abstract'] = data['abstract'].map(lambda text: text.replace("\n", " "))
# Terminate every title with a period.
data['title'] = data['title'] + "."

In [None]:
DATA_LEN = data.shape[0]
# One seeded generator shared by both draws keeps the samples reproducible
# between runs. A shared generator is used rather than the same integer seed
# for both calls, which could make the two samples overlap.
_sample_rng = np.random.RandomState(42)
# 10% of the rows are sampled for one subset and 20% for the other.
data_empty_title = data.sample(int(DATA_LEN * 0.1), random_state=_sample_rng).copy()
data_empty_abstract = data.sample(int(DATA_LEN * 0.2), random_state=_sample_rng).copy()

In [None]:
# Blank out the title in one subset and the abstract in the other.
data_empty_title['title'] = ""
data_empty_abstract['abstract'] = ""

In [None]:
# Append both augmented subsets to the full data, then shuffle all rows.
data = pd.concat([data, data_empty_title, data_empty_abstract], axis=0, ignore_index=True)
# A fixed random_state makes the shuffled row order reproducible between runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Title and abstract are joined, with no separator, into one 'text' field.
data['text'] = data['title'].add(data['abstract'])
data.drop(columns=['title', 'abstract'], inplace=True)
# Final column order: text first, then target.
data = data.loc[:, ['text', 'target']]

In [None]:
# Write the prepared table to disk without the index column.
data.to_csv(path_or_buf='data_base.csv', index=False)