In [2]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
from scipy.optimize import minimize, minimize_scalar
import torch.nn as nn
import os
from random import shuffle
import transformers as T
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
import tqdm

In [2]:
# Suppress every warning for the rest of the session: no category, message
# or module filter is given, so nothing is exempt.
warnings.filterwarnings("ignore")

In [3]:
def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # This only reaches subprocesses started later: the hash seed of the
    # running interpreter is fixed at start-up, so str/set ordering inside
    # this process is not changed by the assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can pick different algorithms from run to run;
    # disable it so algorithm selection is repeatable as well.
    torch.backends.cudnn.benchmark = False

# Notebook-wide seed. get_train_data also reads this global as the
# random_state of its fold split.
seed = 471
seed_torch(seed)

In [4]:
# Directory the input CSV files are read from.
DATA_DIR = './dataset/data1'
# The first CSV column becomes the row index of train and test.
train = pd.read_csv(DATA_DIR  +"/train.csv", index_col=0)
test = pd.read_csv(DATA_DIR + "/test.csv", index_col=0)
# The sample submission is read without a header row, so its two columns
# are named explicitly afterwards.
sub = pd.read_csv(DATA_DIR + "/sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [5]:
def get_train_data(train):
    """Add a stratified 5-fold ``fold`` column to ``train`` and return it.

    The frame is modified in place and the same object is returned. Folds
    are stratified on the ``judgement`` column and shuffled with the
    module-level ``seed``.

    NOTE(review): rows are assigned to folds independently of each other, so
    augmented rows that share a ``pre_id`` can land in different folds.
    Confirm this is intended before relying on the folds for validation.

    Args:
        train: DataFrame with a ``judgement`` label column.

    Returns:
        The same DataFrame with a ``uint8`` ``fold`` column holding 0..4.
    """
    # Assign cross-validation fold numbers.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # split() yields positional indices, so they are collected in a
    # positional array rather than used as .loc labels. This stays correct
    # when the frame's index is not 0..n-1.
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        fold_ids[val_index] = n
    train["fold"] = fold_ids

    return train

def get_test_data(test):
    """Hand the test frame back as-is; no processing is applied.

    Args:
        test: Object to return.

    Returns:
        The very same object that was passed in.
    """
    unchanged = test
    return unchanged

In [6]:
# Build the model input by joining title and abstract. Missing values on
# either side are treated as empty strings so "text" is never NaN (a NaN
# title would otherwise make the whole concatenation NaN).
train["text"] = train["title"].fillna("") + " " + train["abstract"].fillna("")
test["text"] = test["title"].fillna("") + " " + test["abstract"].fillna("")

In [7]:

# Stop words: tokens that synonym_replacement never picks for replacement.
# The final entry is the empty string, so blank tokens are skipped as well.
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
			'ours', 'ourselves', 'you', 'your', 'yours', 
			'yourself', 'yourselves', 'he', 'him', 'his', 
			'himself', 'she', 'her', 'hers', 'herself', 
			'it', 'its', 'itself', 'they', 'them', 'their', 
			'theirs', 'themselves', 'what', 'which', 'who', 
			'whom', 'this', 'that', 'these', 'those', 'am', 
			'is', 'are', 'was', 'were', 'be', 'been', 'being', 
			'have', 'has', 'had', 'having', 'do', 'does', 'did',
			'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
			'because', 'as', 'until', 'while', 'of', 'at', 
			'by', 'for', 'with', 'about', 'against', 'between',
			'into', 'through', 'during', 'before', 'after', 
			'above', 'below', 'to', 'from', 'up', 'down', 'in',
			'out', 'on', 'off', 'over', 'under', 'again', 
			'further', 'then', 'once', 'here', 'there', 'when', 
			'where', 'why', 'how', 'all', 'any', 'both', 'each', 
			'few', 'more', 'most', 'other', 'some', 'such', 'no', 
			'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
			'very', 's', 't', 'can', 'will', 'just', 'don', 
			'should', 'now', '']

#cleaning up text
import re
def get_only_chars(line):
    """Normalise ``line`` to lowercase ASCII letters separated by single spaces.

    Apostrophes (straight and curly) are dropped. Hyphens, tabs, newlines
    and every other character outside ``a-z`` become a space. Runs of spaces
    are collapsed and one leading space is removed; a trailing space, if
    any, is kept.

    Args:
        line: Text to clean.

    Returns:
        The cleaned string, or ``""`` when nothing survives the cleaning.
    """
    line = line.replace("’", "").replace("'", "")
    line = line.lower()

    # Everything that is not a lowercase ASCII letter or a space turns into
    # a space (this covers hyphens, tabs and newlines too).
    clean_line = re.sub('[^a-z ]', ' ', line)
    clean_line = re.sub(' +', ' ', clean_line)  # delete extra spaces

    # startswith() is safe on an empty string, unlike clean_line[0].
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet
#import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet 

def synonym_replacement(words, n):
    """Replace distinct non-stop-words of ``words`` with synonyms.

    Candidate words (those not in ``stop_words``) are visited in shuffled
    order. Each candidate that has at least one synonym has every one of its
    occurrences replaced by a single randomly chosen synonym. The loop stops
    once ``n`` words have been replaced.

    Args:
        words: List of tokens.
        n: Number of distinct words to replace at most.

    Returns:
        A new token list; ``words`` itself is not modified.
    """
    new_words = words.copy()
    # dict.fromkeys de-duplicates while keeping first-seen order. A set
    # would hand random.shuffle an order that depends on string hashing,
    # which can differ between interpreter runs even with a fixed seed.
    random_word_list = list(dict.fromkeys(
        word for word in words if word not in stop_words
    ))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            break

    # A synonym may consist of several words; join and re-split so every
    # element of the result is a single token again.
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')

    return new_words

def get_synonyms(word):
    """Collect the WordNet synonyms of ``word`` as cleaned lowercase strings.

    Lemma names have underscores and hyphens turned into spaces, are
    lowercased, and lose every character that is not a lowercase ASCII
    letter or a space.

    Args:
        word: Word to look up.

    Returns:
        List of distinct synonyms in the order WordNet yields them,
        excluding ``word`` itself and names that are blank after cleaning.
    """
    allowed = ' qwertyuiopasdfghjklzxcvbnm'
    # A dict serves as an insertion-ordered set. The order of a plain set of
    # strings depends on hashing and may change between interpreter runs,
    # which would make seeded random choices over the result unrepeatable.
    synonyms = {}
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed)
            # Skip the word itself and names with no letters left.
            if synonym.strip() and synonym != word:
                synonyms[synonym] = None
    return list(synonyms)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each token of ``words`` independently with probability ``p``.

    Args:
        words: List of tokens.
        p: Deletion probability per token.

    Returns:
        ``words`` itself when it holds at most one token; otherwise a new
        list of the surviving tokens, or a single randomly chosen token if
        none survived.
    """
    # Nothing can be deleted from an empty list (and randint(0, -1) below
    # would raise), and a single word is never deleted.
    if len(words) <= 1:
        return words

    # randomly delete words with probability p
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # if all words were deleted, return one random word instead
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Apply ``swap_word`` ``n`` times to a copy of ``words``.

    Args:
        words: List of tokens; left unmodified.
        n: Number of swap rounds.

    Returns:
        The copied list after all swap rounds.
    """
    swapped = words.copy()
    for _round in range(n):
        swapped = swap_word(swapped)
    return swapped

def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    A second index is drawn until it differs from the first. The list is
    returned unswapped once a fourth draw has been made, which is always the
    outcome for a one-element list.

    Args:
        new_words: List of tokens; modified in place.

    Returns:
        The same list object.
    """
    # randint(0, -1) would raise ValueError, and there is nothing to swap.
    if not new_words:
        return new_words

    last = len(new_words) - 1
    random_idx_1 = random.randint(0, last)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Call ``add_word`` ``n`` times on a copy of ``words``.

    Args:
        words: List of tokens; left unmodified.
        n: Number of insertion attempts.

    Returns:
        The copied list after all insertion attempts.
    """
    augmented = words.copy()
    for _attempt in range(n):
        add_word(augmented)
    return augmented

def add_word(new_words):
    """Insert a synonym of a random word of ``new_words`` at a random index.

    Words are sampled until one with at least one synonym is found; after
    10 unsuccessful samples the list is left unchanged.

    Args:
        new_words: List of tokens; modified in place.

    Returns:
        None.
    """
    # randint(0, -1) would raise ValueError on an empty list.
    if not new_words:
        return

    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    # Draw the synonym through the seeded generator instead of always taking
    # the first list element, whose identity depends on list order.
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Create augmented variants of ``sentence`` with up to four operations.

    The sentence is cleaned with ``get_only_chars`` first. Every enabled
    operation contributes ``int(num_aug/4)+1`` candidates; the candidates
    are cleaned, shuffled and trimmed, and the cleaned original sentence is
    appended last.

    Args:
        sentence: Text to augment.
        alpha_sr: Share of the word count used as the number of synonym
            replacements (at least 1); a value ``<= 0`` disables the step.
        alpha_ri: Same, for random insertions.
        alpha_rs: Same, for random swaps.
        p_rd: Per-word deletion probability; ``<= 0`` disables deletion.
        num_aug: When ``>= 1``, the maximum number of augmented sentences
            kept. Otherwise each candidate is kept with probability
            ``num_aug / number_of_candidates``.

    Returns:
        List of augmented sentences followed by the cleaned original. When
        the cleaned sentence contains no words, only the cleaned original
        is returned.
    """
    sentence = get_only_chars(sentence)
    # Compare by value: `is not ''` tests object identity, which is not
    # guaranteed for equal strings and raises a SyntaxWarning on Python 3.8+.
    words = [word for word in sentence.split(' ') if word != '']
    num_words = len(words)

    # Without words there is nothing to augment, and the helpers below
    # cannot handle an empty token list.
    if num_words == 0:
        return [sentence]

    augmented_sentences = []
    num_new_per_technique = int(num_aug/4)+1

    #sr
    if alpha_sr > 0:
        n_sr = max(1, int(alpha_sr*num_words))
        for _ in range(num_new_per_technique):
            a_words = synonym_replacement(words, n_sr)
            augmented_sentences.append(' '.join(a_words))

    #ri
    if alpha_ri > 0:
        n_ri = max(1, int(alpha_ri*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(words, n_ri)
            augmented_sentences.append(' '.join(a_words))

    #rs
    if alpha_rs > 0:
        n_rs = max(1, int(alpha_rs*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(' '.join(a_words))

    #rd
    if p_rd > 0:
        for _ in range(num_new_per_technique):
            a_words = random_deletion(words, p_rd)
            augmented_sentences.append(' '.join(a_words))

    augmented_sentences = [get_only_chars(candidate) for candidate in augmented_sentences]
    shuffle(augmented_sentences)

    #trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    elif augmented_sentences:
        # Guarded by the elif: with every technique disabled the list is
        # empty and the division below would fail.
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    #append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences

In [8]:
def set_eda(df):
    """Run ``eda`` on every row of ``df`` and collect the results in a frame.

    Args:
        df: DataFrame with ``text`` and ``judgement`` columns.

    Returns:
        DataFrame with one row per sentence returned by ``eda`` and the
        int64 columns ``judgement`` (label of the source row), ``eda``
        (always 1) and ``pre_id`` (index label of the source row), plus the
        ``text`` column. The index is 0..n-1.
    """
    records = []
    rows = zip(df.index.tolist(), df['text'].tolist(), df['judgement'].tolist())
    for pre_id, line, label in tqdm.tqdm(rows, total=len(df)):
        new_line = eda(line, alpha_sr=0.05, alpha_ri=0.0, alpha_rs=0.0, p_rd=0.1, num_aug=16)
        for augmented in new_line:
            records.append({'text': augmented, 'judgement': label, 'eda': 1, 'pre_id': pre_id})

    # Build the frame once from the collected records. DataFrame.append
    # copied the whole frame for every row and no longer exists in
    # pandas 2.x. Explicit columns keep the astype below valid for empty
    # input as well.
    eda_df = pd.DataFrame(records, columns=['text', 'judgement', 'eda', 'pre_id'])
    return eda_df.astype({'pre_id': np.int64, 'judgement': np.int64, 'eda': np.int64})

In [9]:
# Augment only the rows labelled judgement == 1.
train_eda = set_eda(train[train.judgement==1])
# Mark the original rows: eda flag 0 and no source row id.
train['eda'] = 0
train['pre_id'] = None

100%|█████████████████████████████████████████| 632/632 [00:13<00:00, 46.22it/s]


In [10]:
# Stack the original rows and the augmented rows (row-wise concatenation).
train_new = pd.concat([train, train_eda], axis=0)

In [11]:
# Inspect the combined frame: original rows first, augmented rows after.
train_new

Unnamed: 0,title,abstract,judgement,text,eda,pre_id
0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,One-year age changes in MRI brain volumes in o...,0,
1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,Supportive CSF biomarker evidence to enhance t...,0,
2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,Occurrence of basal ganglia germ cell tumors w...,0,
3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,New developments in diagnosis and therapy of C...,0,
4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,Prolonged shedding of SARS-CoV-2 in an elderly...,0,
...,...,...,...,...,...,...
6947,,,1,comparison of regional brain volume and glucos...,1,27127
6948,,,1,comparison regional brain volume and glucose m...,1,27127
6949,,,1,comparison of regional brain volume and glucos...,1,27127
6950,,,1,comparison of regional brain volume and glucos...,1,27127


In [12]:
# Renumber the combined rows 0..n-1 and name the index "id". drop=True
# discards the old index directly, so the cell can be re-run safely; the
# reset_index() + drop(columns=['index']) form fails on a second run because
# the old index is then called "id", not "index".
train_new = train_new.reset_index(drop=True)
train_new.index.name = 'id'

In [13]:
# Preview the first rows after re-indexing.
train_new.head()

Unnamed: 0_level_0,title,abstract,judgement,text,eda,pre_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,One-year age changes in MRI brain volumes in o...,0,
1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,Supportive CSF biomarker evidence to enhance t...,0,
2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,Occurrence of basal ganglia germ cell tumors w...,0,
3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,New developments in diagnosis and therapy of C...,0,
4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,Prolonged shedding of SARS-CoV-2 in an elderly...,0,


In [14]:
# Class counts of the original training rows.
train.judgement.value_counts()

0    26513
1      632
Name: judgement, dtype: int64

In [15]:
# Class counts after adding the augmented judgement == 1 rows.
train_new.judgement.value_counts()

0    26513
1     7584
Name: judgement, dtype: int64

In [16]:
# From here on "train" is the augmented frame. get_train_data works in
# place, so train_new itself receives the fold column too.
train = get_train_data(train_new)
test = get_test_data(test)

In [17]:
#保存先のディレクトリ作成
save_dir = "./dataset/data4"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
train.to_csv(os.path.join(save_dir, 'train.csv'))
test.to_csv(os.path.join(save_dir, 'test.csv'))
sub.to_csv(os.path.join(save_dir, 'sample_submit.csv'), index=None,header=None)

In [20]:
# Inspect the augmented training frame, which now carries the fold column.
train

Unnamed: 0_level_0,title,abstract,judgement,text,eda,pre_id,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,One-year age changes in MRI brain volumes in o...,0,,0
1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,Supportive CSF biomarker evidence to enhance t...,0,,1
2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,Occurrence of basal ganglia germ cell tumors w...,0,,4
3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,New developments in diagnosis and therapy of C...,0,,3
4,Prolonged shedding of SARS-CoV-2 in an elderly...,,0,Prolonged shedding of SARS-CoV-2 in an elderly...,0,,1
...,...,...,...,...,...,...,...
34092,,,1,comparison of regional brain volume and glucos...,1,27127,4
34093,,,1,comparison regional brain volume and glucose m...,1,27127,3
34094,,,1,comparison of regional brain volume and glucos...,1,27127,4
34095,,,1,comparison of regional brain volume and glucos...,1,27127,1


In [21]:
# Inspect the test frame.
test

Unnamed: 0_level_0,title,abstract,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27145,Estimating the potential effects of COVID-19 p...,The objective of the paper is to analyse chang...,Estimating the potential effects of COVID-19 p...
27146,Leukoerythroblastic reaction in a patient with...,,Leukoerythroblastic reaction in a patient with...
27147,[15O]-water PET and intraoperative brain mappi...,[15O]-water PET was performed on 12 patients w...,[15O]-water PET and intraoperative brain mappi...
27148,Adaptive image segmentation for robust measure...,We present a method that significantly improve...,Adaptive image segmentation for robust measure...
27149,Comparison of Epidemiological Variations in CO...,The objective of this study is to compare the ...,Comparison of Epidemiological Variations in CO...
...,...,...,...
67974,"Knowledge, Attitude, and Practices of Healthca...",In the current outbreak of novel coronavirus (...,"Knowledge, Attitude, and Practices of Healthca..."
67975,Safety and Efficacy of Anti-Il6-Receptor Tocil...,BACKGROUND: As the novel SARS-CoV-2 pandemic o...,Safety and Efficacy of Anti-Il6-Receptor Tocil...
67976,Functional imaging of head and neck tumors usi...,Positron emission tomography (PET) is an imagi...,Functional imaging of head and neck tumors usi...
67977,Effectiveness of 3D virtual imaging,,Effectiveness of 3D virtual imaging
