In [33]:
import pandas as pd
import numpy as np

### Unlabelled dataset

In [78]:
# Load the raw sentence-level export and reduce it to the four-column layout
# (id, sentence, annotation, topic) used for the unlabelled pool.
raw_reviews_df = pd.read_csv('../data/REVIEWS - raw/6_arg_extraction_t_prediction_t_confidence-2.csv')

# Sentence key: <conference><paper_id>_<review_id>_<sentence_id>.
sentence_ids = (
    raw_reviews_df['conference'] + raw_reviews_df['paper_id'].astype(str)
    + '_' + raw_reviews_df['review_id'].astype(str)
    + '_' + raw_reviews_df['sentence_id'].astype(str)
)

# Build the final frame in one step instead of selecting a column subset and
# then assigning into it, which is the pattern that triggers pandas'
# SettingWithCopyWarning. Scalars are broadcast over the raw frame's index.
full_df = pd.DataFrame({
    'id': sentence_ids,
    'sentence': raw_reviews_df['sentence'],
    'annotation': 'UNL',   # placeholder label for every unlabelled sentence
    'topic': '',           # topic column is left empty
})

In [79]:
# Display the assembled unlabelled frame as this cell's output.
full_df

Unnamed: 0,id,sentence,annotation,topic
0,graph201_1_0,"""I have reviewed this paper earlier as a SIGGR...",UNL,
1,graph201_1_1,The paper presents a novel method based on dee...,UNL,
2,graph201_1_2,"As a representation for the 3D shape, the auth...",UNL,
3,graph201_1_3,The model has been trained on a tiny dataset (...,UNL,
4,graph201_1_4,The architecture of the network is based on an...,UNL,
...,...,...,...,...
257811,iclr202213_3_14,Your grammar is backwards.,UNL,
257812,iclr202213_3_15,"The question you are trying to express is ""bia...",UNL,
257813,iclr202213_3_16,"So heading should be ""to what inside the envir...",UNL,
257814,iclr202213_3_17,"--> idk that this is that surprising, it was k...",UNL,


In [80]:
# Export the unlabelled pool as a tab-separated file with columns in the order
# annotation, sentence, topic; the `id` column and the index are not written.
unlabelled_export_df = full_df[['annotation', 'sentence', 'topic']]
unlabelled_export_df.to_csv(
    '../data/REVIEWS - clean/in-topic/unlabelled.tsv',
    sep='\t',
    index=False,
)


### Labelled dataset

In [81]:
# Load the hand-labelled sentences: a header-less, tab-separated file with
# three fields per row (id, sentence, annotation).
labelled_df = pd.read_csv('../data/REVIEWS - raw/sentences_just_one_position.csv', header=None,
                          names=['id', 'sentence', 'annotation'], sep='\t')

# Raw label -> output label. The keys keep the leading space with which the
# labels are read from the raw file.
label_dict = {
    ' NEG': 'Argument_against',
    ' POS': 'Argument_for',
    ' NA': 'NoArgument'
}
labelled_df['topic'] = ''

# Match on the whitespace-stripped label so the mapping does not depend on the
# exact padding around each label in the raw file.
stripped_lookup = {raw_label.strip(): target for raw_label, target in label_dict.items()}
raw_annotation = labelled_df['annotation']
mapped_annotation = raw_annotation.astype(str).str.strip().map(stripped_lookup)

# Every target label is a non-null string, so NaN after mapping means the raw
# value had no entry in label_dict (this also catches missing annotations).
unmapped_labels = raw_annotation[mapped_annotation.isna()].unique().tolist()
if unmapped_labels:
    # Same exception type as a direct dict lookup, but all offending labels
    # are reported at once.
    raise KeyError(f'Unexpected annotation label(s): {unmapped_labels!r}')

labelled_df['annotation'] = mapped_annotation

In [82]:
# Non-overlap check: no labelled sentence id may also occur in the unlabelled pool.
# The assertion makes a Restart & Run All stop here if the sets overlap,
# instead of relying on someone noticing a non-empty table.
# NOTE(review): an empty join only proves disjointness if both frames spell
# `id` the same way -- confirm the labelled file's ids follow the
# <conference><paper_id>_<review_id>_<sentence_id> pattern used for full_df.
overlap_df = pd.merge(full_df, labelled_df, on='id')
assert overlap_df.empty, f'{len(overlap_df)} id(s) appear in both the unlabelled and labelled sets'
overlap_df

Unnamed: 0,id,sentence_x,annotation_x,topic_x,sentence_y,annotation_y,topic_y


In [83]:
# Class balance of the labelled set after mapping to the output label names.
labelled_df['annotation'].value_counts()

Argument_against    640
NoArgument          558
Argument_for        203
Name: annotation, dtype: int64

In [84]:
# Shuffle once with a fixed seed, then cut positionally into 60% / 20% / 20%.
# Plain .iloc slices replace np.split(...): NumPy's split goes through
# DataFrame.swapaxes, which pandas has deprecated. The seed and cut points are
# unchanged, so the three parts contain the same rows as before.
# NOTE(review): the split is purely random, not stratified by annotation --
# confirm that is acceptable given the class imbalance.
shuffled_df = labelled_df.sample(frac=1, random_state=42)
train_end = int(.6 * len(labelled_df))
val_end = int(.8 * len(labelled_df))

train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

In [85]:
# Row counts of the train / validation / test parts, in that order.
tuple(len(split_part) for split_part in (train_df, val_df, test_df))

(840, 280, 281)

In [86]:
# Share of the labelled set that ended up in each part (train, val, test).
labelled_total = len(labelled_df)
tuple(len(split_part) / labelled_total for split_part in (train_df, val_df, test_df))

(0.5995717344753747, 0.1998572448251249, 0.20057102069950036)

In [87]:
# Write each split as a tab-separated file with the same column order as the
# unlabelled export (annotation, sentence, topic); the index is not written.
split_targets = {
    '../data/REVIEWS - clean/in-topic/train.tsv': train_df,
    '../data/REVIEWS - clean/in-topic/val.tsv': val_df,
    '../data/REVIEWS - clean/in-topic/test.tsv': test_df,
}
for target_path, split_part in split_targets.items():
    split_part[['annotation', 'sentence', 'topic']].to_csv(target_path, sep='\t', index=False)
