In [7]:
import pandas as pd

# The CORE dataset that we obtained was split into train, dev and test split. First, we merged the splits.

train_df = pd.read_table("G:/My Drive/Institut Jozef Stefan/Projekti na Institutu/Določanje žanrov/Mapping to CORE/Corpora/Original CORE files/train.tsv")
train_df.columns = ["label", "text"]


train_df["split"] = "train"
train_df.head()

Unnamed: 0,label,text,split
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train
1,NA NE,"Ferry consultation needs deeper questions, say...",train
2,ID DF,I'v been recording and mixing music for about ...,train
3,NA SR,The 25-year-old did have chances at Anfield. B...,train
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train


In [8]:
test_df = pd.read_csv("G:/My Drive/Institut Jozef Stefan/Projekti na Institutu/Določanje žanrov/Mapping to CORE/Corpora/Original CORE files/test.tsv", sep="\t")
test_df.columns = ["label", "text"]

test_df["split"] = "test"
test_df.head()

Unnamed: 0,label,text,split
0,IN OI,DEFINITION About a quarter of the world drives...,test
1,OP OB,How should retail investors own gold? Mostly p...,test
2,NA NE,Famed railway leader Hunter Harrison said he i...,test
3,NA NE,Louis Walsh also claimed that Gary Barlow and ...,test
4,OP OB,"Essays on economic crisis, decoding dominant i...",test


In [9]:
dev_df = pd.read_csv("G:/My Drive/Institut Jozef Stefan/Projekti na Institutu/Določanje žanrov/Mapping to CORE/Corpora/Original CORE files/dev.tsv", sep="\t")
dev_df.columns = ["label", "text"]

dev_df["split"] = "dev"
dev_df.head()

Unnamed: 0,label,text,split
0,OP OB,Being a Successful Band Has Nothing to Do With...,dev
1,NA IN NE DP,'I always like to think of myself as a cartoon...,dev
2,NA SR,QPR vs Southampton - Match preview and team ne...,dev
3,NA NE,Search age: Search in: US says it's ready to t...,dev
4,IN OP DT,Welcome to India Here at The Great Recovery HQ...,dev


In [10]:
dataset_df = train_df.append(test_df,ignore_index = True)

dataset_df.head()

Unnamed: 0,label,text,split
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train
1,NA NE,"Ferry consultation needs deeper questions, say...",train
2,ID DF,I'v been recording and mixing music for about ...,train
3,NA SR,The 25-year-old did have chances at Anfield. B...,train
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train


In [11]:
dataset_df = dataset_df.append(dev_df, ignore_index = True)

dataset_df.tail()

Unnamed: 0,label,text,split
48444,NA NE,Lollapalooza heading to the Middle East in 201...,dev
48445,OP AV,How to: Improve Brand Engagement on Twitter Ea...,dev
48446,OP OB,The Last Post. A shameful reflection on Canada...,dev
48447,NA NE,Related Stories Students at a Sherwood Park hi...,dev
48448,IN OP,2009 September 14th The folks over at VGBlogge...,dev


In [12]:
dataset_df.describe()

Unnamed: 0,label,text,split
count,48449,48432,48449
unique,460,48420,3
top,NA NE,Magdalen Top of the Norrington Table Magdalen ...,train
freq,9872,2,33914


In [14]:
# Check whether there are any empty cells
dataset_df[dataset_df.isnull().any(axis=1)]

Unnamed: 0,label,text,split
93,IN DT,,train
300,IN DT,,train
6436,IN DT,,train
11816,ID DF,,train
13802,NA NE,,train
14133,IN,,train
20588,SP OS,,train
22072,ID QA,,train
23191,LY PO,,train
32198,ID DF,,train


In [15]:
# Delete rows with empty cells
dataset_df.dropna(subset=["text"], inplace=True)
dataset_df.describe()

Unnamed: 0,label,text,split
count,48432,48432,48432
unique,459,48420,3
top,NA NE,Magdalen Top of the Norrington Table Magdalen ...,train
freq,9871,2,33904


In [18]:
dataset_df[dataset_df.text.duplicated(keep=False)].sort_values("text") 

Unnamed: 0,label,text,split


In [19]:
dataset_df.drop_duplicates("text",inplace=True)
dataset_df.describe() 

Unnamed: 0,label,text,split
count,48420,48420,48420
unique,459,48420,3
top,NA NE,The Top TEN 'Whiniest Sets of Fans' in English...,train
freq,9865,1,33898


In [20]:
main_labels = ["IN", "OP", "NA", "IP", "ID", "HI", "LY", "SP"]

In [21]:
list_label = list(dataset_df.label)
len(list_label)

48420

In [22]:
list_label[:10]

['NA OP SR OB',
 'NA NE',
 'ID DF',
 'NA SR',
 'NA IN HA',
 'NA NE',
 'ID DF',
 'NA PB',
 'OP RV',
 'NA NE']

In [23]:
main_labels_list = []
sublabels_list = []

for instance in list_label:
    label = instance.split()
    current_list = [word for word in label if word in main_labels]
    current_string = " ".join(current_list)
    main_labels_list.append(current_string)

    current_sublist = [word for word in label if word not in main_labels]
    current_sub_string = " ".join(current_sublist)
    sublabels_list.append(current_sub_string)

print(len(sublabels_list))
print(len(main_labels_list))


48420
48420


In [24]:
dataset_df["main_labels"] = main_labels_list
dataset_df["sublabels"] = sublabels_list

dataset_df.head()

Unnamed: 0,label,text,split,main_labels,sublabels
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB
1,NA NE,"Ferry consultation needs deeper questions, say...",train,,NE
2,ID DF,I'v been recording and mixing music for about ...,train,ID,DF
3,NA SR,The 25-year-old did have chances at Anfield. B...,train,,SR
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train,NA IN,HA


In [25]:
dataset_df.describe()

Unnamed: 0,label,text,split,main_labels,sublabels
count,48420,48420,48420,48420.0,48420
unique,459,48420,3,35.0,297
top,NA NE,The Top TEN 'Whiniest Sets of Fans' in English...,train,,NE
freq,9865,1,33898,17659.0,10503


In [26]:
#Calculate number of labels for each instance
dataset_df['Len'] = dataset_df['label'].str.count(' ') + 1
dataset_df.head()

Unnamed: 0,label,text,split,main_labels,sublabels,Len
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB,4
1,NA NE,"Ferry consultation needs deeper questions, say...",train,,NE,2
2,ID DF,I'v been recording and mixing music for about ...,train,ID,DF,2
3,NA SR,The 25-year-old did have chances at Anfield. B...,train,,SR,2
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train,NA IN,HA,3


In [34]:
#Calculate number of main labels for each instance
dataset_df['main_len'] = dataset_df['main_labels'].str.count(' ') + 1
dataset_df.head()

Unnamed: 0,label,text,split,main_labels,sublabels,Len,multiple_labels,GINCORE,main_len
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB,4,y,NA OP SR OB,2
1,NA NE,"Ferry consultation needs deeper questions, say...",train,,NE,2,n,News,1
2,ID DF,I'v been recording and mixing music for about ...,train,ID,DF,2,n,Forum,1
3,NA SR,The 25-year-old did have chances at Anfield. B...,train,,SR,2,n,News,1
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train,NA IN,HA,3,y,NA IN HA,2


In [27]:
# Add information regarding multiple labels present
dataset_df["multiple_labels"] = "0"

In [28]:
import numpy as np

# create a list of our conditions
conditions = [
    (dataset_df['Len'] <= 2),
    (dataset_df['Len'] > 2)
    ]

# create a list of the values we want to assign for each condition
values = ['n', 'y']

# create a new column and use np.select to assign values to it using our lists as arguments
dataset_df['multiple_labels'] = np.select(conditions, values)

# display updated DataFrame
dataset_df.head()

Unnamed: 0,label,text,split,main_labels,sublabels,Len,multiple_labels
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB,4,y
1,NA NE,"Ferry consultation needs deeper questions, say...",train,,NE,2,n
2,ID DF,I'v been recording and mixing music for about ...,train,ID,DF,2,n
3,NA SR,The 25-year-old did have chances at Anfield. B...,train,,SR,2,n
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train,NA IN,HA,3,y


In [29]:
dataset_df.multiple_labels.value_counts()

n    42275
y     6145
Name: multiple_labels, dtype: int64

In [30]:
GINCORE_mappings = {'NA NE':'News', 'ID DF': 'Forum', 'NA SR': 'News', 'NA PB': 'Opinion/Argumentation', 'OP RV':'Review', 'OP RS':'Opinion/Argumentation', 'IN RA':'Research Article','OP OB':'Opinion/Argumentation', 'IN DT':'discard', 'IN': 'Information/Explanation', 'IN OI':'Information/Explanation', 'NA HA':'Information/Explanation', 'OP':'Opinion/Argumentation', 'LY SL':'Lyrical', 'IP DS':'Promotion',
'OTHER':'discard', 'HI HT':'Instruction', 'OP AV':'Opinion/Argumentation', 'NA MA':'discard', 'HI FH':'FAQ', 'IN EN':'Information/Explanation', 'NA SS':'Prose',
'SP IT':'Interview', 'NA ': 'discard', 'IN DP':'Information/Explanation', 'NA TB':'Opinion/Argumentation', 'IN IB':'discard', 'OP LE':'discard', 'ID QA':'Forum','IN FI':'FAQ', 'SP FS':'Opinion/Argumentation', 'OP ID':'discard', 'ID RR':'Forum', 'IP PA':'Opinion/Argumentation', 'IN CM':'Information/Explanation', 'IN IP':'discard','NA DP':'discard', 'IN LT':'Legal/Regulation', 'NA IN':'discard', 'IN OP':'discard', 'IP':'discard', 'HI RE':'Recipe', 'HI':'discard', 'ID':'Forum',
'NA OP':'discard', 'LY PO':'Lyrical', 'IP ED':'Opinion/Argumentation', 'SP TA':'discard', 'NA IP':'discard', 'IN ID':'discard', 'IN HI':'discard',
'HI TS':'Instruction', 'IN TR':'discard', 'SP':'discard', 'IP OE':'discard', 'SP OS':'discard', 'LY PR':'Lyrical', 'OP HI':'discard',
'SP TV':'Script/Drama', 'OP AD':'Promotion', 'OP OO':'discard', 'OP DP':'discard', 'ID HI':'discard', 'OP IP':'discard', 'LY OL':'Lyrical',
'LY': 'Lyrical', 'NA ON':'discard', 'OP LY':'discard', 'HI OH':'discard', 'ID OF':'Forum', 'IN SP':'discard', 'HI LY':'discard',
'NA HI':'discard', 'ID IP':'discard', 'NA ID':'discard', 'NA SP':'discard', 'IN LY':'discard'}

In [35]:
# Map the GINCORE labels to the original labels
dataset_df["GINCORE"] = dataset_df["label"]
dataset_df["GINCORE"].replace(GINCORE_mappings, inplace=True)

dataset_df.head()

Unnamed: 0,label,text,split,main_labels,sublabels,Len,multiple_labels,GINCORE,main_len
0,NA OP SR OB,The Top TEN 'Whiniest Sets of Fans' in English...,train,NA OP,SR OB,4,y,NA OP SR OB,2
1,NA NE,"Ferry consultation needs deeper questions, say...",train,,NE,2,n,News,1
2,ID DF,I'v been recording and mixing music for about ...,train,ID,DF,2,n,Forum,1
3,NA SR,The 25-year-old did have chances at Anfield. B...,train,,SR,2,n,News,1
4,NA IN HA,Elizabethan Science and Technology Elizabethan...,train,NA IN,HA,3,y,NA IN HA,2


In [49]:
single_labels = dataset_df[dataset_df["multiple_labels"] == "n"]
single_main_labels = single_labels[single_labels["main_len"]==1]
single_main_labels[single_main_labels["GINCORE"] == "discard"].label.value_counts()

IN DT    3286
IN IB    1426
NA        864
OTHER     582
NA DP     186
IP        152
HI         74
NA MA      64
SP TA      60
IN TR      41
OP LE      39
SP OS      17
SP         16
OP OO      15
IP OE       9
NA ON       7
OP DP       4
HI OH       2
Name: label, dtype: int64

In [50]:
# Use only instances that have single labels, mapped to the GINCORE schema
dataset_single_labels_df = dataset_df[dataset_df["multiple_labels"]=="n"]
GINCORE_dataset = dataset_single_labels_df[dataset_single_labels_df["GINCORE"] != "discard"]

final_dataset = GINCORE_dataset[["label", "text", "split", "GINCORE"]]
final_dataset.head()

Unnamed: 0,label,text,split,GINCORE
1,NA NE,"Ferry consultation needs deeper questions, say...",train,News
2,ID DF,I'v been recording and mixing music for about ...,train,Forum
3,NA SR,The 25-year-old did have chances at Anfield. B...,train,News
5,NA NE,The value of NYC housing construction starts m...,train,News
6,ID DF,"If this is your first visit, be sure to check ...",train,Forum


In [52]:
final_dataset.GINCORE.value_counts()

News                       12658
Opinion/Argumentation       8980
Information/Explanation     3406
Forum                       3108
Review                      1687
Instruction                 1200
Promotion                   1026
Research Article             804
Lyrical                      636
Interview                    420
Prose                        276
FAQ                          272
Legal/Regulation             186
Recipe                       167
Script/Drama                  22
Name: GINCORE, dtype: int64

In [51]:
final_dataset.describe()

Unnamed: 0,label,text,split,GINCORE
count,34848,34848,34848,34848
unique,40,34848,3,15
top,NA NE,"Ferry consultation needs deeper questions, say...",train,News
freq,9865,1,24401,12658


In [53]:
file = final_dataset.to_json("cleaned_CORE_corpora.json", orient="index",indent=2)


In [54]:
csv_file = final_dataset.to_csv("cleaned_CORE_corpora.csv",index=False)