In [27]:
import pandas as pd
import numpy as np
import json

In [28]:
# Import the dataset - I'll use only the paragraphs that were manually annotated as "keep"
dataset = pd.read_csv("data/Sl-and-MT-GINCO-mapped-to-GINCORE-keeptext.csv")
dataset.head()

Unnamed: 0,id,url,crawled,hard,primary_level_1,primary_level_2,primary_level_3,secondary_level_1,secondary_level_2,secondary_level_3,tertiary_level_1,tertiary_level_2,tertiary_level_3,split,domain,GINCORE,full_text,MT-text,text_length
0,3949,http://www.pomurje.si/aktualno/sport/zimska-li...,2014,False,News/Reporting,News/Reporting,News/Reporting,,,,,,,test,www.pomurje.si,News,"Šport <p/> Zimska liga malega nogometa sobota,...",Sport <p/> Winter Little League Football Satur...,93
1,3726,http://www.ss-sezana.si/sss/index.php?option=c...,2014,False,Information/Explanation,Information/Explanation,Information/Explanation,,,,,,,train,www.ss-sezana.si,Information/Explanation,JEDILNIK <p/> Iskalnik <p/> Poglavitni cilj pr...,JEDILNIK <p/> Search <p/> The main objective o...,76
2,5621,http://www.kamnik-starejsi.si/novice/144-sodel...,2014,False,Promotion of Services,Promotion of Services,Promotion,Opinion/Argumentation,Opinion/Argumentation,Opinion/Argumentation,Information/Explanation,Information/Explanation,Information/Explanation,train,www.kamnik-starejsi.si,Promotion,Projekt INNOVAge in zavod Oreli <p/> Zavod Ore...,Project INNOVAge and the Oreli Institute <p/> ...,232
3,3776,http://www.radiocelje.si/novica.php?id=13007&a...,2014,False,News/Reporting,News/Reporting,News/Reporting,,,,,,,train,www.radiocelje.si,News,"V novembru, mesecu preprečevanja odvisnosti, b...","In November, the month of addiction prevention...",158
4,2102,http://www.mtv.si/novice/selena-gomez-ponudila...,2014,False,Opinionated News,Opinionated News,Opinionated News,,,,,,,test,www.mtv.si,News,Selena Gomez ponudila v poslušanje novi album ...,Selena Gomez launches new album <p/> 16.07.201...,63


In [29]:
# Let's use only the instances which do not have two classes - do not have a secondary label (are not fuzzy)
dataset_reduced = dataset[dataset["secondary_level_1"].isnull()]
dataset_reduced.describe()

Unnamed: 0,crawled,text_length
count,812.0,812.0
mean,2017.362069,352.876847
std,3.499437,490.094958
min,2014.0,12.0
25%,2014.0,94.0
50%,2014.0,201.0
75%,2021.0,404.75
max,2021.0,4364.0


In [30]:
# Keep in the dataset only MT text and the label. We will use the 3 level of primary labels to lower the number of labels, as the labels will be then further merged into two classes (subjective, objective)
df = dataset_reduced[["primary_level_3", "MT-text"]]
df.head()

Unnamed: 0,primary_level_3,MT-text
0,News/Reporting,Sport <p/> Winter Little League Football Satur...
1,Information/Explanation,JEDILNIK <p/> Search <p/> The main objective o...
3,News/Reporting,"In November, the month of addiction prevention..."
4,Opinionated News,Selena Gomez launches new album <p/> 16.07.201...
6,Promotion,The introduction of modern technology and keep...


In [31]:
df.describe()

Unnamed: 0,primary_level_3,MT-text
count,812,812
unique,12,812
top,Promotion,Sport <p/> Winter Little League Football Satur...
freq,153,1


In [32]:
#Let's merge labels in 2 classes: objective and subjective
df.primary_level_3.value_counts()

Promotion                     153
List of Summaries/Excerpts    106
Information/Explanation        96
Opinion/Argumentation          96
News/Reporting                 93
Opinionated News               79
Other                          67
Forum                          50
Instruction                    35
Legal/Regulation               16
Announcement                   14
Interview                       7
Name: primary_level_3, dtype: int64

In [33]:
label_mapping = {"Information/Explanation": 'objective', "News/Reporting":'objective', "Instruction":'objective', "Legal/Regulation":'objective', "Announcement":'objective', "Promotion":'subjective', "Opinion/Argumentation":'subjective', "Opinionated News":'subjective', "Forum":'subjective'}

df["label"] = df["primary_level_3"]
df["label"].replace(label_mapping, inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["primary_level_3"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,primary_level_3,MT-text,label
0,News/Reporting,Sport <p/> Winter Little League Football Satur...,objective
1,Information/Explanation,JEDILNIK <p/> Search <p/> The main objective o...,objective
3,News/Reporting,"In November, the month of addiction prevention...",objective
4,Opinionated News,Selena Gomez launches new album <p/> 16.07.201...,subjective
6,Promotion,The introduction of modern technology and keep...,subjective


In [34]:
df.label.value_counts()

subjective                    378
objective                     254
List of Summaries/Excerpts    106
Other                          67
Interview                       7
Name: label, dtype: int64

In [35]:
# Keep only instances that have labels that were used in the mapping (e.g., discard Interviews, List of Summaries ...)
final_df = df[df["label"].isin(["objective", "subjective"])]
final_df.describe()

Unnamed: 0,primary_level_3,MT-text,label
count,632,632,632
unique,9,632,2
top,Promotion,Sport <p/> Winter Little League Football Satur...,subjective
freq,153,1,378


In [40]:
print(final_df.label.value_counts())
print(final_df.label.value_counts(normalize=True))

subjective    378
objective     254
Name: label, dtype: int64
subjective    0.598101
objective     0.401899
Name: label, dtype: float64


In [36]:
final_df.tail()

Unnamed: 0,primary_level_3,MT-text,label
990,News/Reporting,Slovenian Ethnographic Museum <p/> Cooperation...,objective
992,Instruction,What you need: a plastic bag and a freezer. <p...,objective
995,Announcement,EuroBasket and changes in the traffic regime <...,objective
997,Information/Explanation,Project News <p/> Promotional project newspape...,objective
1000,Opinion/Argumentation,The debate often brings to the surface first t...,subjective


In [41]:
objectivity_dataset = final_df[["MT-text", "label"]]
objectivity_dataset.columns = ["text", "label"]
objectivity_dataset.head()

Unnamed: 0,text,label
0,Sport <p/> Winter Little League Football Satur...,objective
1,JEDILNIK <p/> Search <p/> The main objective o...,objective
3,"In November, the month of addiction prevention...",objective
4,Selena Gomez launches new album <p/> 16.07.201...,subjective
6,The introduction of modern technology and keep...,subjective


In [43]:
# Save the dataset as csv
objectivity_dataset.to_csv("data/MT-GINCO-objectivity-dataset.csv", index = False)

We now have a dataset with 632 instances with the following distribution of labels: 60 % subjective, 40 % objective.

# Split the dataset

In [None]:
labels = list(dataset.label.unique())
labels

In [None]:
# Split the dataset into train and test split: X are texts, Y are labels

X_train,X_test,Y_train,Y_test = train_test_split(dataset["text"],dataset["label"], test_size=0.2, shuffle = True, stratify= dataset.label)


In [None]:
# Save the file with splits
split_dataset = pd.DataFrame({"text": X_train, "label":Y_train, "split":"train"})
split_dataset.head()

In [None]:
split_dataset_part2 = pd.DataFrame({"text": X_test, "label":Y_test, "split":"test"})
split_dataset_part2.head()

In [None]:
final_split_dataset = pd.concat([split_dataset,split_dataset_part2], ignore_index=True)
final_split_dataset.describe()

In [None]:
# Save the dataset as csv
final_split_dataset.to_csv("data/MT-GINCO-split-objectivity-dataset.csv", index = False)