## Data Exploration

In [5]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# OS operations, regex, and NLP tools
import os
import re
import nltk
from nltk.corpus import stopwords

# Put the parent directory at the front of the module search path so the
# `utils` package imported just below can be found (presumably it lives one
# level above this notebook — verify against the repository layout).
# NOTE: '../' is a relative entry, so it is resolved against the kernel's
# current working directory, not against the notebook file's location.
import sys
sys.path.insert(0, '../')

# Import data processing functions
from utils.data import (
    save_processed_data, 
    load_data_to_dataset, 
    remove_duplicates, 
    remove_outliers, 
    display_data, 
    convert_to_classification_dataset
)

from datasets import load_from_disk

In [2]:
# Location of the raw corpus files (relative to the kernel's working directory)
# and the filename prefix that every corpus file shares.
data_folder = "../../data/"
file_prefix = "biased."

In [3]:
# Build the path of each corpus file: folder + shared prefix + split suffix.
data_full = f"{data_folder}{file_prefix}full"
data_train = f"{data_folder}{file_prefix}word.train"
data_test = f"{data_folder}{file_prefix}word.test"
data_dev = f"{data_folder}{file_prefix}word.dev"

### Data Dictionary

1. **id:** A unique identifier used to link to a Wikipedia Diff view
    
    _Example:_ 532355971 (Links to https://en.wikipedia.org/w/index.php?diff=532355971)

2. **src_tok:** Tokenized source text

    _Example:_ she did not do as promised exposing her as an un ##pr ##in ##ci ##pled politician .

3. **tgt_tok:** Tokenized target text

    _Example:_ she did not do , leading to accusations of her being an un ##pr ##in ##ci ##pled politician .

4. **src_raw:** Raw source text

    _Example:_ she did not do as promised exposing her as an unprincipled politician.

5. **tgt_raw:** Raw target text

    _Example:_ she did not do , leading to accusations of her being an unprincipled politician.

6. **src_POS_tags:** Part-of-speech tags for the source text

    _Example:_ PRON VERB ADV VERB ADP VERB VERB PRON ADP DET ADJ ADJ ADJ ADJ ADJ NOUN PUNCT

7. **tgt_parse_tags:** Syntactic parse tags for the target text using the Stanford Parser

    _Example:_ nsubj aux neg ROOT mark advcl xcomp dobj prep det amod amod amod amod amod pobj punct

In [4]:
# Read the four corpus files into DataFrames.
# The files carry no header row, so the column names are supplied explicitly;
# the layout (tab-separated, seven columns) is the same for every file.
column_names = ["id", "src_tok", "tgt_tok", "src_raw", "tgt_raw", "src_POS_tags", "tgt_parse_tags"]
tsv_options = {"sep": "\t", "names": column_names}

# Only the full corpus is read with on_bad_lines='skip': rows pandas cannot
# parse are dropped silently, so its row count may be lower than the file's
# line count.
full = pd.read_csv(data_full, on_bad_lines='skip', **tsv_options)
train = pd.read_csv(data_train, **tsv_options)
test = pd.read_csv(data_test, **tsv_options)
dev = pd.read_csv(data_dev, **tsv_options)

In [5]:
# Preview the first rows of the full corpus to sanity-check the column mapping
full.head()

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,258378316,"during the campaign , controversy erupted over...","during the campaign , some pointed out alleged...","during the campaign, controversy erupted over ...","during the campaign, some pointed out alleged ...",ADP DET NOUN PUNCT NOUN VERB ADP VERB NOUN ADP...,prep det pobj punct nsubj ROOT prep amod pobj ...
1,486527143,nic ##aea was con ##vo ##ked by the emperor co...,nic ##aea was con ##vo ##ked by the emperor co...,nicaea was convoked by the emperor constantine...,nicaea was convoked by the emperor constantine...,NOUN NOUN VERB VERB VERB VERB ADP DET NOUN NOU...,nsubjpass nsubjpass auxpass ROOT ROOT ROOT age...
2,54024499,it was rather unfortunate that he ve ##hem ##e...,he ve ##hem ##ently opposed the bud ##ding ind...,it was rather unfortunate that he vehemently o...,he vehemently opposed the budding indian scien...,PRON VERB ADV ADJ ADP PRON ADV ADV ADV VERB DE...,nsubj ROOT advmod acomp mark nsubj advmod advm...
3,160186886,dennis the menace is an american animated seri...,dennis the menace is an american animated seri...,dennis the menace is an american animated seri...,dennis the menace is an american animated seri...,VERB DET NOUN VERB DET ADJ VERB NOUN VERB ADP ...,csubj det dobj ROOT det amod amod attr acl age...
4,8797183,"today , on large farms , motorcycles , dogs or...","today , on large farms , motorcycles , dogs or...","today, on large farms, motorcycles, dogs or me...","today, on large farms, motorcycles, dogs or pe...",NOUN PUNCT ADP ADJ NOUN PUNCT NOUN PUNCT NOUN ...,npadvmod punct prep amod pobj punct conj punct...


In [6]:
# Preview the first rows of the training split
train.head()

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,165188319,"ch ##lor ##of ##or ##m "" the molecular life ##...","ch ##lor ##of ##or ##m "" the molecular life ##...","chloroform ""the molecular lifesaver"" an articl...","chloroform ""the molecular lifesaver"" an articl...",NOUN NOUN NOUN NOUN NOUN PUNCT DET ADJ NOUN NO...,ROOT ROOT ROOT ROOT ROOT punct det amod dobj d...
1,123204846,the free software gnu class ##path project is ...,the free software gnu class ##path project is ...,the free software gnu classpath project is onl...,the free software gnu classpath project is par...,DET ADJ NOUN NOUN NOUN NOUN NOUN VERB ADV ADV ...,det amod nmod compound compound compound nsubj...
2,706783956,"other campaign ##ers , especially the controve...","other campaign ##ers , especially the british ...","other campaigners, especially the controversia...","other campaigners, especially the british acti...",ADJ NOUN NOUN PUNCT ADV DET ADJ ADJ NOUN ADJ N...,amod nsubj nsubj punct advmod det amod amod am...
3,612378448,vocalist rob half ##ord ' s performance is con...,vocalist rob half ##ord ' s performance is con...,vocalist rob halford's performance is consider...,vocalist rob halford's performance is consider...,ADJ X NOUN NOUN PUNCT PART NOUN VERB VERB NUM ...,amod amod poss poss punct case nsubjpass auxpa...
4,876796337,the proud general is a chinese animated featur...,the proud general is a chinese animated featur...,the proud general is a chinese animated featur...,the proud general is a chinese animated featur...,DET ADJ NOUN VERB DET ADJ VERB NOUN NOUN VERB ...,det amod nsubj ROOT det amod amod attr attr ac...


In [7]:
# Preview the first rows of the test split
test.head()

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,318427508,"in april 2009 a brazilian human rights group ,...","in april 2009 a brazilian human rights group ,...","in april 2009 a brazilian human rights group, ...","in april 2009 a brazilian human rights group, ...",ADP NOUN NUM DET ADJ ADJ NOUN NOUN PUNCT NOUN ...,prep pobj nummod det amod amod compound nsubj ...
1,235640083,the 51 day stand ##off and ensuing murder of 7...,the 51 day stand ##off and ensuing deaths of 7...,the 51 day standoff and ensuing murder of 76 m...,the 51 day standoff and ensuing deaths of 76 m...,DET NUM NOUN NOUN NOUN CCONJ VERB NOUN ADP NUM...,det nummod compound nsubj nsubj cc amod conj p...
2,37561168,"mark o ##ate ##n ( born 8 march 1964 , watford...","mark o ##ate ##n ( born 8 march 1964 , watford...","mark oaten (born 8 march 1964, watford) is a d...","mark oaten (born 8 march 1964, watford) is a l...",NOUN ADJ ADJ ADJ PUNCT VERB NUM NOUN NUM PUNCT...,nsubj amod amod amod punct parataxis nummod np...
3,101665256,another infamous period of colon ##isation in ...,another period of colon ##isation in ancient t...,another infamous period of colonisation in anc...,another period of colonisation in ancient time...,DET ADJ NOUN ADP NOUN NOUN ADP ADJ NOUN VERB A...,det amod nsubj prep pobj pobj prep amod pobj R...
4,480248865,photo sequence of astonishing 2005 chicago ##l...,photo sequence of 2005 chicago ##land crash wi...,photo sequence of astonishing 2005 chicagoland...,photo sequence of 2005 chicagoland crash with ...,NOUN NOUN ADP VERB NUM NOUN NOUN NOUN ADP ADJ ...,compound ROOT prep amod nummod compound compou...


In [8]:
# Preview the first rows of the dev (validation) split
dev.head()

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,3257810,in addition to sponsoring palestinian terror a...,in addition to sponsoring palestinian attacks ...,in addition to sponsoring palestinian terror a...,in addition to sponsoring palestinian attacks ...,ADP NOUN ADP VERB ADJ NOUN NOUN ADP PROPN PUNC...,prep pobj prep pcomp amod compound dobj prep p...
1,7455549,the game is currently played in 47 countries w...,the game claims to be currently played in 47 c...,the game is currently played in 47 countries w...,the game claims to be currently played in 47 c...,DET NOUN VERB ADV VERB ADP NUM NOUN ADP ADJ AD...,det nsubjpass auxpass advmod ROOT prep nummod ...
2,524547829,no part of the valley lies in the area current...,no part of the valley lies in the area current...,no part of the valley lies in the area current...,no part of the valley lies in the area current...,DET NOUN ADP DET NOUN VERB ADP DET NOUN ADV VE...,det nsubj prep det pobj ROOT prep det pobj adv...
3,842911055,scholars perceived that it was disco ##rdan ##...,scholars argued that it was disco ##rdan ##t w...,scholars perceived that it was discordant with...,scholars argued that it was discordant with th...,NOUN VERB ADP PRON VERB ADJ ADJ ADJ ADP DET AD...,nsubj ROOT mark nsubj ccomp acomp acomp acomp ...
4,302188700,"since the chinese civil war in 1949 , taiwan h...","since the chinese civil war in 1949 , taiwan h...","since the chinese civil war in 1949, taiwan ha...","since the chinese civil war in 1949, taiwan ha...",ADP DET ADJ ADJ NOUN ADP NUM PUNCT PROPN VERB ...,prep det amod amod pobj prep pobj punct nsubjp...


In [9]:
# Show the (rows, columns) shape of each frame, in the order:
# full, train, test, dev
for frame in (full, train, test, dev):
    display(frame.shape)

(53803, 7)

(1000, 7)

(700, 7)

In [10]:
# Print one untruncated record of the full dataset (index label 78),
# one "column value" pair per line
for column_name, value in full.loc[78].items():
    print(column_name, value)

id 755060188
src_tok unlike the previous films , a good day to die hard was the first die hard film to be a critical failure , receiving overwhelmingly negative reviews for its imp ##laus ##ible action sequences , cl ##iche ##d script , weak plot , moore ' s direction , and lack of characterization , although the special effects were praised .
tgt_tok unlike the previous films , a good day to die hard was the first die hard film to be a critical failure , receiving overwhelmingly negative reviews which cited imp ##laus ##ible action sequences , cl ##iche ##d script , weak plot , moore ' s direction , and lack of characterization , although the special effects were praised .
src_raw unlike the previous films, a good day to die hard was the first die hard film to be a critical failure, receiving overwhelmingly negative reviews for its implausible action sequences, cliched script, weak plot, moore's direction, and lack of characterization, although the special effects were praised.
tgt_ra

In [11]:
# Print one untruncated record of the train dataset (index label 78),
# one "column value" pair per line
for column_name, value in train.loc[78].items():
    print(column_name, value)

id 815664961
src_tok there he lived in a single room flat at the renowned ana ##rka ##li bazaar , lahore .
tgt_tok there he lived in a single room flat at the ana ##rka ##li bazaar , lahore .
src_raw there he lived in a single room flat at the renowned anarkali bazaar, lahore.
tgt_raw there he lived in a single room flat at the anarkali bazaar, lahore.
src_POS_tags ADV PRON VERB ADP DET ADJ NOUN ADJ ADP DET ADJ ADJ ADJ ADJ NOUN PUNCT NOUN PUNCT
tgt_parse_tags advmod nsubj ROOT prep det amod pobj amod prep det amod amod amod amod pobj punct npadvmod punct


## Data Transformation

In [12]:
# Load the data
# load_data_to_dataset (from utils.data) is given only the data folder. Judging
# by the output of the following cells it returns train/dev/test splits that
# carry extra 'source' and 'target' columns — TODO confirm in utils.data which
# files inside data_folder it reads and how those two columns are derived.
dataset_dict = load_data_to_dataset(data_folder)

Map:   0%|          | 0/53803 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
# Display the loaded data: display_data (utils.data) prints, per split, the
# Dataset summary (features, num_rows) followed by a few sample rows.
display_data(dataset_dict)


train dataset:
Dataset({
    features: ['id', 'src_tok', 'tgt_tok', 'src_raw', 'tgt_raw', 'src_POS_tags', 'tgt_parse_tags', 'source', 'target'],
    num_rows: 53803
})

Sample rows:
{'id': 165188319, 'src_tok': 'ch ##lor ##of ##or ##m " the molecular life ##sa ##ver " an article at oxford university providing interesting facts about ch ##lor ##of ##or ##m .', 'tgt_tok': 'ch ##lor ##of ##or ##m " the molecular life ##sa ##ver " an article at oxford university providing facts about ch ##lor ##of ##or ##m .', 'src_raw': 'chloroform "the molecular lifesaver" an article at oxford university providing interesting facts about chloroform.', 'tgt_raw': 'chloroform "the molecular lifesaver" an article at oxford university providing facts about chloroform.', 'src_POS_tags': 'NOUN NOUN NOUN NOUN NOUN PUNCT DET ADJ NOUN NOUN NOUN PUNCT DET NOUN ADP NOUN NOUN VERB ADJ NOUN ADP NOUN NOUN NOUN NOUN NOUN PUNCT', 'tgt_parse_tags': 'ROOT ROOT ROOT ROOT ROOT punct det amod dobj dobj dobj punct det appos 

In [14]:
# Deduplicate every split, replacing it inside dataset_dict, and report how
# many rows were dropped together with the rows that remove_duplicates flagged.
# NOTE(review): in the captured output the flagged train rows share an id but
# have different source text — confirm in utils.data which column(s)
# remove_duplicates treats as the duplicate key.
for split in list(dataset_dict):
    rows_before = len(dataset_dict[split])
    deduplicated, duplicates = remove_duplicates(dataset_dict[split])
    dataset_dict[split] = deduplicated
    removed_count = rows_before - len(deduplicated)
    print(f"\nRemoved {removed_count} duplicates from {split} dataset.")

    # Nothing flagged for this split: skip the detail table.
    if duplicates.empty:
        continue

    print(f"\nDuplicates removed from {split} dataset:")
    flagged_rows = duplicates[['id', 'source', 'target']]
    display(flagged_rows.sort_values(by='id'))


Removed 6 duplicates from train dataset.

Duplicates removed from train dataset:


Unnamed: 0,id,source,target
15151,170782428,to complete the lineup on their first full-len...,to complete the lineup on their first full-len...
47781,170782428,"full of the brothers' pop culture obsessions, ...","full of the brothers' pop culture obsessions, ..."
9959,232695029,"the armstrongs and fine served jail time, a co...","the armstrongs and fine served jail time, a co..."
16053,232695029,investigators believe that four people were in...,investigators believe that four people were in...
7086,241894774,"in addition, they all famously wear purity rin...","in addition, they all famously wear purity rin..."
44713,241894774,"they also abstain from alcohol, tobacco, and d...","they also report abstaing from alcohol, tobacc..."
20547,243772559,pettitte ended up getting the victory in the g...,pettitte ended up getting the victory in the g...
47039,243772559,"he recorded his 2, 000 career strikeout in the...","he recorded his 2, 000th career strikeout in t..."
15269,649752518,madeleine buckingham has served as chief execu...,madeleine buckingham has served as chief execu...
15560,649752518,mother jones (abbreviated mojo) is a far-left ...,mother jones (abbreviated mojo) is a american ...



Removed 0 duplicates from dev dataset.

Removed 0 duplicates from test dataset.


In [15]:
# Display the data after removing duplicates (same per-split summary and
# sample rows as before, so the row counts can be compared).
display_data(dataset_dict)


train dataset:
Dataset({
    features: ['id', 'src_tok', 'tgt_tok', 'src_raw', 'tgt_raw', 'src_POS_tags', 'tgt_parse_tags', 'source', 'target', '__index_level_0__'],
    num_rows: 53797
})

Sample rows:
{'id': 165188319, 'src_tok': 'ch ##lor ##of ##or ##m " the molecular life ##sa ##ver " an article at oxford university providing interesting facts about ch ##lor ##of ##or ##m .', 'tgt_tok': 'ch ##lor ##of ##or ##m " the molecular life ##sa ##ver " an article at oxford university providing facts about ch ##lor ##of ##or ##m .', 'src_raw': 'chloroform "the molecular lifesaver" an article at oxford university providing interesting facts about chloroform.', 'tgt_raw': 'chloroform "the molecular lifesaver" an article at oxford university providing facts about chloroform.', 'src_POS_tags': 'NOUN NOUN NOUN NOUN NOUN PUNCT DET ADJ NOUN NOUN NOUN PUNCT DET NOUN ADP NOUN NOUN VERB ADJ NOUN ADP NOUN NOUN NOUN NOUN NOUN PUNCT', 'tgt_parse_tags': 'ROOT ROOT ROOT ROOT ROOT punct det amod dobj dobj 

In [16]:
# Drop outlier rows from every split, replacing it inside dataset_dict, and
# report how many rows were dropped together with the rows that were removed.
# What counts as an outlier is decided inside remove_outliers (utils.data).
# NOTE(review): the captured dev table is ordered lexicographically by id
# (107449295 before 15326407), which suggests ids are strings in that split —
# confirm the id dtype is consistent across splits.
for split in list(dataset_dict):
    rows_before = len(dataset_dict[split])
    new_dataset, outliers = remove_outliers(dataset_dict[split])
    dataset_dict[split] = new_dataset
    removed_count = rows_before - len(new_dataset)
    print(f"\nRemoved {removed_count} outliers from {split} dataset.")

    # Nothing removed for this split: skip the detail table.
    if outliers.empty:
        continue

    print(f"\nOutliers removed from {split} dataset:")
    removed_rows = outliers[['id', 'source', 'target']]
    display(removed_rows.sort_values(by='id'))


Removed 1836 outliers from train dataset.

Outliers removed from train dataset:


Unnamed: 0,id,source,target
15038,82890,"at the start of the second intifada, the city ...","at the start of the second intifada, the city ..."
36345,383530,israelis fear that granting all of the current...,israelis fear that granting all of the current...
48747,537479,the brown album - primus,brown album - primus
51141,576892,"thus it means that sounds, letters, and words ...","thus it means that sounds, letters, and words ..."
39819,871527,carlos the jackal ( infamous criminal),carlos the jackal (criminal)
...,...,...,...
48255,867555377,"at school, mendes played ice hockey and soccer...","at school, mendes played ice hockey and soccer..."
27556,870264330,notable false suicides/survivors,false suicides/survivors
10645,872518444,these conflicts included a customs dispute wit...,these conflicts included a customs dispute wit...
38639,872530548,"in 2002, georgian newspaper svobodnaya gruziya...","in 2002, georgian newspaper svobodnaya gruziya..."



Removed 28 outliers from dev dataset.

Outliers removed from dev dataset:


Unnamed: 0,id,source,target
492,107449295,googlezon is the name of a fictional future co...,googlezon is the name of a fictional future co...
157,134280371,the reagan administration provides support to ...,"the reagan administration , as part of the col..."
340,15326407,"at the same time, a growing number of healthca...","at the same time, a growing number of healthca..."
573,176222946,sahaja yoga aims at providing a trans-cultural...,sahaja yoga aims at providing a trans-cultural...
435,214513703,the library system has branches in the beautif...,the library system has branches in the suburbs...
190,21786706,zablocki (1997) and amintrani (2001) assert th...,zablocki (1997) and amintrani (2001) cite apa ...
414,230981873,the culture of taiwan is a hybrid blend of con...,the culture of taiwan is a hybrid blend of con...
96,248919990,1977-2000: traditional marriage,1977-2000: opposite-sex marriage
308,260859945,horn and kramer give an explanation of these u...,horn and kramer give an explanation of these c...
618,289044176,hitler made in a public speech in berlin on 4 ...,hitler `s patience was exhausted only after a ...



Removed 37 outliers from test dataset.

Outliers removed from test dataset:


Unnamed: 0,id,source,target
104,9121384,bitch is a republic.,egypt is a republic.
121,12808397,during his student years he was active in stud...,during his student years he was active in stud...
309,35939558,"the old market cross in lymm known as ""the cro...","the old market cross in lymm known as ""the cro..."
714,46181875,"the judge added more ominously : ""if he does n...","the judge added more: ""if he does not repent, ..."
861,56172585,"arrest, trial, and death","arrest, trial, and execution"
990,65221643,its good fun to catch the rabbits and break th...,such hunters often claim that it's good fun to...
727,76786928,the vandals may have given their name to the r...,the vandals may have given their name to the r...
778,85073053,she is only 5 feet tall.,she is 5 feet tall.
410,101107569,"subsequently, a free and sovereign india absor...","subsequently, a free and sovereign india absor..."
643,114139531,the current allstar line up is:,the current line up is:


In [17]:
# Display the data after removing outliers (same per-split summary and
# sample rows as before, so the row counts can be compared).
display_data(dataset_dict)


train dataset:
Dataset({
    features: ['id', 'src_tok', 'tgt_tok', 'src_raw', 'tgt_raw', 'src_POS_tags', 'tgt_parse_tags', 'source', 'target', '__index_level_0__'],
    num_rows: 51961
})

Sample rows:
{'id': 165188319, 'src_tok': 'ch ##lor ##of ##or ##m " the molecular life ##sa ##ver " an article at oxford university providing interesting facts about ch ##lor ##of ##or ##m .', 'tgt_tok': 'ch ##lor ##of ##or ##m " the molecular life ##sa ##ver " an article at oxford university providing facts about ch ##lor ##of ##or ##m .', 'src_raw': 'chloroform "the molecular lifesaver" an article at oxford university providing interesting facts about chloroform.', 'tgt_raw': 'chloroform "the molecular lifesaver" an article at oxford university providing facts about chloroform.', 'src_POS_tags': 'NOUN NOUN NOUN NOUN NOUN PUNCT DET ADJ NOUN NOUN NOUN PUNCT DET NOUN ADP NOUN NOUN VERB ADJ NOUN ADP NOUN NOUN NOUN NOUN NOUN PUNCT', 'tgt_parse_tags': 'ROOT ROOT ROOT ROOT ROOT punct det amod dobj dobj 

In [18]:
# Save the processed data for BART seq2seq model
# The path is relative, so it is resolved against the kernel's working
# directory. The same directory is passed to convert_to_classification_dataset
# and read back with load_from_disk in later cells.
# NOTE(review): the reloaded dataset shows only 'id', 'source' and 'target'
# features, so save_processed_data presumably drops the other columns —
# confirm in utils.data.
SEQ2SEQ_PATH = "../../data/seq2seq/"
save_processed_data(dataset_dict, SEQ2SEQ_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/51961 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/672 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/963 [00:00<?, ? examples/s]

In [19]:
# Convert the dataset to classification dataset
# The converter is given the on-disk seq2seq directory rather than the
# in-memory dataset_dict, so the save cell must have run first.
# NOTE(review): the output shows 'text'/'label' features and twice as many
# rows per split as the seq2seq data — presumably each source/target pair
# becomes two labelled rows; confirm the label convention in utils.data.
classification_dataset = convert_to_classification_dataset(SEQ2SEQ_PATH)

In [20]:
# Let's check the per-split summary and sample rows of the classification dataset
display_data(classification_dataset)


train dataset:
Dataset({
    features: ['text', 'label'],
    num_rows: 103922
})

Sample rows:
{'text': 'in 2013, gervais was named person of the year by peta for his selfless work on curbing animal abuse.', 'label': 1}
{'text': "tribadism or tribbing is a form of mutual masturbation, sometimes misleadingly called frottage, in which a woman rubs her vulva against her partner's body for sexual stimulation.", 'label': 1}
{'text': "following the sad demise of ahds, and the cessation of the methods network, the centre for e-research (cerch) was established at king's college london in 2008. the centre's aims are to facilitate interdisciplinary, institutional, national and international collaboration.", 'label': 1}

dev dataset:
Dataset({
    features: ['text', 'label'],
    num_rows: 1344
})

Sample rows:
{'text': 'richards is a popular politician and member of the bermuda senate for the united bermuda party.', 'label': 1}
{'text': 'most of these were arcade ports from american laser game

In [6]:
# Directory the processed seq2seq splits were saved to.
# NOTE(review): this is the same directory as SEQ2SEQ_PATH, spelled out a
# second time (here without the trailing slash) — keep the two in sync.
folder_path = '../../data/seq2seq'
# Load the dataset from disk
seq2seq_dataset = load_from_disk(folder_path)

In [7]:
# Show the per-split summary and sample rows of the dataset reloaded from disk
display_data(seq2seq_dataset)


train dataset:
Dataset({
    features: ['id', 'source', 'target'],
    num_rows: 51961
})

Sample rows:
{'id': 165188319, 'source': 'chloroform "the molecular lifesaver" an article at oxford university providing interesting facts about chloroform.', 'target': 'chloroform "the molecular lifesaver" an article at oxford university providing facts about chloroform.'}
{'id': 123204846, 'source': 'the free software gnu classpath project is only partially compatible with the current version of sun java.', 'target': 'the free software gnu classpath project is partially compatible with the current version of sun java.'}
{'id': 706783956, 'source': 'other campaigners, especially the controversial british activist peter tatchell, attacked him for questioning the universal validity of gay identity.', 'target': 'other campaigners, especially the british activist peter tatchell, attacked him for questioning the universal validity of gay identity.'}

dev dataset:
Dataset({
    features: ['id', 'sour