## Data Exploration

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
from nltk.corpus import stopwords

In [24]:
# Paths to the four dataset files (full corpus plus the train/test/dev
# splits), given relative to the notebook's working directory.
data_dir = '../../data'
data_full = data_dir + '/biased.full'
data_train = data_dir + '/biased.word.train'
data_test = data_dir + '/biased.word.test'
data_dev = data_dir + '/biased.word.dev'

### Data Dictionary

1. **id:** A unique identifier used to link to a Wikipedia Diff view
    
    _Example:_ 532355971 (Links to https://en.wikipedia.org/w/index.php?diff=532355971)

2. **src_tok:** Tokenized source text

    _Example:_ she did not do as promised exposing her as an un ##pr ##in ##ci ##pled politician .

3. **tgt_tok:** Tokenized target text

    _Example:_ she did not do , leading to accusations of her being an un ##pr ##in ##ci ##pled politician .

4. **src_raw:** Raw source text

    _Example:_ she did not do as promised exposing her as an unprincipled politician.

5. **tgt_raw:** Raw target text

    _Example:_ she did not do , leading to accusations of her being an unprincipled politician.

6. **src_POS_tags:** Part-of-speech tags for the source text

    _Example:_ PRON VERB ADV VERB ADP VERB VERB PRON ADP DET ADJ ADJ ADJ ADJ ADJ NOUN PUNCT

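7. **tgt_parse_tags:** Syntactic parse tags for the target text using the Stanford Parser. _Note:_ in the examples shown in this notebook the number of tags matches the **source** tokens (`src_tok`) rather than the target tokens, so confirm which text the tags align with before pairing them with `tgt_tok`.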

    _Example:_ nsubj aux neg ROOT mark advcl xcomp dobj prep det amod amod amod amod amod pobj punct

In [25]:
# Column names applied to every file. They are passed via `names=`, so the
# first line of each file is read as data rather than as a header row.
COLUMN_NAMES = ["id", "src_tok", "tgt_tok", "src_raw", "tgt_raw", "src_POS_tags", "tgt_parse_tags"]

# Load the full corpus. Rows with the wrong number of fields are skipped and
# reported as warnings instead of aborting the load.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (where it raises a TypeError).
full = pd.read_csv(data_full, sep='\t', names=COLUMN_NAMES, on_bad_lines='warn')

# Load the train / test / dev splits with the default (strict) handling of
# malformed rows, as before.
train = pd.read_csv(data_train, sep='\t', names=COLUMN_NAMES)
test = pd.read_csv(data_test, sep='\t', names=COLUMN_NAMES)
dev = pd.read_csv(data_dev, sep='\t', names=COLUMN_NAMES)



  full = pd.read_csv(data_full, sep='\t', names=["id", "src_tok", "tgt_tok", "src_raw", "tgt_raw", "src_POS_tags", "tgt_parse_tags"], error_bad_lines=False)
b'Skipping line 60908: expected 7 fields, saw 9\n'


In [26]:
# Preview the first five rows of the full dataset.
full.head(n=5)

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,258378316,"during the campaign , controversy erupted over...","during the campaign , some pointed out alleged...","during the campaign, controversy erupted over ...","during the campaign, some pointed out alleged ...",ADP DET NOUN PUNCT NOUN VERB ADP VERB NOUN ADP...,prep det pobj punct nsubj ROOT prep amod pobj ...
1,486527143,nic ##aea was con ##vo ##ked by the emperor co...,nic ##aea was con ##vo ##ked by the emperor co...,nicaea was convoked by the emperor constantine...,nicaea was convoked by the emperor constantine...,NOUN NOUN VERB VERB VERB VERB ADP DET NOUN NOU...,nsubjpass nsubjpass auxpass ROOT ROOT ROOT age...
2,54024499,it was rather unfortunate that he ve ##hem ##e...,he ve ##hem ##ently opposed the bud ##ding ind...,it was rather unfortunate that he vehemently o...,he vehemently opposed the budding indian scien...,PRON VERB ADV ADJ ADP PRON ADV ADV ADV VERB DE...,nsubj ROOT advmod acomp mark nsubj advmod advm...
3,160186886,dennis the menace is an american animated seri...,dennis the menace is an american animated seri...,dennis the menace is an american animated seri...,dennis the menace is an american animated seri...,VERB DET NOUN VERB DET ADJ VERB NOUN VERB ADP ...,csubj det dobj ROOT det amod amod attr acl age...
4,8797183,"today , on large farms , motorcycles , dogs or...","today , on large farms , motorcycles , dogs or...","today, on large farms, motorcycles, dogs or me...","today, on large farms, motorcycles, dogs or pe...",NOUN PUNCT ADP ADJ NOUN PUNCT NOUN PUNCT NOUN ...,npadvmod punct prep amod pobj punct conj punct...


In [27]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,165188319,"ch ##lor ##of ##or ##m "" the molecular life ##...","ch ##lor ##of ##or ##m "" the molecular life ##...","chloroform ""the molecular lifesaver"" an articl...","chloroform ""the molecular lifesaver"" an articl...",NOUN NOUN NOUN NOUN NOUN PUNCT DET ADJ NOUN NO...,ROOT ROOT ROOT ROOT ROOT punct det amod dobj d...
1,123204846,the free software gnu class ##path project is ...,the free software gnu class ##path project is ...,the free software gnu classpath project is onl...,the free software gnu classpath project is par...,DET ADJ NOUN NOUN NOUN NOUN NOUN VERB ADV ADV ...,det amod nmod compound compound compound nsubj...
2,706783956,"other campaign ##ers , especially the controve...","other campaign ##ers , especially the british ...","other campaigners, especially the controversia...","other campaigners, especially the british acti...",ADJ NOUN NOUN PUNCT ADV DET ADJ ADJ NOUN ADJ N...,amod nsubj nsubj punct advmod det amod amod am...
3,612378448,vocalist rob half ##ord ' s performance is con...,vocalist rob half ##ord ' s performance is con...,vocalist rob halford's performance is consider...,vocalist rob halford's performance is consider...,ADJ X NOUN NOUN PUNCT PART NOUN VERB VERB NUM ...,amod amod poss poss punct case nsubjpass auxpa...
4,876796337,the proud general is a chinese animated featur...,the proud general is a chinese animated featur...,the proud general is a chinese animated featur...,the proud general is a chinese animated featur...,DET ADJ NOUN VERB DET ADJ VERB NOUN NOUN VERB ...,det amod nsubj ROOT det amod amod attr attr ac...


In [28]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,318427508,"in april 2009 a brazilian human rights group ,...","in april 2009 a brazilian human rights group ,...","in april 2009 a brazilian human rights group, ...","in april 2009 a brazilian human rights group, ...",ADP NOUN NUM DET ADJ ADJ NOUN NOUN PUNCT NOUN ...,prep pobj nummod det amod amod compound nsubj ...
1,235640083,the 51 day stand ##off and ensuing murder of 7...,the 51 day stand ##off and ensuing deaths of 7...,the 51 day standoff and ensuing murder of 76 m...,the 51 day standoff and ensuing deaths of 76 m...,DET NUM NOUN NOUN NOUN CCONJ VERB NOUN ADP NUM...,det nummod compound nsubj nsubj cc amod conj p...
2,37561168,"mark o ##ate ##n ( born 8 march 1964 , watford...","mark o ##ate ##n ( born 8 march 1964 , watford...","mark oaten (born 8 march 1964, watford) is a d...","mark oaten (born 8 march 1964, watford) is a l...",NOUN ADJ ADJ ADJ PUNCT VERB NUM NOUN NUM PUNCT...,nsubj amod amod amod punct parataxis nummod np...
3,101665256,another infamous period of colon ##isation in ...,another period of colon ##isation in ancient t...,another infamous period of colonisation in anc...,another period of colonisation in ancient time...,DET ADJ NOUN ADP NOUN NOUN ADP ADJ NOUN VERB A...,det amod nsubj prep pobj pobj prep amod pobj R...
4,480248865,photo sequence of astonishing 2005 chicago ##l...,photo sequence of 2005 chicago ##land crash wi...,photo sequence of astonishing 2005 chicagoland...,photo sequence of 2005 chicagoland crash with ...,NOUN NOUN ADP VERB NUM NOUN NOUN NOUN ADP ADJ ...,compound ROOT prep amod nummod compound compou...


In [29]:
# Preview the first five rows of the dev split.
dev.head(n=5)

Unnamed: 0,id,src_tok,tgt_tok,src_raw,tgt_raw,src_POS_tags,tgt_parse_tags
0,3257810,in addition to sponsoring palestinian terror a...,in addition to sponsoring palestinian attacks ...,in addition to sponsoring palestinian terror a...,in addition to sponsoring palestinian attacks ...,ADP NOUN ADP VERB ADJ NOUN NOUN ADP PROPN PUNC...,prep pobj prep pcomp amod compound dobj prep p...
1,7455549,the game is currently played in 47 countries w...,the game claims to be currently played in 47 c...,the game is currently played in 47 countries w...,the game claims to be currently played in 47 c...,DET NOUN VERB ADV VERB ADP NUM NOUN ADP ADJ AD...,det nsubjpass auxpass advmod ROOT prep nummod ...
2,524547829,no part of the valley lies in the area current...,no part of the valley lies in the area current...,no part of the valley lies in the area current...,no part of the valley lies in the area current...,DET NOUN ADP DET NOUN VERB ADP DET NOUN ADV VE...,det nsubj prep det pobj ROOT prep det pobj adv...
3,842911055,scholars perceived that it was disco ##rdan ##...,scholars argued that it was disco ##rdan ##t w...,scholars perceived that it was discordant with...,scholars argued that it was discordant with th...,NOUN VERB ADP PRON VERB ADJ ADJ ADJ ADP DET AD...,nsubj ROOT mark nsubj ccomp acomp acomp acomp ...
4,302188700,"since the chinese civil war in 1949 , taiwan h...","since the chinese civil war in 1949 , taiwan h...","since the chinese civil war in 1949, taiwan ha...","since the chinese civil war in 1949, taiwan ha...",ADP DET ADJ ADJ NOUN ADP NUM PUNCT PROPN VERB ...,prep det amod amod pobj prep pobj punct nsubjp...


In [30]:
# Show the (rows, columns) shape of each dataset, in the order
# full, train, test, dev.
display(
    full.shape,
    train.shape,
    test.shape,
    dev.shape,
)

(181473, 7)

(53803, 7)

(1000, 7)

(700, 7)

In [31]:
# Print every field of one example record from the full dataset,
# one "column value" pair per line.
sample_row = 78  # index label of the record to inspect
for col in full.columns:
    print(col, full.loc[sample_row, col])

id 755060188
src_tok unlike the previous films , a good day to die hard was the first die hard film to be a critical failure , receiving overwhelmingly negative reviews for its imp ##laus ##ible action sequences , cl ##iche ##d script , weak plot , moore ' s direction , and lack of characterization , although the special effects were praised .
tgt_tok unlike the previous films , a good day to die hard was the first die hard film to be a critical failure , receiving overwhelmingly negative reviews which cited imp ##laus ##ible action sequences , cl ##iche ##d script , weak plot , moore ' s direction , and lack of characterization , although the special effects were praised .
src_raw unlike the previous films, a good day to die hard was the first die hard film to be a critical failure, receiving overwhelmingly negative reviews for its implausible action sequences, cliched script, weak plot, moore's direction, and lack of characterization, although the special effects were praised.
tgt_ra

In [32]:
# Print every field of one example record from the training split,
# one "column value" pair per line.
sample_row = 78  # index label of the record to inspect
for col in train.columns:
    print(col, train.loc[sample_row, col])

id 815664961
src_tok there he lived in a single room flat at the renowned ana ##rka ##li bazaar , lahore .
tgt_tok there he lived in a single room flat at the ana ##rka ##li bazaar , lahore .
src_raw there he lived in a single room flat at the renowned anarkali bazaar, lahore.
tgt_raw there he lived in a single room flat at the anarkali bazaar, lahore.
src_POS_tags ADV PRON VERB ADP DET ADJ NOUN ADJ ADP DET ADJ ADJ ADJ ADJ NOUN PUNCT NOUN PUNCT
tgt_parse_tags advmod nsubj ROOT prep det amod pobj amod prep det amod amod amod amod pobj punct npadvmod punct


In [33]:
# Print every field of one example record from the test split,
# one "column value" pair per line.
sample_row = 78  # index label of the record to inspect
for col in test.columns:
    print(col, test.loc[sample_row, col])

id 169232861
src_tok the text of the two private secrets were both handed to blessed pope pius ix on july 18 1851 . this text was rediscovered by reverend father marcel co ##rte ##ville m . s .
tgt_tok the text of the two private secrets were both handed to pope pius ix on july 18 1851 . this text was rediscovered by reverend father marcel co ##rte ##ville m . s .
src_raw the text of the two private secrets were both handed to blessed pope pius ix on july 18 1851. this text was rediscovered by reverend father marcel corteville m.s.
tgt_raw the text of the two private secrets were both handed to pope pius ix on july 18 1851. this text was rediscovered by reverend father marcel corteville m.s.
src_POS_tags DET NOUN ADP DET NUM ADJ NOUN VERB DET VERB ADP ADJ NOUN NOUN ADP ADP PROPN NUM NUM PUNCT DET NOUN VERB VERB ADP ADJ NOUN NOUN NOUN NOUN NOUN NOUN PUNCT VERB PUNCT
tgt_parse_tags det nsubjpass prep det nummod amod pobj auxpass dep ROOT prep amod compound pobj prep prep pobj nummod numm