# Setup

In [4]:
# needs console command: python -m spacy download en
import spacy
import nltk
import pandas as pd
import ast
# Load the English pipeline with the 'ner', 'textcat' and 'tagger' components
# disabled; the parser stays enabled because later cells read the `dep_` label
# of every token.
nlp = spacy.load('en', disable=['ner', 'textcat', 'tagger'])

In [5]:
def load_df(path, list_cols):
    """Read a pipe-delimited CSV and parse stringified Python literals.

    Args:
        path: Location of the CSV file; fields are separated by '|'.
        list_cols: Column name, or list of column names, whose cells hold the
            textual repr of a Python literal (for example "('a', 'b')").

    Returns:
        The loaded DataFrame with every column in `list_cols` converted by
        `ast.literal_eval`.

    Raises:
        KeyError: If a requested column is not present in the file.
        ValueError, SyntaxError: If a cell cannot be parsed as a literal.
    """
    df = pd.read_csv(path, sep='|')
    # Accept a bare column name as well as a list of names.
    if isinstance(list_cols, str):
        list_cols = [list_cols]
    # Convert one column at a time with Series.map: DataFrame.applymap is
    # deprecated in recent pandas releases, and literal_eval needs no lambda.
    for col in list_cols:
        df[col] = df[col].map(ast.literal_eval)
    return df

In [6]:
# Names of the columns that hold tokenised sentences and their POS tags
# (s_* belongs to the simple sentence, n_* to the normal sentence).
s_list_col = 's_sent'
s_pos_col = 's_pos'
n_list_col ='n_sent'
n_pos_col = 'n_pos'

# These columns are stored as stringified tuples and must be parsed on load.
aligned_cols = [s_list_col, s_pos_col, n_list_col, n_pos_col]
aligned_pos = 'TS_LossFunction/code/data/wikipedia_aligned_pos.csv'
df = load_df(aligned_pos, aligned_cols)

raw = 'TS_LossFunction/code/data/wikipedia_aligned.csv'
df_raw = pd.read_csv(raw, sep='|')

# The raw sentences are attached by row index, so both files must describe the
# same rows. A length mismatch would otherwise fill the new columns with NaN
# without any warning.
assert len(df) == len(df_raw), (
    'row count mismatch between %s (%d) and %s (%d)'
    % (aligned_pos, len(df), raw, len(df_raw))
)

df['s_raw'] = df_raw['simple_sentence']
df['n_raw'] = df_raw['normal_sentence']

In [23]:
# Preview the first rows after the raw sentences have been attached.
df.head()

Unnamed: 0,simple_topic,simple_numb,normal_topic,normal_numb,s_sent,s_pos,n_sent,n_pos,s_raw,n_raw
0,"cherokee, oklahoma",0,"cherokee, oklahoma",0,"(cherokee, is, a, city, of, oklahoma, in, the,...","(NN, VBZ, DT, NN, IN, NN, IN, DT, JJ, NNS, .)","(cherokee, is, a, city, in, alfalfa, county, ,...","(NN, VBZ, DT, NN, IN, JJ, NN, ,, NN, ,, JJ, NN...",cherokee is a city of oklahoma in the united s...,"cherokee is a city in alfalfa county , oklahom..."
1,skateboard,2,skateboard,5,"(skateboard, decks, are, normally, between, 28...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)","(skateboard, decks, are, usually, between, 28,...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)",skateboard decks are normally between 28 and 3...,skateboard decks are usually between 28 and 33...
2,skateboard,2,skateboard,5,"(the, bottom, of, the, deck, can, be, printed,...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...","(the, underside, of, the, deck, can, be, print...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...",the bottom of the deck can be printed with a d...,the underside of the deck can be printed with ...
3,skateboard,3,skateboard,6,"(the, longboard, was, made, by, two, surfers, ...","(DT, NN, VBD, VBN, IN, CD, NNS, :, CC, NN, CC,...","(this, was, created, by, two, surfers, ;, ben,...","(DT, VBD, VBN, IN, CD, NNS, :, CC, NN, CC, JJ,...",the longboard was made by two surfers ; ben wh...,this was created by two surfers ; ben whatson ...
4,skateboard,2,skateboard,5,"(other, materials, used, in, making, decks, fi...","(JJ, NNS, VBN, IN, VBG, NNS, NN, ,, NN, ,, NN,...","(some, of, them, have, special, materials, tha...","(DT, IN, PRP, VBP, JJ, NNS, WDT, VBP, TO, VB, ...",other materials used in making decks fiberglas...,some of them have special materials that help ...


# Add dependency tags using spaCy

In [17]:
# Dependency-parse every simple sentence and keep one tuple of labels per row.
# A document that comes back unparsed is recorded as None so that the result
# always has exactly as many entries as the DataFrame has rows.
s_dep = [
    tuple(token.dep_ for token in parsed) if parsed.is_parsed else None
    for parsed in nlp.pipe(df['s_raw'].astype('unicode').values, batch_size=50, n_threads=3)
]

df['s_dep'] = s_dep

# This took 8 minutes on my laptop

In [18]:
# Dependency-parse every normal sentence and keep one tuple of labels per row.
# A document that comes back unparsed is recorded as None so that the result
# always has exactly as many entries as the DataFrame has rows.
n_dep = [
    tuple(token.dep_ for token in parsed) if parsed.is_parsed else None
    for parsed in nlp.pipe(df['n_raw'].astype('unicode').values, batch_size=50, n_threads=3)
]

df['n_dep'] = n_dep

# also 8 minutes

In [19]:
# Preview the frame again, now including the s_dep / n_dep label columns.
df.head()

Unnamed: 0,simple_topic,simple_numb,normal_topic,normal_numb,s_sent,s_pos,n_sent,n_pos,s_raw,n_raw,s_dep,n_dep
0,"cherokee, oklahoma",0,"cherokee, oklahoma",0,"(cherokee, is, a, city, of, oklahoma, in, the,...","(NN, VBZ, DT, NN, IN, NN, IN, DT, JJ, NNS, .)","(cherokee, is, a, city, in, alfalfa, county, ,...","(NN, VBZ, DT, NN, IN, JJ, NN, ,, NN, ,, JJ, NN...",cherokee is a city of oklahoma in the united s...,"cherokee is a city in alfalfa county , oklahom...","(advmod, ROOT, det, attr, prep, pobj, prep, de...","(advmod, ROOT, det, attr, prep, compound, pobj..."
1,skateboard,2,skateboard,5,"(skateboard, decks, are, normally, between, 28...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)","(skateboard, decks, are, usually, between, 28,...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)",skateboard decks are normally between 28 and 3...,skateboard decks are usually between 28 and 33...,"(compound, nsubj, ROOT, advmod, quantmod, numm...","(compound, nsubj, ROOT, advmod, advmod, nummod..."
2,skateboard,2,skateboard,5,"(the, bottom, of, the, deck, can, be, printed,...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...","(the, underside, of, the, deck, can, be, print...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...",the bottom of the deck can be printed with a d...,the underside of the deck can be printed with ...,"(det, nsubjpass, prep, det, pobj, aux, auxpass...","(det, nsubjpass, prep, det, pobj, aux, auxpass..."
3,skateboard,3,skateboard,6,"(the, longboard, was, made, by, two, surfers, ...","(DT, NN, VBD, VBN, IN, CD, NNS, :, CC, NN, CC,...","(this, was, created, by, two, surfers, ;, ben,...","(DT, VBD, VBN, IN, CD, NNS, :, CC, NN, CC, JJ,...",the longboard was made by two surfers ; ben wh...,this was created by two surfers ; ben whatson ...,"(det, nsubjpass, auxpass, ROOT, agent, nummod,...","(nsubjpass, auxpass, ROOT, agent, nummod, pobj..."
4,skateboard,2,skateboard,5,"(other, materials, used, in, making, decks, fi...","(JJ, NNS, VBN, IN, VBG, NNS, NN, ,, NN, ,, NN,...","(some, of, them, have, special, materials, tha...","(DT, IN, PRP, VBP, JJ, NNS, WDT, VBP, TO, VB, ...",other materials used in making decks fiberglas...,some of them have special materials that help ...,"(amod, ROOT, acl, prep, pcomp, dobj, dobj, pun...","(nsubj, prep, pobj, ROOT, amod, dobj, nsubj, r..."


In [8]:
# Persist the frame together with its dependency columns. The file is written
# with the same '|' delimiter that every read in this notebook uses, so it can
# be loaded back the same way as the input files.
df.to_csv('wikipedia_aligned_dep.csv', sep='|')

# Count clauses

In [7]:
def count_all(df, deps):
    """Run every construction counter over the given dependency columns.

    Args:
        df: DataFrame containing the dependency-label columns.
        deps: Names of the columns where the dependency labels are located.

    Returns:
        The DataFrame returned by the last counter; each counter receives the
        result of the previous one.
    """
    counters = (
        count_rel_clauses,
        count_adv_clauses,
        count_coord,
        count_appos,
        count_passive,
        count_parataxis,
    )
    for counter in counters:
        df = counter(df, deps)
    return df

# Relative clauses
def count_rel_clauses(df, deps):
    """Add a per-row count of relative-clause modifiers ('relcl').

    Args:
        df: DataFrame whose cells hold a sequence of dependency labels.
        deps: Iterable of column names to count in.

    Returns:
        The same DataFrame, modified in place, with one new `<col>_relcl`
        column for every column named in `deps`.
    """
    def count(labels):
        # Rows whose parse failed are stored as None (NaN if the frame went
        # through a missing-value conversion); they contribute zero instead
        # of raising AttributeError.
        if labels is None or isinstance(labels, float):
            return 0
        return labels.count('relcl')

    for col in deps:
        df[str(col) + '_relcl'] = df[col].apply(count)
    return df

# Adverbial clauses (adverbial clause modifiers, label 'advcl')
def count_adv_clauses(df, deps):
    """Add a per-row count of adverbial clause modifiers ('advcl').

    Args:
        df: DataFrame whose cells hold a sequence of dependency labels.
        deps: Iterable of column names to count in.

    Returns:
        The same DataFrame, modified in place, with one new `<col>_advcl`
        column for every column named in `deps`.
    """
    def count(labels):
        # Rows whose parse failed are stored as None (NaN if the frame went
        # through a missing-value conversion); they contribute zero instead
        # of raising AttributeError.
        if labels is None or isinstance(labels, float):
            return 0
        return labels.count('advcl')

    for col in deps:
        df[str(col) + '_advcl'] = df[col].apply(count)
    return df

# Coordination
def count_coord(df, deps):
    """Add a per-row count of coordinating conjunctions ('cc').

    Args:
        df: DataFrame whose cells hold a sequence of dependency labels.
        deps: Iterable of column names to count in.

    Returns:
        The same DataFrame, modified in place, with one new `<col>_cc`
        column for every column named in `deps`.
    """
    def count(labels):
        # Rows whose parse failed are stored as None (NaN if the frame went
        # through a missing-value conversion); they contribute zero instead
        # of raising AttributeError.
        if labels is None or isinstance(labels, float):
            return 0
        return labels.count('cc')

    for col in deps:
        df[str(col) + '_cc'] = df[col].apply(count)
    return df

# Apposition
def count_appos(df, deps):
    """Add a per-row count of appositional modifiers ('appos').

    Args:
        df: DataFrame whose cells hold a sequence of dependency labels.
        deps: Iterable of column names to count in.

    Returns:
        The same DataFrame, modified in place, with one new `<col>_appos`
        column for every column named in `deps`.
    """
    def count(labels):
        # Rows whose parse failed are stored as None (NaN if the frame went
        # through a missing-value conversion); they contribute zero instead
        # of raising AttributeError.
        if labels is None or isinstance(labels, float):
            return 0
        return labels.count('appos')

    for col in deps:
        df[str(col) + '_appos'] = df[col].apply(count)
    return df

# Passive voice
def count_passive(df, deps):
    """Add a per-row count of passive subjects ('nsubjpass' plus 'csubjpass').

    Args:
        df: DataFrame whose cells hold a sequence of dependency labels.
        deps: Iterable of column names to count in.

    Returns:
        The same DataFrame, modified in place, with one new `<col>_pass`
        column for every column named in `deps`.
    """
    def count(labels):
        # Rows whose parse failed are stored as None (NaN if the frame went
        # through a missing-value conversion); they contribute zero instead
        # of raising AttributeError.
        if labels is None or isinstance(labels, float):
            return 0
        return labels.count('nsubjpass') + labels.count('csubjpass')

    for col in deps:
        df[str(col) + '_pass'] = df[col].apply(count)
    return df

# Parataxis (non-conjunctive parenthetical/coordinated clause)
def count_parataxis(df, deps):
    """Add a per-row count of parataxis relations ('parataxis').

    Args:
        df: DataFrame whose cells hold a sequence of dependency labels.
        deps: Iterable of column names to count in.

    Returns:
        The same DataFrame, modified in place, with one new `<col>_para`
        column for every column named in `deps`.
    """
    def count(labels):
        # Rows whose parse failed are stored as None (NaN if the frame went
        # through a missing-value conversion); they contribute zero instead
        # of raising AttributeError.
        if labels is None or isinstance(labels, float):
            return 0
        return labels.count('parataxis')

    for col in deps:
        df[str(col) + '_para'] = df[col].apply(count)
    return df

# TODO: constructions that are not counted yet:
#   - Nominalization
#   - Embeddedness
#   - Subordination
#   - SVO or non-standard word order

# Dependency label reference:
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

# Results: Wikipedia Dataset

In [16]:
# The counters add their columns to the frame they are given, so run them on a
# copy and leave `df` itself without the count columns.
df_wiki = count_all(df.copy(), ['s_dep', 'n_dep'])

In [36]:
# (label shown to the reader, suffix of the count column), in display order.
construction_totals = [
    ('Relative Clauses', 'relcl'),
    ('Coordinating Conjunctions', 'cc'),
    ('Apposition', 'appos'),
    ('Adverbial phrases', 'advcl'),
    ('Parataxis instances', 'para'),
    ('Passive voice instances', 'pass'),
]

# Corpus-wide totals, simple sentences first and normal sentences second.
for label, suffix in construction_totals:
    print(label + ' in simple sentences: ' + str(df_wiki['s_dep_' + suffix].sum()))
    print(label + ' in normal sentences: ' + str(df_wiki['n_dep_' + suffix].sum()))

# The passive-voice totals are unexpected.
# Before including them I suspected that a ratio of passive to active verbs
# might be a better measure than the raw count.
# The simple sentences may contain more verbs overall than the normal
# sentences, which would skew these totals.
# Can test this later

Relative Clauses in simple sentences: 22187
Relative Clauses in normal sentences: 26537
Coordinating Conjunctions in simple sentences: 76157
Coordinating Conjunctions in normal sentences: 98136
Apposition in simple sentences: 31363
Apposition in normal sentences: 40183
Adverbial phrases in simple sentences: 24780
Adverbial phrases in normal sentences: 32745
Parataxis instances in simple sentences: 266
Parataxis instances in normal sentences: 325
Passive voice instances in simple sentences: 39525
Passive voice instances in normal sentences: 38249


# Experimentation

In [9]:
# Sample paragraph for the experiments below, written with one sentence per
# line because raw_to_df splits its input on line breaks.
text = """The levels of dissolved oxygen in the world’s ocean waters are declining precipitously.
Just like humans, fish need oxygen to survive, and so in ocean areas experiencing significant levels of oxygen scarcity, or hypoxia, fish populations are plummeting.
Although some hypoxic areas, called “dead zones,” occur naturally, hypoxia in coastal areas and inland waters is caused mainly by agricultural run-off and by discharge of industrial waste waters.
More than one-hundred permanent dead zones, many covering thousands of square miles, exist worldwide today.
Since reproductive success is the most critical factor in the sustainability of any species, the extent of this threat to marine life and genetic diversity can hardly be overstated.
After several months in hypoxic waters, female fish produce fewer eggs.
Moreover, hypoxic conditions serve to alter the normal ratio between two particular hormones manufactured during the embryonic stage when a fish’s gender is determined.
During gestation and under these conditions, the mother produces more testosterone (and less estradiol), which inhibits the development of female reproductive organs and other female characteristics in the embryo, while promoting the development of male traits."""

In [10]:
def raw_to_df(raw):
    """Turn a block of text into a DataFrame with one row per line.

    The text must already contain one sentence per line.

    Args:
        raw: String in which line breaks separate the sentences.

    Returns:
        DataFrame with a 'raw' column (the original line) and a 'sent' column
        (the lower-cased NLTK word tokens of that line).
    """
    lines = raw.splitlines()
    # Tokenise and lower-case in a single pass over the lines.
    lowered = [[word.lower() for word in nltk.word_tokenize(line)] for line in lines]
    frame = pd.DataFrame()
    frame['raw'] = tuple(lines)
    frame['sent'] = tuple(lowered)
    return frame

In [11]:
# Build the one-sentence-per-row test frame from the sample paragraph.
df_test = raw_to_df(text)

In [12]:
# Dependency-parse every sample sentence and keep one tuple of labels per row.
# A document that comes back unparsed is recorded as None so that the result
# always has exactly as many entries as the DataFrame has rows.
dep = [
    tuple(token.dep_ for token in parsed) if parsed.is_parsed else None
    for parsed in nlp.pipe(df_test['raw'].astype('unicode').values, batch_size=50, n_threads=3)
]

df_test['dep'] = dep

In [13]:
# Show each tokenised sentence directly above its dependency labels, with a
# blank line between sentences.
for tokens, labels in zip(df_test['sent'], df_test['dep']):
    print(tokens)
    print(labels)
    print()

['the', 'levels', 'of', 'dissolved', 'oxygen', 'in', 'the', 'world', '’', 's', 'ocean', 'waters', 'are', 'declining', 'precipitously', '.']
('det', 'nsubj', 'prep', 'amod', 'pobj', 'prep', 'det', 'poss', 'case', 'compound', 'pobj', 'aux', 'ROOT', 'advmod', 'punct')

['just', 'like', 'humans', ',', 'fish', 'need', 'oxygen', 'to', 'survive', ',', 'and', 'so', 'in', 'ocean', 'areas', 'experiencing', 'significant', 'levels', 'of', 'oxygen', 'scarcity', ',', 'or', 'hypoxia', ',', 'fish', 'populations', 'are', 'plummeting', '.']
('advmod', 'prep', 'pobj', 'punct', 'nsubj', 'ROOT', 'nsubj', 'aux', 'ccomp', 'punct', 'cc', 'advmod', 'prep', 'compound', 'pobj', 'acl', 'amod', 'dobj', 'prep', 'compound', 'pobj', 'punct', 'cc', 'conj', 'punct', 'compound', 'nsubj', 'aux', 'conj', 'punct')

['although', 'some', 'hypoxic', 'areas', ',', 'called', '“', 'dead', 'zones', ',', '”', 'occur', 'naturally', ',', 'hypoxia', 'in', 'coastal', 'areas', 'and', 'inland', 'waters', 'is', 'caused', 'mainly', 'by', 

In [14]:
# Add the per-sentence construction counts computed from the 'dep' column.
df_test = count_all(df_test, ['dep'])

In [15]:
# Preview the sample sentences together with their construction counts.
df_test.head()

Unnamed: 0,raw,sent,dep,dep_relcl,dep_advcl,dep_cc,dep_appos,dep_pass,dep_para
0,The levels of dissolved oxygen in the world’s ...,"[the, levels, of, dissolved, oxygen, in, the, ...","(det, nsubj, prep, amod, pobj, prep, det, poss...",0,0,0,0,0,0
1,"Just like humans, fish need oxygen to survive,...","[just, like, humans, ,, fish, need, oxygen, to...","(advmod, prep, pobj, punct, nsubj, ROOT, nsubj...",0,0,2,0,0,0
2,"Although some hypoxic areas, called “dead zone...","[although, some, hypoxic, areas, ,, called, “,...","(mark, det, amod, nsubj, punct, advcl, nmod, a...",0,2,2,0,1,0
3,"More than one-hundred permanent dead zones, ma...","[more, than, one-hundred, permanent, dead, zon...","(amod, quantmod, compound, punct, nummod, amod...",0,0,0,1,0,0
4,Since reproductive success is the most critica...,"[since, reproductive, success, is, the, most, ...","(mark, amod, nsubj, advcl, det, advmod, amod, ...",0,1,1,0,1,0
