# Setup

In [2]:
# needs console command: python -m spacy download en
import spacy
import nltk
import pandas as pd
import ast
# Only the dependency labels (`dep_`) are read from the parsed docs below, so the
# NER, text-categorizer and tagger components are switched off when loading.
nlp = spacy.load('en', disable=['ner', 'textcat', 'tagger'])

In [3]:
def load_df(path, list_cols):
    """Read a pipe-separated CSV and parse its literal-encoded columns.

    The columns named in ``list_cols`` hold Python literals serialized as text
    (e.g. ``"('NN', 'VBZ')"``); each cell is turned back into the corresponding
    Python object with ``ast.literal_eval``.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``; fields are separated by ``|``.
    list_cols : str or iterable of str
        Name(s) of the column(s) whose cells must be parsed.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the requested columns parsed.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    ValueError, SyntaxError
        If a non-empty cell is not a valid Python literal.
    """
    df = pd.read_csv(path, sep='|')

    # Accept a single column name as well as a collection of names.
    columns = [list_cols] if isinstance(list_cols, str) else list(list_cols)

    for col in columns:
        # Series.map is used per column instead of DataFrame.applymap, which is
        # deprecated in recent pandas. na_action='ignore' leaves empty cells
        # (read back as NaN) untouched instead of letting literal_eval raise.
        df[col] = df[col].map(ast.literal_eval, na_action='ignore')
    return df

In [4]:
# Names of the columns that are stored as Python literals in the aligned/POS CSV
# and therefore have to be parsed by load_df.
s_list_col = 's_sent'
s_pos_col = 's_pos'
n_list_col ='n_sent'
n_pos_col = 'n_pos'

aligned_cols = [s_list_col, s_pos_col, n_list_col, n_pos_col]
aligned_pos = 'data/wikipedia_aligned_pos.csv'
df = load_df(aligned_pos, aligned_cols)

raw = 'data/wikipedia_aligned.csv'
df_raw = pd.read_csv(raw, sep='|')

# The raw sentences are attached to df purely by row position (both frames carry
# the default index from read_csv). If the two files ever differ in length the
# assignment below would silently misalign or fill NaN, so fail loudly instead.
assert len(df) == len(df_raw), (
    'row count mismatch between %s (%d rows) and %s (%d rows)'
    % (aligned_pos, len(df), raw, len(df_raw))
)

df['s_raw'] = df_raw['simple_sentence']
df['n_raw'] = df_raw['normal_sentence']

In [5]:
# Preview the first rows: parsed token/POS columns plus the attached raw sentences.
df.head()

Unnamed: 0,simple_topic,simple_numb,normal_topic,normal_numb,s_sent,s_pos,n_sent,n_pos,s_raw,n_raw
0,"cherokee, oklahoma",0,"cherokee, oklahoma",0,"(cherokee, is, a, city, of, oklahoma, in, the,...","(NN, VBZ, DT, NN, IN, NN, IN, DT, JJ, NNS, .)","(cherokee, is, a, city, in, alfalfa, county, ,...","(NN, VBZ, DT, NN, IN, JJ, NN, ,, NN, ,, JJ, NN...",cherokee is a city of oklahoma in the united s...,"cherokee is a city in alfalfa county , oklahom..."
1,skateboard,2,skateboard,5,"(skateboard, decks, are, normally, between, 28...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)","(skateboard, decks, are, usually, between, 28,...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)",skateboard decks are normally between 28 and 3...,skateboard decks are usually between 28 and 33...
2,skateboard,2,skateboard,5,"(the, bottom, of, the, deck, can, be, printed,...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...","(the, underside, of, the, deck, can, be, print...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...",the bottom of the deck can be printed with a d...,the underside of the deck can be printed with ...
3,skateboard,3,skateboard,6,"(the, longboard, was, made, by, two, surfers, ...","(DT, NN, VBD, VBN, IN, CD, NNS, :, CC, NN, CC,...","(this, was, created, by, two, surfers, ;, ben,...","(DT, VBD, VBN, IN, CD, NNS, :, CC, NN, CC, JJ,...",the longboard was made by two surfers ; ben wh...,this was created by two surfers ; ben whatson ...
4,skateboard,2,skateboard,5,"(other, materials, used, in, making, decks, fi...","(JJ, NNS, VBN, IN, VBG, NNS, NN, ,, NN, ,, NN,...","(some, of, them, have, special, materials, tha...","(DT, IN, PRP, VBP, JJ, NNS, WDT, VBP, TO, VB, ...",other materials used in making decks fiberglas...,some of them have special materials that help ...


# Add dependency tags using spaCy

In [6]:
# Parse every simple sentence with spaCy and keep one tuple of dependency labels
# per row. A doc that comes back unparsed is recorded as None so the result keeps
# exactly one entry per DataFrame row.
simple_texts = df['s_raw'].astype('unicode').values
s_dep = [
    tuple([token.dep_ for token in parsed]) if parsed.is_parsed else None
    for parsed in nlp.pipe(simple_texts, batch_size=50, n_threads=3)
]

df['s_dep'] = s_dep

# This took 8 minutes on my laptop

In [9]:
# Same procedure for the normal sentences: one tuple of dependency labels per
# row, or None when the doc is not parsed, so the list lines up with df.
normal_texts = df['n_raw'].astype('unicode').values
n_dep = [
    tuple([token.dep_ for token in parsed]) if parsed.is_parsed else None
    for parsed in nlp.pipe(normal_texts, batch_size=50, n_threads=3)
]

df['n_dep'] = n_dep

# also 8 minutes

In [10]:
# Preview again to check that the s_dep / n_dep columns were added.
df.head()

Unnamed: 0,simple_topic,simple_numb,normal_topic,normal_numb,s_sent,s_pos,n_sent,n_pos,s_raw,n_raw,s_dep,n_dep
0,"cherokee, oklahoma",0,"cherokee, oklahoma",0,"(cherokee, is, a, city, of, oklahoma, in, the,...","(NN, VBZ, DT, NN, IN, NN, IN, DT, JJ, NNS, .)","(cherokee, is, a, city, in, alfalfa, county, ,...","(NN, VBZ, DT, NN, IN, JJ, NN, ,, NN, ,, JJ, NN...",cherokee is a city of oklahoma in the united s...,"cherokee is a city in alfalfa county , oklahom...","(advmod, ROOT, det, attr, prep, pobj, prep, de...","(advmod, ROOT, det, attr, prep, compound, pobj..."
1,skateboard,2,skateboard,5,"(skateboard, decks, are, normally, between, 28...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)","(skateboard, decks, are, usually, between, 28,...","(NN, NNS, VBP, RB, IN, CD, CC, CD, NNS, RB, .)",skateboard decks are normally between 28 and 3...,skateboard decks are usually between 28 and 33...,"(compound, nsubj, ROOT, advmod, quantmod, numm...","(compound, nsubj, ROOT, advmod, advmod, nummod..."
2,skateboard,2,skateboard,5,"(the, bottom, of, the, deck, can, be, printed,...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...","(the, underside, of, the, deck, can, be, print...","(DT, NN, IN, DT, NN, MD, VB, VBN, IN, DT, NN, ...",the bottom of the deck can be printed with a d...,the underside of the deck can be printed with ...,"(det, nsubjpass, prep, det, pobj, aux, auxpass...","(det, nsubjpass, prep, det, pobj, aux, auxpass..."
3,skateboard,3,skateboard,6,"(the, longboard, was, made, by, two, surfers, ...","(DT, NN, VBD, VBN, IN, CD, NNS, :, CC, NN, CC,...","(this, was, created, by, two, surfers, ;, ben,...","(DT, VBD, VBN, IN, CD, NNS, :, CC, NN, CC, JJ,...",the longboard was made by two surfers ; ben wh...,this was created by two surfers ; ben whatson ...,"(det, nsubjpass, auxpass, ROOT, agent, nummod,...","(nsubjpass, auxpass, ROOT, agent, nummod, pobj..."
4,skateboard,2,skateboard,5,"(other, materials, used, in, making, decks, fi...","(JJ, NNS, VBN, IN, VBG, NNS, NN, ,, NN, ,, NN,...","(some, of, them, have, special, materials, tha...","(DT, IN, PRP, VBP, JJ, NNS, WDT, VBP, TO, VB, ...",other materials used in making decks fiberglas...,some of them have special materials that help ...,"(amod, ROOT, acl, prep, pcomp, dobj, dobj, pun...","(nsubj, prep, pobj, ROOT, amod, dobj, nsubj, r..."


In [11]:
# Persist the frame with the dependency columns so the slow parse need not be rerun.
# NOTE(review): this writes to the working directory with the default ',' separator
# and includes the index, whereas the inputs above live under data/ and are read
# with sep='|' — confirm that whatever reloads this file expects that format.
df.to_csv('wikipedia_aligned_dep.csv')

# Count clauses

In [72]:
def _count_dep_labels(dep_labels, targets):
    """Return how many entries of ``dep_labels`` equal one of ``targets``.

    ``dep_labels`` is one cell of the ``s_dep`` / ``n_dep`` columns: a sequence
    of dependency labels, or a missing value (None, or NaN after a CSV round
    trip) for a sentence whose parse failed. A missing parse counts as 0, since
    calling ``.count`` on it would raise.
    """
    is_missing = dep_labels is None or (
        isinstance(dep_labels, float) and dep_labels != dep_labels
    )
    if is_missing:
        return 0
    return sum(dep_labels.count(label) for label in targets)


def _add_dep_counts(df, feature, targets):
    """Add ``s_<feature>`` and ``n_<feature>`` count columns to ``df`` in place."""
    for side in ('s', 'n'):
        df[side + '_' + feature] = df[side + '_dep'].apply(
            lambda dep_labels: _count_dep_labels(dep_labels, targets)
        )
    return df


# Relative clauses
def count_rel_clauses(df):
    """Count ``relcl`` labels per sentence.

    Adds the columns ``s_relcl`` and ``n_relcl`` to ``df`` (modified in place)
    from ``s_dep`` and ``n_dep``, and returns the same ``df``.
    """
    return _add_dep_counts(df, 'relcl', ('relcl',))


# Adverbial phrases(clauses)
def count_adv_clauses(df):
    """Count ``advcl`` labels per sentence.

    Adds the columns ``s_advcl`` and ``n_advcl`` to ``df`` (modified in place)
    and returns the same ``df``.
    """
    return _add_dep_counts(df, 'advcl', ('advcl',))


# Coordination
def count_coord(df):
    """Count ``cc`` labels per sentence.

    Adds the columns ``s_coord`` and ``n_coord`` to ``df`` (modified in place)
    and returns the same ``df``. Only exact ``cc`` labels are counted; labels
    that merely contain "cc" (such as ``ccomp``) are not.
    """
    return _add_dep_counts(df, 'coord', ('cc',))


# Apposition
def count_appos(df):
    """Count ``appos`` labels per sentence.

    Adds the columns ``s_appos`` and ``n_appos`` to ``df`` (modified in place)
    and returns the same ``df``.
    """
    return _add_dep_counts(df, 'appos', ('appos',))


# Passive voice
def count_passive(df):
    """Count passive subjects (``nsubjpass`` plus ``csubjpass``) per sentence.

    Adds the columns ``s_pass`` and ``n_pass`` to ``df`` (modified in place)
    and returns the same ``df``.
    """
    return _add_dep_counts(df, 'pass', ('nsubjpass', 'csubjpass'))

# Features still to implement that seem more important

# Nominalization
# Embeddedness
# Subordination
# SVO or non-standard word order
# Clause order (??)

# Less important/supported

# Number of verbs, nouns, adjectives, adverbs, pronouns (all separate features) (I don't know if we will use these but it will be easy to implement)
# Incidence of content words
# Incidence of functional words
# ‘Raw’ frequency of content words
# ‘Minimal’ frequency of content words
# Average number of verb hypernyms (a word with a broad meaning constituting a category into which words with more specific meanings fall; a superordinate. For example, colour is a hypernym of red.)
# Incidence of NPs
# Number of NP modifiers
# Number of words before the main verb
# Number of high level constituents
# Number of personal pronouns
# Type-token ratio
# Pronoun-NP ratio
# Number of ‘and’, ‘or’, ‘if’, negations, logic operators, connectives of various shapes and sizes
# Verb, noun, adverb, adjective ambiguity ratios (?)

# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

# Results

In [55]:
# Add the per-sentence relative-clause counts, then report corpus-wide totals.
df = count_rel_clauses(df)
simple_relcl_total = df['s_relcl'].sum()
normal_relcl_total = df['n_relcl'].sum()
print('Relative Clauses in simple sentences: ' + str(simple_relcl_total))
print('Relative Clauses in normal sentences: ' + str(normal_relcl_total))

Relative Clauses in simple sentences: 22174
Relative Clauses in normal sentences: 26532


In [59]:
# Add the per-sentence coordination counts, then report corpus-wide totals.
df = count_coord(df)
simple_coord_total = df['s_coord'].sum()
normal_coord_total = df['n_coord'].sum()
print('Coordinating Conjunctions in simple sentences: ' + str(simple_coord_total))
print('Coordinating Conjunctions in normal sentences: ' + str(normal_coord_total))

Coordinating Conjunctions in simple sentences: 76130
Coordinating Conjunctions in normal sentences: 98109


In [67]:
# Add the per-sentence apposition counts, then report corpus-wide totals.
df = count_appos(df)
simple_appos_total = df['s_appos'].sum()
normal_appos_total = df['n_appos'].sum()
print('Apposition in simple sentences: ' + str(simple_appos_total))
print('Apposition in normal sentences: ' + str(normal_appos_total))

Apposition in simple sentences: 31346
Apposition in normal sentences: 40161


In [71]:
# Add the per-sentence `advcl` counts, then report corpus-wide totals.
df = count_adv_clauses(df)
simple_advcl_total = df['s_advcl'].sum()
normal_advcl_total = df['n_advcl'].sum()
print('Adverbial phrases in simple sentences: ' + str(simple_advcl_total))
print('Adverbial phrases in normal sentences: ' + str(normal_advcl_total))

Adverbial phrases in simple sentences: 24773
Adverbial phrases in normal sentences: 32733


In [73]:
# Add the per-sentence passive-subject counts, then report corpus-wide totals.
df = count_passive(df)
simple_pass_total = df['s_pass'].sum()
normal_pass_total = df['n_pass'].sum()
print('Passive voice instances in simple sentences: ' + str(simple_pass_total))
print('Passive voice instances in normal sentences: ' + str(normal_pass_total))

# This produces unexpected results: the simple sentences show more passive
# instances than the normal ones.
# Even before including this feature I suspected that a ratio of passive to
# active verbs might be a better measure than the sheer number.
# My suspicion is that the simple sentences contain more verbs overall than the
# normal sentences, which would skew these totals.
# Can test this later.

Passive voice instances in simple sentences: 39506
Passive voice instances in normal sentences: 38234
