In [2]:
import pandas as pd
import pickle
import re

In [3]:
# beggin with uppercase and end with strong punctuation
def uper_punct(sentence):

    strong_punct = [".","!","?"]
    if sentence[0].isupper() and sentence[-1] in strong_punct:
        return sentence
    else:
        return None

In [4]:
# limit the sentence size between 4 and 30 words
def limit_length(tokens):
    if (len(tokens) >= 4 and len(tokens) <= 30):
        return tokens
    else:
        return None

In [5]:
# check if a dependency root and a subject exist 
def root_subj_presence(dep_list):
  
  dependencies = [tup2 for tup1,tup2 in dep_list]

  # if a verb in finite verb form exist we keep that row
  if ('root' in dependencies) and ('nsubj' in dependencies):
    return dep_list
  else:
    return None

In [6]:
# check that at least one finite verb exist
def finite_verb(features_list):

    features = ' '.join(features_list)

    # if a verb in finite verb form exist we keep that row
    if ('Fin' in features):
        return features_list
    else:
        return None

In [7]:
# context dependence based on structural connectives
def struct_connectives(sentence, upos, dep, xpos):
  
    allowed_connectives = {'either':'or', 'not only':'but also','not':'but', 
                        'neither':'nor', 'such':'that', 'scarcely':'when',
                         'as many':'as','both':'and', 'whether':'or', 
                         'just as':'so', 'the':'the', 'as':'as', 'as much':'as', 
                        'no sooner':'than', 'rather':'than'}

    xpos_verbs = {"VB","VBD","VBG","VBN","VBP","VBZ"}

        # if it starts with a coordinating conjunction or a subordinating conjunction it is context depended
        if upos[0] == 'CCONJ' or upos[0] == 'SCONJ':   

            # we incorporate the xpos tags for better accuracy on identifying the verbs
            xpos_list = ['V' if x in xpos_verbs else x for x in xpos]

            # if a second clause does not exist it is context depended
            if xpos_list.count('V')==1 and upos.count('VERB')==1 and upos.count('CCONJ')==1 and upos.count('SCONJ')==1:

                # it is not context depended only in the case it starts with an allowed connective
                allowed_flag = False
                for k, v in allowed_connectives.items():
                    if sentence.lower().startswith(k):    
                        keep_key = k
                        keep_value = v
                        allowed_flag = True
                        break

                # if the sentence starts with an allowed connective
                if allowed_flag:
                    split_sent = sentence.lower().replace(keep_key,"")

                # check whether both of the necessary connectives of an allowed connective patternt exist (to avoid sentences like: "Both went there.")
                if keep_value in split_sent:
                    return sentence
                else:
                    return None

        else:
            return None

    return sentence

In [8]:
# context dependence based on pronominal anaphora
def pronom_anaphora(sentence, upos, dependency, features, head):

    # in case there is no pronoun in the sentence, the sentence is ignored
    if 'PRON' in upos:

        # obtain all the indices from all the existing pronouns third person singular pronouns and demonstrative pronouns in the sentence
        pron_indices = [i for i, feat in enumerate(features) if ('|Gender=Neut|Number=Sing|Person=3|PronType=Prs' in feat) or ('PronType=Dem' in feat)]

        if pron_indices:
            for pron in pron_indices:

                # if 'which' is following after such a pronoun (indication of a relative clause starting with it), we consider no-anaphora
                if ('which' in sentence.lower()[pron:]):
                    return sentence

        # in case only a third person singular pronoun exist
        elif any('|Gender=Neut|Number=Sing|Person=3|PronType=Prs' in feat for feat in features):

            pron_indices = [i for i, feat in enumerate(features) if '|Gender=Neut|Number=Sing|Person=3|PronType=Prs' in feat]

            # we examine whether the occurence of that pronoun is pleonastic, in which case there is no context-dependence
            if any(dep[1] == 'expl' for dep in dependency):
                expl_indices = [i for i, dep in enumerate(dependency) if dep[1] == 'expl']

                for expl in expl_indices:
                  # if a pleonastic (expletive) relation exist and it does not directed to the target pronoun, we discard the sentence
                  if not any(head[expl]== pron_index for pron_index in pron_indices):
                    return None
                  else:
                    return sentence
            else:
                return None

        # if only a demonstrative pronoun exist, the sentence is context-depended
        elif any('PronType=Dem' in feat for feat in features):
            return None

    return sentence

In [9]:
def execute_selection(data):

    # beggin with uppercase and end with strong punctuation
    data['Sentence'] = data['Sentence'].apply(uper_punct)
    data = data.dropna()

    # limit the sentence size between 4 and 25 words
    data['Tokens'] = data['Tokens'].apply(limit_length)
    data = data.dropna()

    # check if a dependency root and a subject exist 
    data['Dependency'] = data['Dependency'].apply(root_subj_presence)
    data = data.dropna()

    # check that at least one finite verb exist
    data['Features'] = data['Features'].apply(finite_verb)
    data = data.dropna()

    # context dependence based on structural connectives
    data["Sentence"] = data[["Sentence", "Upos", "Dependency", "Xpos"]].apply(lambda x : struct_connectives(*x), axis=1)
    data = data.dropna()

    # context dependence based on pronominal anaphora
    data["Sentence"] = data[["Sentence", "Upos", "Dependency", "Features", "Head"]].apply(lambda x : pronom_anaphora(*x), axis=1)
    data = data.dropna()

    return data

## Best sentences selection on Wikipedia corpus

In [10]:
wiki_data = pd.read_pickle("./data/misc/wikipedia_sentences_parsed.pkl")
clean_wiki_frame = execute_selection(wiki_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
clean_wiki_frame.to_pickle("./data/misc/wikipedia_sentences_parsed_selection.pkl")

In [12]:
clean_wiki_frame

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head
0,Reuven Rivlin has been the President since Jul...,"[Reuven, Rivlin, has, been, the, President, si...","[reuven, rivlin, have, be, the, president, sin...","[PROPN, PROPN, AUX, AUX, DET, NOUN, ADP, PROPN...","[NNP, NNP, VBZ, VBN, DT, NN, IN, NNP, CD, .]","[(president, nsubj), (reuven, flat), (presiden...","[Number=Sing, Number=Sing, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[6, 1, 6, 6, 6, 0, 8, 6, 8, 6]"
1,The volcanic soil of the islands proved to be ...,"[The, volcanic, soil, of, the, islands, proved...","[the, volcanic, soil, of, the, island, prove, ...","[DET, ADJ, NOUN, ADP, DET, NOUN, VERB, PART, A...","[DT, JJ, NN, IN, DT, NNS, VBD, TO, VB, JJ, IN,...","[(soil, det), (soil, amod), (proved, nsubj), (...","[Definite=Def|PronType=Art, Degree=Pos, Number...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[3, 3, 7, 6, 6, 3, 0, 10, 10, 7, 13, 13, 10, 7]"
2,"After the Sharpeville Massacre, the UN tried t...","[After, the, Sharpeville, Massacre, ,, the, UN...","[after, the, sharpeville, massacre, ,, the, un...","[ADP, DET, ADJ, NOUN, PUNCT, DET, PROPN, VERB,...","[IN, DT, JJ, NN, ,, DT, NNP, VBD, TO, VB, JJ, ...","[(massacre, case), (massacre, det), (massacre,...","[_, Definite=Def|PronType=Art, Degree=Pos, Num...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 8, 8, 7, 8, 0, 10, 8, 12, 10, 14, 10..."
3,The paws have three soft toe pads and retracti...,"[The, paws, have, three, soft, toe, pads, and,...","[the, paw, have, three, soft, toe, pad, and, r...","[DET, NOUN, VERB, NUM, ADJ, NOUN, NOUN, CCONJ,...","[DT, NNS, VBP, CD, JJ, NN, NNS, CC, JJ, NNS, .]","[(paws, det), (have, nsubj), (root, root), (pa...","[Definite=Def|PronType=Art, Number=Plur, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[2, 3, 0, 7, 7, 7, 3, 10, 10, 7, 3]"
4,The stone is on the ice in front of the foot i...,"[The, stone, is, on, the, ice, in, front, of, ...","[the, stone, be, on, the, ice, in, front, of, ...","[DET, NOUN, AUX, ADP, DET, NOUN, ADP, NOUN, AD...","[DT, NN, VBZ, IN, DT, NN, IN, NN, IN, DT, NN, ...","[(stone, det), (ice, nsubj), (ice, cop), (ice,...","[Definite=Def|PronType=Art, Number=Sing, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 6, 6, 6, 6, 0, 8, 6, 11, 11, 8, 14, 14, 8, 6]"
...,...,...,...,...,...,...,...,...,...
216178,"In a few other dictatorships, such as Saudi Ar...","[In, a, few, other, dictatorships, ,, such, as...","[in, a, few, other, dictatorship, ,, such, as,...","[ADP, DET, ADJ, ADJ, NOUN, PUNCT, ADJ, ADP, AD...","[IN, DT, JJ, JJ, NNS, ,, JJ, IN, JJ, NNP, ,, D...","[(dictatorships, case), (dictatorships, det), ...","[_, Definite=Ind|PronType=Art, Degree=Pos, Deg...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 16, 5, 10, 7, 10, 5, 16, 14, 14, ..."
216179,Abstract art is modern art which does not repr...,"[Abstract, art, is, modern, art, which, does, ...","[abstract, art, be, modern, art, which, do, no...","[ADJ, NOUN, AUX, ADJ, NOUN, PRON, AUX, PART, V...","[JJ, NN, VBZ, JJ, NN, WDT, VBZ, RB, VB, NNS, I...","[(art, amod), (art, nsubj), (art, cop), (art, ...","[Degree=Pos, Number=Sing, Mood=Ind|Number=Sing...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 9, 9, 9, 5, 9, 14, 14, 14, 10, 5]"
216181,This is a casual relationship is usually only ...,"[This, is, a, casual, relationship, is, usuall...","[this, be, a, casual, relationship, be, usuall...","[PRON, AUX, DET, ADJ, NOUN, AUX, ADV, ADV, ADP...","[DT, VBZ, DT, JJ, NN, VBZ, RB, RB, IN, NN, CC,...","[(relationship, nsubj), (relationship, cop), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 10, 10, 10, 10, 10, 0, 15, 15, 15..."
216182,It also cost about $3.9 billion.,"[It, also, cost, about, $, 3.9, billion, .]","[it, also, cost, about, $, 3.9, billion, .]","[PRON, ADV, VERB, ADV, SYM, NUM, NUM, PUNCT]","[PRP, RB, VBD, RB, $, CD, CD, .]","[(cost, nsubj), (cost, advmod), (root, root), ...",[Case=Nom|Gender=Neut|Number=Sing|Person=3|Pro...,"[1, 2, 3, 4, 5, 6, 7, 8]","[3, 3, 0, 5, 3, 7, 5, 3]"


## Best sentences selection on BNC corpus

In [16]:
bnc_data = pd.read_pickle("./data/misc/bnc_sentences_parsed.pkl")
clean_bnc_frame = execute_selection(bnc_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
clean_bnc_frame.to_pickle("./data/misc/bnc_sentences_parsed_selection.pkl")

In [19]:
clean_bnc_frame

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head
0,The interaction of long chain molecules with l...,"[The, interaction, of, long, chain, molecules,...","[the, interaction, of, long, chain, molecule, ...","[DET, NOUN, ADP, ADJ, NOUN, NOUN, ADP, NOUN, A...","[DT, NN, IN, JJ, NN, NNS, IN, NNS, VBZ, IN, JJ...","[(interaction, det), (interest, nsubj), (molec...","[Definite=Def|PronType=Art, Number=Sing, _, De...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 12, 6, 6, 6, 2, 8, 6, 12, 12, 12, 0, 19, 1..."
1,When an amorphous polymer is mixed with a suit...,"[When, an, amorphous, polymer, is, mixed, with...","[when, a, amorphous, polymer, be, mix, with, a...","[SCONJ, DET, ADJ, NOUN, AUX, VERB, ADP, DET, A...","[WRB, DT, JJ, NN, VBZ, VBN, IN, DT, JJ, NN, ,,...","[(mixed, mark), (polymer, det), (polymer, amod...","[PronType=Int, Definite=Ind|PronType=Art, Degr...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 4, 4, 6, 6, 13, 10, 10, 10, 6, 13, 13, 0, ..."
2,"In a ' poor ' solvent, the interactions are fe...","[In, a, ', poor, ', solvent, ,, the, interacti...","[in, a, ', poor, ', solvent, ,, the, interacti...","[ADP, DET, PUNCT, ADJ, PUNCT, NOUN, PUNCT, DET...","[IN, DT, ``, JJ, '', NN, ,, DT, NNS, VBP, JJR,...","[(solvent, case), (solvent, det), (solvent, pu...","[_, Definite=Ind|PronType=Art, _, Degree=Pos, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 6, 6, 6, 6, 11, 11, 9, 11, 11, 0, 18, 14, ..."
3,The fundamental thermodynamic equation used to...,"[The, fundamental, thermodynamic, equation, us...","[the, fundamental, thermodynamic, equation, us...","[DET, ADJ, ADJ, NOUN, VERB, PART, VERB, DET, N...","[DT, JJ, JJ, NN, VBN, TO, VB, DT, NNS, VBZ, DT...","[(equation, det), (equation, amod), (equation,...","[Definite=Def|PronType=Art, Degree=Pos, Degree...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 10, 4, 7, 5, 9, 7, 0, 16, 16, 16, 15..."
4,This is valid only for components of comparabl...,"[This, is, valid, only, for, components, of, c...","[this, be, valid, only, for, component, of, co...","[PRON, AUX, ADJ, ADV, ADP, NOUN, ADP, ADJ, NOU...","[DT, VBZ, JJ, RB, IN, NNS, IN, JJ, NN, ,, CC, ...","[(valid, nsubj), (valid, cop), (root, root), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[3, 3, 0, 6, 6, 3, 9, 9, 6, 24, 24, 24, 15, 15..."
...,...,...,...,...,...,...,...,...,...
197201,Nonetheless I must stand my ground and restate...,"[Nonetheless, I, must, stand, my, ground, and,...","[nonetheless, I, must, stand, my, ground, and,...","[ADV, PRON, AUX, VERB, PRON, NOUN, CCONJ, VERB...","[RB, PRP, MD, VB, PRP$, NN, CC, VB, IN, RB, RB...","[(stand, advmod), (stand, nsubj), (stand, aux)...","[_, Case=Nom|Number=Sing|Person=1|PronType=Prs...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 0, 6, 4, 8, 4, 14, 14, 14, 14, 14, 8..."
197202,How much longer will everyone ignore this phen...,"[How, much, longer, will, everyone, ignore, th...","[how, much, long, will, everyone, ignore, this...","[ADV, ADV, ADV, AUX, PRON, VERB, DET, NOUN, PU...","[WRB, RB, RBR, MD, NN, VB, DT, NN, .]","[(much, advmod), (longer, advmod), (ignore, ad...","[PronType=Int, Degree=Pos, Degree=Cmp, VerbFor...","[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 3, 6, 6, 6, 0, 8, 6, 6]"
197204,"If so, it shows the town suffering more than i...","[If, so, ,, it, shows, the, town, suffering, m...","[if, so, ,, it, show, the, town, suffer, more,...","[SCONJ, ADV, PUNCT, PRON, VERB, DET, NOUN, VER...","[IN, RB, ,, PRP, VBZ, DT, NN, VBG, JJR, IN, PR...","[(so, mark), (shows, advcl), (shows, punct), (...","[_, _, _, Case=Nom|Gender=Neut|Number=Sing|Per...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 7, 5, 7, 8, 13, 13, 13, 8, 17,..."
197205,I doubt if many Scottish historians would take...,"[I, doubt, if, many, Scottish, historians, wou...","[I, doubt, if, many, scottish, historian, woul...","[PRON, VERB, SCONJ, ADJ, ADJ, NOUN, AUX, VERB,...","[PRP, VBP, IN, JJ, JJ, NNS, MD, VB, DT, NN, RB...","[(doubt, nsubj), (root, root), (take, mark), (...","[Case=Nom|Number=Sing|Person=1|PronType=Prs, M...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[2, 0, 8, 6, 6, 8, 8, 2, 10, 8, 8, 2]"
