In [1]:
import spacy 
from spacy.matcher import Matcher
import textacy
import pandas as pd 
import numpy as np 

# Name of the spaCy pipeline used to parse the article text in later cells.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

from spacy.symbols import NOUN, PROPN, VERB
from spacy.tokens import Doc, Span, Token

In [2]:
# Load the article table and keep only rows that have text to parse.
# NOTE: the frame is named `cnn`, but the file read here is
# breitbart_articles.csv.
cnn = (
    pd.read_csv("~/Documents/moral_templates/Data/breitbart_articles.csv")
    # Rows with a missing 'clean_strings' value have no article text.
    .dropna(subset=['clean_strings'])
)



In [3]:
def subject_verb_object_triples(doc):
    """Yield one record for every subject/verb/object combination in ``doc``.

    Each sentence in ``doc.sents`` is scanned for its main verbs using the
    helpers in ``textacy.spacier.utils``. A verb is skipped when the helpers
    return no subjects or no objects for it. For every remaining verb, each
    subject is paired with each object.

    Yields:
        tuple: ``(start_i, subj, subj_dep, subj_tag, verb, obj, obj_dep,
        obj_tag, end_pos)`` where

        * ``start_i`` is the ``.i`` index of the sentence's first token,
        * ``subj``, ``verb`` and ``obj`` are slices of the sentence,
        * ``subj_dep``/``subj_tag`` and ``obj_dep``/``obj_tag`` are the
          ``dep_`` and ``tag_`` of the single subject/object token returned
          by the helper (not of the widened slice),
        * ``end_pos`` is the last index of the object's span.
    """
    for sentence in doc.sents:
        # Helper spans are converted to sentence-relative slice positions by
        # subtracting the index of the sentence's first token.
        offset = sentence[0].i
        helpers = textacy.spacier.utils

        for main_verb in helpers.get_main_verbs_of_sent(sentence):
            subject_tokens = helpers.get_subjects_of_verb(main_verb)
            if not subject_tokens:
                continue
            object_tokens = helpers.get_objects_of_verb(main_verb)
            if not object_tokens:
                continue

            # Widen the verb to the span reported for it and its auxiliaries.
            verb_bounds = helpers.get_span_for_verb_auxiliaries(main_verb)
            verb_phrase = sentence[
                verb_bounds[0] - offset : verb_bounds[1] - offset + 1
            ]

            for subject_token in subject_tokens:
                subject_dep = subject_token.dep_
                subject_tag = subject_token.tag_
                # The subject slice runs from the start of its compound-noun
                # span up to and including the subject token itself.
                compound_start = helpers.get_span_for_compound_noun(subject_token)[0]
                subject_phrase = sentence[
                    compound_start - offset : subject_token.i - offset + 1
                ]

                for object_token in object_tokens:
                    # Nouns take their compound span, verbs their auxiliary
                    # span; anything else stays a single token.
                    if object_token.pos == NOUN:
                        object_bounds = helpers.get_span_for_compound_noun(object_token)
                    elif object_token.pos == VERB:
                        object_bounds = helpers.get_span_for_verb_auxiliaries(object_token)
                    else:
                        object_bounds = (object_token.i, object_token.i)

                    object_dep = object_token.dep_
                    object_tag = object_token.tag_
                    object_phrase = sentence[
                        object_bounds[0] - offset : object_bounds[1] - offset + 1
                    ]
                    end_pos = object_bounds[1]

                    yield (
                        offset,
                        subject_phrase,
                        subject_dep,
                        subject_tag,
                        verb_phrase,
                        object_phrase,
                        object_dep,
                        object_tag,
                        end_pos,
                    )

In [4]:
# Create a function for creating the dataframe 

def create_svo_dataframe(doc):
    """Collect the records from ``subject_verb_object_triples(doc)`` in a frame.

    Args:
        doc: Passed unchanged to ``subject_verb_object_triples``.

    Returns:
        pandas.DataFrame: One row per yielded record, with the columns
        ``subject, verb, object, start, end, subj_dep, subj_tag, obj_dep,
        obj_tag`` in that order. When nothing is yielded the frame has these
        columns and no rows.
    """
    # Position of each field inside the tuples yielded by
    # subject_verb_object_triples.
    yielded_fields = (
        'start', 'subject', 'subj_dep', 'subj_tag', 'verb',
        'object', 'obj_dep', 'obj_tag', 'end',
    )
    # Column order of the returned frame (differs from the yield order).
    column_order = (
        'subject', 'verb', 'object', 'start', 'end',
        'subj_dep', 'subj_tag', 'obj_dep', 'obj_tag',
    )

    # One list per column; dict insertion order fixes the column order.
    columns = {name: [] for name in column_order}
    for triple in subject_verb_object_triples(doc):
        for name, value in zip(yielded_fields, triple):
            columns[name].append(value)

    return pd.DataFrame(columns)


In [5]:
# Parse every article and gather its SVO rows into one frame.
# The per-article frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied all previously accumulated rows on every iteration.
article_frames = []

# Iterate the two needed columns directly instead of building a row Series
# with cnn.iloc[i] twice per article.
for i, (text, publication) in enumerate(zip(cnn['clean_strings'], cnn['publication'])):
    doc = nlp(text)
    df = create_svo_dataframe(doc)
    df['Document'] = i  # positional row number within `cnn`, not its index label
    df['publication'] = publication
    article_frames.append(df)
    if (i % 100 == 0):
        print(f'working on article {i}')

# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no articles to process.
if article_frames:
    svodf = pd.concat(article_frames, ignore_index=True)
else:
    svodf = pd.DataFrame()

svodf.tail(25)

working on article 0
working on article 100
working on article 200
working on article 300
working on article 400
working on article 500
working on article 600
working on article 700
working on article 800
working on article 900
working on article 1000
working on article 1100
working on article 1200
working on article 1300
working on article 1400
working on article 1500
working on article 1600
working on article 1700
working on article 1800
working on article 1900
working on article 2000
working on article 2100
working on article 2200
working on article 2300
working on article 2400
working on article 2500
working on article 2600
working on article 2700
working on article 2800
working on article 2900
working on article 3000
working on article 3100
working on article 3200
working on article 3300
working on article 3400
working on article 3500
working on article 3600
working on article 3700
working on article 3800
working on article 3900
working on article 4000
working on article 4100
work

Unnamed: 0,subject,verb,object,start,end,subj_dep,subj_tag,obj_dep,obj_tag,Document,publication
481592,(Clinton),"(would, appoint)",(liberal),1882.0,1886.0,nsubj,NNP,dobj,NN,23779,Breitbart
481593,(Clinton),"(would, never, make)",(policy),1882.0,1914.0,nsubj,NNP,dobj,NN,23779,Breitbart
481594,(person),"(could, side)","(immigration, policy)",1986.0,2014.0,nsubj,NN,dobj,NN,23779,Breitbart
481595,(justices),"(to, strike)",(parts),1986.0,2008.0,nsubj,NNS,dobj,NNS,23779,Breitbart
481596,(Trump),"(to, create)",(policy),2016.0,2037.0,nsubj,JJ,dobj,NN,23779,Breitbart
481597,(lawyer),(proves),(himself),2050.0,2061.0,nsubj,NN,dobj,PRP,23779,Breitbart
481598,(lawyer),(proves),(be),2050.0,2063.0,nsubj,NN,xcomp,VB,23779,Breitbart
481599,(nation),"(has, threatened)","(to, flood)",0.0,30.0,nsubj,NN,xcomp,VB,23780,Breitbart
481600,(president),"(has, threatened)","(to, flood)",0.0,30.0,nsubj,NN,xcomp,VB,23780,Breitbart
481601,(all),(approved),(vote),74.0,88.0,nsubj,DT,dobj,NN,23780,Breitbart


In [6]:
# Step 1: keep only rows whose object dependency label is 'dobj'.
is_dobj = svodf['obj_dep'] == 'dobj'
svodf_filtered = svodf[is_dobj]

# Step 2: of those, keep only rows whose object tag is one of these four.
kept_object_tags = ["NN", "NNS", "NNP", "NNPS"]
has_kept_tag = svodf_filtered['obj_tag'].isin(kept_object_tags)
svodf_final = svodf_filtered[has_kept_tag]

svodf_final.tail(50)


Unnamed: 0,subject,verb,object,start,end,subj_dep,subj_tag,obj_dep,obj_tag,Document,publication
481541,(provision),(gives),"(branches, discretion)",303.0,328.0,nsubj,NN,dobj,NN,23779,Breitbart
481542,(who),(enters),(country),303.0,333.0,nsubj,WP,dobj,NN,23779,Breitbart
481544,(Court),(quoted),(case),413.0,419.0,nsubj,NNP,dobj,NN,23779,Breitbart
481545,(Court),"(has, sustained)",(power),413.0,434.0,nsubj,NNP,dobj,NN,23779,Breitbart
481546,(Trump),(impose),(ban),445.0,455.0,nsubj,NNP,dobj,NN,23779,Breitbart
481550,(statute),(conveys),(authority),565.0,571.0,nsubj,NN,dobj,NN,23779,Breitbart
481551,(president),"(to, enact)","(policy, advocates)",565.0,580.0,nsubj,NN,dobj,NNS,23779,Breitbart
481552,(statute),"(does, not, delegate)",(power),597.0,606.0,nsubj,NN,dobj,NN,23779,Breitbart
481553,(Congress),"(could, amend)",(law),597.0,620.0,nsubj,NNP,dobj,NN,23779,Breitbart
481554,(voters),"(can, consider)",(religion),725.0,741.0,nsubj,NNS,dobj,NN,23779,Breitbart


In [7]:
# Write the filtered triples to disk as CSV.
triplets_csv_path = '~/Documents/moral_templates/Data/bb_triplets_dataset.csv'
svodf_final.to_csv(triplets_csv_path)