In [None]:
# Performs the token-base rule matching using spaCy on all news article bodies in the dataset.

In [130]:
import pandas as pd
import json
import mlconjug
import spacy
import en_core_web_sm
from spacy.matcher import Matcher
nlp = en_core_web_sm.load()

In [152]:
def findFirstDate(doc, start, end):
    date = ""
    for i in range(start,end):
        if doc[i].ent_type_ == 'DATE':
            if doc[i].ent_iob_ == 'B' and doc[i+1].ent_type_ == 'DATE':
                date += doc[i].text +" "+ doc[i+1].text
            else:
                date = doc[i].text
            return date

In [153]:
newsdata = pd.read_csv('Hindu-Scraper/output.csv', error_bad_lines=False, warn_bad_lines=False)
newsdata['is_event'] = 0
newsdata

Unnamed: 0,title,location,date,body,keywords,output,target_date,actors,is_event
0,SC suspends eco clearance for international ai...,NEW DELHI,"April 01, 2019 00:00 IST",The health of the environment is key to preser...,"transparent institutions ,” justice chandrachu...",0,,,0
1,‘Bid to dislodge AIADMK from T.N. won’t succeed’,Vellore,"April 01, 2019 00:00 IST",Deputy Chief Minister O. Panneerselvam has c...,‘ two leaves ’ symbol vellore lok sabha candid...,0,,,0
2,Centre turns down Bengal’s request for paramil...,Kolkata,"April 01, 2019 00:00 IST",The Mamata Banerjee government’s plea to ret...,000 paramilitary personnel posted mamata baner...,0,,,0
3,Prayers before polls,Tirunelveli,"April 01, 2019 00:00 IST",Whenever she happened to pass by the Kotturp...,late chief minister jayalalithaa ’ roman catho...,0,,,0
4,‘Christian missionaries’ contribution to Tamil...,MADURAI,"April 01, 2019 00:00 IST",The contribution of Christian missionaries to ...,tamil nadu bishops ’ council antony pappusamy ...,0,,,0
5,Cases booked for violation of code,THANJAVUR,"April 01, 2019 00:00 IST",The Tamil University police have booked cases ...,civil supplies corporation employees associati...,1,,,0
6,Tipparaju Hawaldar’s supporters angry,RAICHUR,"April 01, 2019 00:00 IST",The row over the denial of ticket from the Ra...,former raichur rural mla tipparaju hawaladar r...,0,,,0
7,"BJP, a betrayer: Mutharasan",THANJAVUR,"April 01, 2019 00:00 IST","R.Mutharasan, State secretary, Communist Party...",tamil nadu state leader claims thanjavur lok s...,0,,,0
8,31brief2,Vijayawada,"April 01, 2019 00:00 IST",BJP State president Kanna Lakshminarayana se...,ap planning board vice chairman c bjp state pr...,0,,,0
9,Water level,Tirunelveli,"April 01, 2019 00:00 IST",Water level in Papanasam dam on Sunday stood...,sunday stood papanasam dam manimuthar dam 75 c...,0,,,0


In [154]:
verb_list = ['organize', 'plan', 'announce', 'prepare', 'demand', 'stage', 'call', 'hold']
noun_list = ['demonstration', 'march', 'protest', 'strike', 'bandh', 'dharna', 'union', 'riot', 'march', 'gathering', 'attack']

In [155]:
phrase_list = []
for v in verb_list:
    for n in noun_list:
        phrase_list.append((v, n))
phrase_list

[('organize', 'demonstration'),
 ('organize', 'march'),
 ('organize', 'protest'),
 ('organize', 'strike'),
 ('organize', 'bandh'),
 ('organize', 'dharna'),
 ('organize', 'union'),
 ('organize', 'riot'),
 ('organize', 'march'),
 ('organize', 'gathering'),
 ('organize', 'attack'),
 ('plan', 'demonstration'),
 ('plan', 'march'),
 ('plan', 'protest'),
 ('plan', 'strike'),
 ('plan', 'bandh'),
 ('plan', 'dharna'),
 ('plan', 'union'),
 ('plan', 'riot'),
 ('plan', 'march'),
 ('plan', 'gathering'),
 ('plan', 'attack'),
 ('announce', 'demonstration'),
 ('announce', 'march'),
 ('announce', 'protest'),
 ('announce', 'strike'),
 ('announce', 'bandh'),
 ('announce', 'dharna'),
 ('announce', 'union'),
 ('announce', 'riot'),
 ('announce', 'march'),
 ('announce', 'gathering'),
 ('announce', 'attack'),
 ('prepare', 'demonstration'),
 ('prepare', 'march'),
 ('prepare', 'protest'),
 ('prepare', 'strike'),
 ('prepare', 'bandh'),
 ('prepare', 'dharna'),
 ('prepare', 'union'),
 ('prepare', 'riot'),
 ('prepar

In [156]:
# Generate rules for matching
# VP TOKEN* NP
# NP TOKEN* VP
def generate_patterns(phrase_list):
    patterns = []
    for phrase in phrase_list:
        patterns.append([{'LEMMA': phrase[0], 'POS': 'VERB'},
                   {'IS_ALPHA': True, 'OP': '*'},
                   {'POS': 'NOUN', 'LEMMA': phrase[1]}])
        
        patterns.append([{'LEMMA': phrase[1], 'POS': 'NOUN'},
               {'IS_ALPHA': True, 'OP': '*'},
               {'POS': 'VERB', 'LEMMA': phrase[0]}])          
    return patterns

def generate_date_patterns(phrase_list):
    patterns = []
    for phrase in phrase_list:
        patterns.append([{'LEMMA': phrase[0], 'POS': 'VERB'},
                   {'IS_ALPHA': True, 'OP': '*'},
                   {'POS': 'NOUN', 'LEMMA': phrase[1]}, 
                {'IS_ALPHA': True, 'OP': '*'},
               {'ENT_TYPE': 'DATE'}])
        
        patterns.append([{'LEMMA': phrase[1], 'POS': 'NOUN'},
               {'IS_ALPHA': True, 'OP': '*'},
               {'POS': 'VERB', 'LEMMA': phrase[0]},
                {'IS_ALPHA': True, 'OP': '*'},
           {'ENT_TYPE': 'DATE'}])              
    return patterns
patterns = generate_patterns(phrase_list)
date_patterns = generate_date_patterns(phrase_list)

In [158]:
# Token-based rule matching is performed. Takes a while.
matcher = Matcher(nlp.vocab)
matcher.add("RiotPlanning", None, *patterns)
newsdata['raw_target_date'] = ''
newsdata['actors'] = ''
newsdata['actors'] = newsdata.astype(object)
for row, article in newsdata.iterrows():
    doc = nlp(article.get('body'))
    matches = matcher(doc)
    if matches:
        newsdata.at[row, 'is_event'] = 1
        newsdata.at[row, 'actors'] = [e.text for e in doc.ents if e[0].ent_type_ in ['ORG', 'GPE', 'PERSON']]
        for match_id, start, end in matches:
            date = findFirstDate(doc, start, len(doc))
            newsdata.at[row,'raw_target_date'] = date
            break
print('Done')

Done


In [94]:
newsdata.at[423, 'body']
# newsdata.to_csv('Hindu-Scraper/newsJ2MLabelledAll.csv', sep='|', index=False)