In [1]:
import spacy
import en_core_web_sm
import pandas as pd
import pickle as pkl
import re
from spacy.pipeline import merge_entities
nlp = en_core_web_sm.load()
nlp.add_pipe(merge_entities)

In [2]:
base = './top-300/'
my_dataset = base + 'all.txt'
deepak_dataset = base + 'combined-raw-data.csv'

In [3]:
def process(keywords):
    keywords_hypen = [x.replace(" ", "-") for x in keywords]
    total_keywords = list(set(keywords + keywords_hypen))
    return total_keywords

def is_phrase_in(phrase, text):
    return re.search(r"\b{}(s|es)?\b".format(phrase), text, re.IGNORECASE) is not None

In [4]:
keywords_aadhar = ['Aadhaar', 'UIDAI', 'unique identity number', 'UID', \
            'unique Aadhaar number', 'Unique Identification', \
            'Adhar', 'Aadhar', 'Adhaar', 'Adharcard', 'Aadharcard', \
            'Aadhaarcard', 'Aadhar Card','Aadhar', 'Aadhaar', 'Adhar',\
            'Adharcard', 'Aadharcard', 'Aadhaarcard', 'UIDAI', 'Aadhar Card', \
            '12-digit', 'Enrolment ID', 'Enrollment ID', 'Enrolment Number', \
            'Enrollment Number', 'Personal Identification Number', 'E-Aadhaar', 'UID/EID', \
            'E-Adhar', 'E-Aadhar', 'E-Adhaar', 'Pankaj Kumar', 'uidai.gov.in', \
            'Aadhaar-link', 'Adhaar-link', 'Aadhar-link', 'Adhar-link', \
            'Aadhaar-enable', 'Adhaar-enable', 'Aadhar-enable', 'Adhar-enable'
            ]

keywords_farmers = ['loan waiver', 'loan waivers', 'farmer loan', 'farmer suicide','farmer suicides',\
                    'pest infestation', 'farmer loans','drought','farmer', 'farmers', 'crop insurance',\
                    'Swaminathan Commission', 'National Commission on Farmer', 'kisan', 'agriculture',\
                    'monsoon failure', 'crop failure', 'fertilizers', 'Seeds Corporation', 'crop loss',\
                    'crop losses', 'unseasonal rains', 'irrigation facilities', 'debt traps',\
                   'loan waiver', 'farmer loan', 'farmer suicide', 'pest infestation', 'Swaminathan Commission',\
                    'National Commission on Farmer','kisan', 'monsoon failure', 'crop failure',\
                    'fertilizer', 'Seeds Corporation', 'agricultural', 'crop prices', \
                   'agrarian', 'NCSTC', 'farming community', 'debt waiver', 'waiver scheme', 'farm loan', \
                   'crop loan', 'farmer suicide', 'farmers suicide', 'farmer agitation', 'plight farmer', \
                   'distressed farmer', 'farmer issue', 'farmers\' protest', 'farmer protest', 'agrarian crisis', \
                   'agrarian unrest', 'agriculture protest', 'farmers\' march', 'farmers march', 'farmer march']

keywords_demon = ['Rs 1,000 notes', 'Rs 500 notes', 'lower denomination', 'Rs 500 and Rs 1,000 notes',\
                 'demonetisation', 'denomination note', 'cash withdrawal', 'swipe machine', 'unaccounted money',\
                 'withdrawal limit', 'black money', 'long queue', 'cashless transaction', 'cashless economy',\
                 'demonitis', 'demonitiz', 'swipe machine', 'pos machine', 'fake currency', 'digital payment',\
                 'digital transaction', 'cash transaction', 'cashless economy', 'cash crunch', 'currency switch'\
                 , 'demonetised note', 'cashless transaction', 'note ban', 'currency switch','ATMs', 'now-defunct currency',\
                 'demonitis', 'demonitiz', 'denomination note', 'cash withdrawal', 'swipe machine', 'unaccounted money', 'withdrawal limit', \
                  'pos machine', 'fake currency', 'digital payment', 'digital transaction', 'cash transaction', 'cashless economy',\
                  'black money', 'cash crunch', 'currency switch', 'long queue', 'demonetised note',\
                  'cashless transaction', 'note ban', 'currency switch', 'demonetis', 'demonetiz', 'denomination note']

keywords_gst = ['GST', 'Goods and Services Tax', 'Goods & Services Tax', 'Excise Duty',\
                'good and service tax', 'tax reform', 'goods and services tax', 'gst', 'taxpayers',\
               'GST', 'Goods and Services Tax', 'Goods & Services Tax', 'excise duty', \
               'GSTIN', 'CGST', 'SGST', 'IGST', 'Reverse Charge', 'GSTR', 'GSP', 'Suvidha Provider', \
               'HSN', 'gabbar singh tax', 'goods service tax', 'good service tax']

keywords_tech = ['Privacy', 'Cashless', 'Technology', 'Technological', 'Innovation', 'Software', 'Engineering', 'High Technology',\
            'Technical', 'Tech', 'Personal Data Protection', 'Big Data', 'Artificial Intelligence', 'Digital India', \
                'high speed internet', 'make in india', 'e-governance', 'umang', 'digital literacy', 'national policy on electronics', \
                'npe', 'e-gadget']

policies = {"aadhar":process(keywords_aadhar), "demon":process(keywords_demon), "farmers":process(keywords_farmers), "gst":process(keywords_gst), "tech":process(keywords_tech)}

### Combine Dataset

In [5]:
all_sentences = []
with open(my_dataset, 'r') as file:
    for line in file:
        sentence = line.strip().lower()
        if(len(sentence) > 0):
            all_sentences.append(sentence)
print('Num sentences in my dataset: ', len(all_sentences))

Num sentences in my dataset:  49662


In [6]:
df = pd.read_csv(deepak_dataset)
deepak_sentences = list(set(df['statement']))
print('Num sentences in Deepak dataset: ', len(deepak_sentences))
for sentence in deepak_sentences:
    refined_sentence = sentence.strip().lower()
    if(len(refined_sentence) > 0):
        all_sentences.append(refined_sentence)
all_sentences = list(set(all_sentences))
print('Num sentences in final dataset: ', len(all_sentences))

Num sentences in Deepak dataset:  1989
Num sentences in final dataset:  49921


In [7]:
output = open(base + 'refined-all-by-stmnts.txt','w')
for line in all_sentences:
    output.write(line + '\n')
output.close()

### Identify policy

In [8]:
def check_policy(sent, policies):
    p = []
    words = []
    for policy, keywords in policies.items():
        for word in keywords:
            if is_phrase_in(word, sent):
                words.append(word)
                p.append(policy)
                break
    if len(p)>1:
        print(p)
    return p, words

In [9]:
output = open(base + 'out-all-by-stmnts.txt','w')
with open(base + 'refined-all-by-stmnts.txt','r') as file:
    for line in file:
        p, words = check_policy(line, policies)
        p = " ".join(p)
        words = " ".join(words)
        output.write(p+" -- "+words+"\n")
output.close()

['demon', 'tech']
['aadhar', 'farmers']
['farmers', 'tech']
['demon', 'tech']
['demon', 'tech']
['farmers', 'tech']
['farmers', 'tech']
['demon', 'farmers']
['farmers', 'gst']
['demon', 'tech']
['demon', 'tech']
['farmers', 'tech']
['aadhar', 'tech']
['farmers', 'tech']
['aadhar', 'tech']
['demon', 'farmers']
['demon', 'gst']
['farmers', 'tech']
['farmers', 'tech']
['demon', 'tech']
['demon', 'tech']
['farmers', 'tech']
['demon', 'gst']
['demon', 'tech']
['farmers', 'tech']
['aadhar', 'tech']
['demon', 'tech']
['demon', 'tech']
['demon', 'tech']
['farmers', 'tech']
['demon', 'gst']
['aadhar', 'gst']
['demon', 'tech']
['aadhar', 'tech']
['aadhar', 'demon']
['demon', 'gst']
['aadhar', 'demon']
['aadhar', 'tech']
['demon', 'tech']
['demon', 'farmers']
['farmers', 'tech']
['demon', 'gst']
['demon', 'farmers']
['demon', 'gst']
['aadhar', 'demon', 'gst', 'tech']
['demon', 'tech']
['farmers', 'tech']
['gst', 'tech']
['demon', 'tech']
['demon', 'tech']
['demon', 'farmers']
['aadhar', 'tech']
[

In [12]:
aadhar_lines = []
demon_lines = []
farmers_lines = []
gst_lines = []
tech_lines = []
with open(base + "refined-all-by-stmnts.txt",'r') as file_s, open(base + "out-all-by-stmnts.txt",'r') as file_o: 
    for x, y in zip(file_s, file_o):
        y = y.strip().split(" ")
        if len(y)>1:
            print(y)
        for policy in y:
            if policy=="aadhar":
                aadhar_lines.append(x)
            if policy=="farmers":
                farmers_lines.append(x)
            if policy=="demon":
                demon_lines.append(x)
            if policy=="tech":
                tech_lines.append(x)
            if policy=="gst":
                gst_lines.append(x)
aadhar_lines = list(set(aadhar_lines))
farmers_lines = list(set(farmers_lines))
demon_lines = list(set(demon_lines))
tech_lines = list(set(tech_lines))
gst_lines = list(set(gst_lines))
print(len(aadhar_lines), len(farmers_lines), len(demon_lines), len(tech_lines), len(gst_lines))

['farmers', '--', 'farmers']
['demon', 'tech', '--', 'cashless', 'economy', 'Cashless']
['tech', '--', 'Tech']
['farmers', '--', 'farmers']
['gst', '--', 'gst']
['aadhar', '--', 'Aadhar']
['gst', '--', 'gst']
['farmers', '--', 'farmers']
['gst', '--', 'gst']
['farmers', '--', 'farm', 'loan']
['aadhar', 'farmers', '--', 'UID', 'fertilizer']
['tech', '--', 'Cashless']
['farmers', '--', 'agrarian']
['farmers', '--', 'farm', 'loan']
['farmers', '--', 'loan', 'waiver']
['gst', '--', 'gst']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['demon', '--', 'demonetisation']
['tech', '--', 'Technology']
['gst', '--', 'gst']
['demon', '--', 'ATMs']
['aadhar', '--', 'Aadhaar']
['farmers', '--', 'farmer', 'suicides']
['tech', '--', 'Privacy']
['aadhar', '--', 'Unique', 'Identification']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['farmers', '--', 'farm', 'loan']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['demon', '--', 'demonetis

['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['farmers', '--', 'crop', 'loss']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['gst', '--', 'gst']
['tech', '--', 'Technical']
['gst', 'tech', '--', 'gst', 'Technology']
['farmers', '--', 'farmers']
['tech', '--', 'make', 'in', 'india']
['demon', '--', 'note', 'ban']
['aadhar', '--', 'Aadhaar']
['tech', '--', 'make', 'in', 'india']
['farmers', '--', 'farmers']
['aadhar', '--', 'Aadhaar']
['farmers', '--', 'farmers']
['demon', '--', 'denomination', 'note']
['farmers', '--', 'farmers']
['gst', '--', 'gst']
['demon', '--', 'black', 'money']
['farmers', '--', 'farmers']
['demon', '--', 'black', 'money']
['gst', '--', 'gst']
['demon', '--', 'demonetisation']
['aadhar', '--', 'Aadhaar']
['farmers', '--', 'farmers']
['demon', '--', 'black', 'money']
['gst', '--', 'gst']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['demon',

['farmers', '--', 'drought']
['farmers', '--', 'farmers']
['tech', '--', 'Technological']
['tech', '--', 'Software']
['demon', '--', 'demonetisation']
['tech', '--', 'Technology']
['farmers', '--', 'crop', 'loan']
['farmers', '--', 'farmers']
['farmers', '--', 'farmers']
['demon', '--', 'black', 'money']
['demon', '--', 'demonetisation']
['farmers', '--', 'farmers']
['demon', '--', 'black', 'money']
['gst', '--', 'gst']
['demon', '--', 'note-ban']
['farmers', '--', 'farmers']
['farmers', '--', 'loan', 'waiver']
['farmers', '--', 'farmers']
['farmers', '--', 'kisan']
['gst', '--', 'gst']
['farmers', '--', 'agrarian']
['farmers', '--', 'agricultural']
['gst', '--', 'gst']
['aadhar', '--', 'UIDAI']
['demon', '--', 'black', 'money']
['aadhar', '--', 'UIDAI']
['farmers', '--', 'farmers']
['aadhar', '--', 'UIDAI']
['demon', '--', 'Rs', '1,000', 'notes']
['tech', '--', 'Technology']
['demon', '--', 'demonetisation']
['gst', '--', 'gst']
['farmers', '--', 'farmers']
['farmers', '--', 'agricult

['gst', '--', 'gst']
['farmers', '--', 'farmers']
['tech', '--', 'Technology']
['farmers', '--', 'farmer']
['tech', '--', 'Technology']
['tech', '--', 'digital', 'literacy']
['gst', '--', 'gst']
['demon', '--', 'digital', 'payment']
['demon', '--', 'demonetisation']
['farmers', '--', 'farmers']
['farmers', '--', 'crop', 'loan']
['farmers', '--', 'farmers']
['gst', '--', 'gst']
['farmers', '--', 'farmers']
['demon', '--', 'black', 'money']
['demon', '--', 'ATMs']
['farmers', '--', 'farmers']
['tech', '--', 'Innovation']
['demon', '--', 'black', 'money']
['gst', '--', 'gst']
['farmers', '--', 'drought']
['farmers', '--', 'farmers']
['farmers', '--', 'agriculture']
['demon', '--', 'lower', 'denomination']
['aadhar', '--', 'Aadhar']
['demon', '--', 'demonetisation']
['gst', '--', 'gst']
['farmers', '--', 'farmers']
['gst', '--', 'excise', 'duty']
['tech', '--', 'Technology']
['demon', '--', 'demonetisation']
['demon', '--', 'demonetisation']
['tech', '--', 'Software']
['demon', '--', 'Rs',

In [110]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 
intersection(demon_lines, gst_lines)

['economic affairs secretary subhashchandragarg said demonetisation and gst reflect long-term vision of the government and its ability to undertake massive structural reforms.\n',
 'in remarks apparently aimed at former reserve bank governor raghuram rajan, who friday blamed the goods and services tax roll-out along with demonetisation for lower gdp growth, arunjaitley said the economy recovered fast from the limited disruptionist impact of the tax reform.\n',
 "raising his pet poll issues, gandhi said: prime minister narendra modi ruined the country's economy by implementing demonetisation and 'gabbar singh tax' (gst).\n",
 'notebandi (demonetisation), prohibition and goods and services tax (gst) were very courageous steps, but many times i feel that prohibition was a more difficult decision than demonetisation, deputy chief minister modi said at de-addiction day celebrations in patna in the presence of nitish and prohibition, excise and registration minister bijendra prasad yadav.\n'

In [13]:
file_a = open(base + 'aadhar_all.txt','w')
file_d = open(base + 'demon_all.txt','w')
file_f = open(base + 'farmers_all.txt','w')
file_g = open(base + 'gst_all.txt','w')
file_t = open(base + 'tech_all.txt','w')
for line in aadhar_lines:
    file_a.write(line)
for line in demon_lines:
    file_d.write(line)
for line in tech_lines:
    file_t.write(line)
for line in gst_lines:
    file_g.write(line)
for line in farmers_lines:
    file_f.write(line)
file_a.close()
file_d.close()
file_f.close()
file_g.close()
file_t.close()

### Relevance Check

#### Using pobj

In [101]:
idx = 5 # this indicates policy -- 0 -> aadhar, 1 -> demon, 2 -> farmers, 3 -> gst, 4 -> tech

In [102]:
nlp = en_core_web_sm.load()
nlp.add_pipe(merge_entities)

def is_relevant(sent, keywords):
    doc = nlp(sent)
    for chunk in (doc.noun_chunks):
        if chunk.root.dep_!='nsubj' and chunk.root.dep_!='nsubjpass':
            continue
        for word in keywords:
            if is_phrase_in(word, chunk.text):
                return True
    return False 

In [103]:
def process2(keywords):
    keywords_small = [x.lower() for x in keywords]
    keywords_big = [x.upper() for x in keywords]
    keywords_hypen = [x.replace(" ", "-") for x in keywords]
    total_keywords = list(set(keywords + keywords_hypen + keywords_small + keywords_big))
    return total_keywords

In [104]:
files = ['aadhar_all.txt', 'demon_all.txt', 'farmers_all.txt', 'gst_all.txt','tech_all.txt']
f = open(base + files[idx], 'r+')
dicti = {files[0]:keywords_aadhar, 
         files[1]:keywords_demon, 
         files[2]:keywords_farmers, 
         files[3]:keywords_gst,
         files[4]:keywords_tech}
print(files[idx])
rel_count = 0
total_count = 0
total_keywords = process2(dicti[files[idx]])
print(total_keywords)

IndexError: list index out of range

In [100]:
f2 = open(base + 'pobj_relevant_' + files[idx], 'w+')
for line in f:
    total_count += 1
    if(is_relevant_by_pobj(line.strip(), total_keywords)):
        rel_count += 1
        f2.write(line)
f2.close()
print(rel_count, total_count)

119 779


#### Using Supervised Models

In [34]:
idx = 3 # this indicates policy -- 0 -> aadhar, 1 -> demon, 2 -> farmers, 3 -> gst, 4 -> tech

In [35]:
def read_into_string(filename):
    text_file = open(filename, 'r')
    lines = []
    for line in text_file:
        sentence = line.strip().lower()
        if(len(sentence) > 0):
            lines.append(line)
    text_file.close()
    return lines
print(files[idx])
dataset = read_into_string(base + files[idx])

gst_all.txt


In [36]:
def supervised_results(X_test, text_clf):
    predicted = text_clf.predict(X_test)
    f = open(base + 'supervised_relevant_' + files[idx], 'w+')
    for sentence, prediction in zip(X_test, predicted):
        if(prediction == 1):
            f.write(sentence)
    f.close()
    print('relevant statements: {}, Total statements: {}'.format(sum(predicted), len(X_test))) # relevant-1, non-relevant-0

In [37]:
supervised_results(dataset, pkl.load(open(base + files[idx][:-8] + '-model.pkl', 'rb')))

relevant statements: 381, Total statements: 620
