### Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import time
from random import randint
from lxml import html
import pandas as pd
from datetime import date
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

### Utility Classes, Functions & Essential Variables

In [2]:
# %load links/utils.py
QUERY_DICT = {'Organization Founded By^-1':["""SELECT ?item ?itemLabel WHERE {
                                          ?item wdt:P112 wd:%s.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }"""
                                           ],
              'Organization Founded By':["""SELECT ?item ?itemLabel WHERE {
                                          wd:%s wdt:P112 ?item.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }"""
                                        ],
              'Organization Headquarters':["""SELECT ?item ?itemLabel WHERE {
                                          wd:%s wdt:P159 ?item.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }"""
                                          ],
              'Organization Subsidiary Of^-1':["""SELECT ?item ?itemLabel WHERE {
                                          wd:%s wdt:P355 ?item.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }"""
                                              ],
              'Organization Subsidiary Of':["""SELECT ?item ?itemLabel WHERE {
                                          ?item wdt:P355 wd:%s.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }"""
                                           ],
              'Organization top employees':["""SELECT ?item ?itemLabel WHERE {
                                          wd:%s wdt:P169 ?item.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }""", # CEO
                                            """SELECT ?item ?itemLabel WHERE {
                                          wd:%s wdt:P488 ?item.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }""" # Chairperson
                                            ],
              'Person Employee or Member of^-1':["""SELECT ?item ?itemLabel WHERE {
                                          ?item wdt:P108 wd:%s.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }""",
                                            """SELECT ?item ?itemLabel WHERE {
                                          wd:%s wdt:P527 ?item.
                                          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                        }""" 
                                                ],
              'Person Employee or Member of':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P108 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }""",
                                              """SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P463 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""## member of ---> Band Members
                                            ],
              'Person Place of Birth':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P19 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                                      ],
              'Person Current and Past Location of Residence':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P551 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                                                              ],
              'Person Parents':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P22 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }""", #Father
                                """SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P25 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }""", #Mother
                                """SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P1038 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }""" #Relative (Adopted Parents?)
                                # Shall we include stepparents??
                               ],
              'Person Parents^-1':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P40 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                                  ],
              'Person Siblings':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P3373 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                                ],
              'Person Spouse':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P26 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                              ],
              'Citizen of':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P27 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                           ],
              'Educated at':["""SELECT ?item ?itemLabel WHERE {
                                              wd:%s wdt:P69 ?item.
                                              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                                            }"""
                            ]
             }




from SPARQLWrapper import SPARQLWrapper, JSON   
from rosette.api import API, DocumentParameters, RosetteException
import pandas as pd
import wikipedia
import requests
import numpy as np
import pickle
import random
from threading import Lock
import os, sys
import threading
from threading import Thread
import time
import queue

class Utils:
    
    def __init__(self):
        self.id_dict = {}
        self.lock = Lock()
        self.load_dict()
        
    
    def __del__(self):
        self.save_dict()

    def get_id(self, message, dict_to_use=None):
#         if dict_to_use:
#             dict_to_use = dict_to_use
#         else:
#             global id_dict
#             dict_to_use = id_dict
    
        if message in self.id_dict:
            return self.id_dict[message]
        else:
            API_ENDPOINT = "https://www.wikidata.org/w/api.php"
            query = message
            params = {
                'action': 'wbsearchentities',
                'format': 'json',
                'language': 'en',
                'search': query
            }
            r = requests.get(API_ENDPOINT, params = params)
            try:
                with self.lock:
                    self.id_dict[message] = r.json()['search'][0]['id']
                return self.id_dict[message]
            except Exception:
                return -1 #The id doesn't exist


    def id_to_name(self, eid):
#         if dict_to_use:
#             dict_to_use = dict_to_use
#         else:
#             global id_dict
#             dict_to_use = id_dict

        if eid in self.id_dict.values():
            return [key for key, value in self.id_dict.items() if value == eid][0]
        else:
            API_ENDPOINT = "https://www.wikidata.org/w/api.php"
            query = eid
            params = {
                'action': 'wbsearchentities',
                'format': 'json',
                'language': 'en',
                'search': query
            }
            r = requests.get(API_ENDPOINT, params = params)
            try:
                with self.lock:
                    self.id_dict[ r.json()['search'][0]['label'] ] = r.json()['search'][0]['id']
                return r.json()['search'][0]['label']
            except Exception:
                return -1 #The id doesn't exist


    def get_results(self, query, value, endpoint_url="https://query.wikidata.org/sparql"):
        sparql = SPARQLWrapper(endpoint_url)
        sparql.setQuery(query%value)
        sparql.setReturnFormat(JSON)
        return sparql.query().convert()


    def ground_truth(self, relation, subject, debug=False):
        global QUERY_DICT
        results = []
        gt = []
        try:
            results = [self.get_results(query, self.get_id(subject)) for query in QUERY_DICT[relation]]
            for result in results:
                for r in result["results"]["bindings"]:
                    gt.append(r['itemLabel']['value'])
        except:
            if debug:
                print (relation, subject)
        return gt

    def add_ground_truth(self, df, debug=False):
        if df.empty:
            return df
        if debug:
            print (df)
        df = df.reset_index()
        df['Pseudo Ground Truth'] = df.apply(lambda row: self.ground_truth(row['Relationship'], row['Subject']), axis=1)
        df['Count_PGT'] = df['Pseudo Ground Truth'].apply(lambda x: len(x))
        df = df.set_index(['Subject','Relationship'])
        return df

    def add_recall_score(self, df):
        df['Recall Prediction'] = np.random.randint(0, 100, df.shape[0])/100
        return df


    def load_dict(self):
        try:
            with open('data/dumps/id_dict.pkl', 'rb') as fp:
                self.id_dict = pickle.load(fp)
        except:
            print ("Creating a new Dictionary")
            self.id_dict = {}


    def save_dict(self):
        with self.lock:
            old_dict = self.get_dict()
            self.id_dict = {**self.id_dict, **old_dict}
            with open('data/dumps/id_dict.pkl', 'wb') as fp:
                pickle.dump(self.id_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
                print("Saved")


    def get_dict(self):
        di = {}
        with open('data/dumps/id_dict.pkl', 'rb') as fp:
            di = pickle.load(fp)
        return di


    def Analyse(self, message, doc=None, lock=None, alt_url='https://api.rosette.com/rest/v1/'):
        """ Run the example """
        # Create an API instance
        api = API(user_key="969b3593686184bb42803d8da453f119", service_url=alt_url)

        # Set selected API options.
        # For more information on the functionality of these
        # and other available options, see Rosette Features & Functions
        # https://developer.rosette.com/features-and-functions#morphological-analysis-introduction

        # api.set_option('modelType','perceptron') #Valid for Chinese and Japanese only

        # Opening the ID Dictionary
#         load_dict()
        ### Will Close after Analysis of the document is completed
    
        if lock == None:
            lock = Lock()

        params = DocumentParameters()
        if doc:
            relationships_text_data = doc[:20000]
        else:
            relationships_text_data = wikipedia.page(message).content[:20000]
        params["content"] = relationships_text_data
        rel = []
        message_id = self.get_id(message)
        message_split = message.split(" ")
        try:
            with lock:
                RESULT = api.relationships(params)
            
            for r in RESULT['relationships']:
                arg2_split = r['arg2'].split(" ")
                confidence = '?'
                if "confidence" in r:
                    confidence = str(round(r["confidence"],2))
                if any(s in arg2_split for s in message_split):
                    if self.get_id(r['arg2']) == message_id:
                        rel.append({'Relationship':r['predicate']+'^-1', 'Subject':r['arg2'], 'Object':r['arg1'], 'Confidence': confidence})
                rel.append({'Relationship':r['predicate'],'Subject':r['arg1'],'Object':r['arg2'], 'Confidence': confidence})

            ## Closing the ID Dict
            self.save_dict()
            ##
            return rel, message_id
        except RosetteException as exception:
            print(exception)


class HeatMaps(Thread):
    def __init__(self, lock, relation='Educated at', eid=None, name=None, rel_dict={}):
        Thread.__init__(self)
        self.q1 = queue.Queue()
        self.q2 = queue.Queue()
        self.u = Utils()
        self.lock = lock
        self.rel_dict = rel_dict
        self.eid = eid
        self.message = name
        self.error = None
        self.relation = relation
        self.inverse = True if "^-1" in relation else False
        if name:
            self.eid = self.u.get_id(name)
        else:
            self.message = str(self.u.id_to_name(eid))
        self.start()
        
        
    def run(self):
        if self.eid not in self.rel_dict:
            a = Thread(target = self.Analyse, args = ())
            b = Thread(target = self.ground_truth, args = ())
            a.start()
            b.start()
            a.join()
            b.join()
        self.matrix_block()


    def Analyse(self):
        """ Run the example """
        # Create an API instance
        api = API(user_key="969b3593686184bb42803d8da453f119", service_url='https://api.rosette.com/rest/v1/')
#         u = Utils()
        params = DocumentParameters()
        relationships_text_data = []
        
        while True:
            try:
                relationships_text_data = wikipedia.page(self.message).content[:20000]
                break
            except wikipedia.DisambiguationError as e:
                print(self.eid, self.message)
                nameclash = True
                for n in e.options:
                    if self.u.get_id(n) == self.eid:
                        if n == self.message:
                            pass
                        else:
                            self.message = n
                            nameclash = False
                            break
                if nameclash:
                    self.message = " "
            except wikipedia.exceptions.PageError as e:
                self.error = self.u.id_to_name(self.eid) + " " + str(e)
                print (self.error)
                break
            
        
        try:
            params["content"] = relationships_text_data
            rel = []
            message_id = self.u.get_id(self.message)
            message_split = self.message.split(" ")
            pred_list = []
            RESULT = []
            with self.lock:
                RESULT = api.relationships(params)
            
            args = ['arg1','arg2']
            arg_to_split = 'arg2' if self.inverse else 'arg1'
            args.remove(arg_to_split)
            other_arg = args[0]
            rel_to_compare = self.relation.split("^-1")[0]
                
            for r in RESULT['relationships']:
                if r['predicate'] == rel_to_compare:
                    arg_split = r[arg_to_split].split(" ") # Subject Split 
                    if any(s in arg_split for s in message_split): # Searching for alias names
                        if self.u.get_id(r[arg_to_split]) == message_id:
                            pred_list.append(r[other_arg])
                            
            self.q1.put(set(pred_list))
        except RosetteException as exception:
            print(exception)
            self.error = exception
            self.q1.put(set(pred_list))
        except Exception as e:
            print(e, self.message)
            self.error = e
            self.q1.put(set(pred_list))


    def ground_truth(self):
#         u = Utils()
        
        pgt = set(self.u.ground_truth(self.relation, self.message))
        self.q2.put(pgt)
    
    
    def matrix_block(self):
        if self.eid in self.rel_dict:
            self.pgt = self.rel_dict[self.eid]['PGT']
            self.extracted = self.rel_dict[self.eid]['Extracted']
            self.contained = self.rel_dict[self.eid]['Contained']
        else:
            q1 = self.q1.get() # Extracted from API
            q2 = self.q2.get() # PGT
            #print(self.message, q1)
            #print(self.message, q2)
            self.pgt = len(q2)
            self.extracted = len(q1)
            q1 = [self.u.get_id(i) for i in q1]
            q2 = [self.u.get_id(i) for i in q2]
            #print(self.message, q1)
            #print(self.message, q2)
            count = 0
            for i in q1:
                if i in q2:
                    count += 1
            self.contained = count

    def get_values(self):
        if self.error:
            raise Exception(self.error)
        return [self.eid, self.message, self.extracted, self.contained, self.pgt]
    
    
    
    
class Distribution(Thread):
    def __init__(self, eid=None, name=None, lock=None, rel_dict={}):
        Thread.__init__(self)
        self.doc_len = None
        self.u = Utils()
        self.eid = eid
        self.message = name
        self.error = None
        if name:
            self.eid = self.u.get_id(name)
        else:
            self.message = self.u.id_to_name(eid)
        if eid in rel_dict:
            self.doc_len = rel_dict[eid]['Doc_Length']
            return
        self.start()
    
    def run(self):
        while True:
            try:
                document = wikipedia.page(self.message).content
                self.doc_len = len(document)
                break
            except wikipedia.DisambiguationError as e:
                print(self.eid, self.message)
                nameclash = True
                for n in e.options:
                    if self.u.get_id(n) == self.eid:
                        if n == self.message:
                            pass
                        else:
                            self.message = n
                            nameclash = False
                            break
                if nameclash:
                    self.message = " "
            except wikipedia.exceptions.PageError as e:
                self.error = self.u.id_to_name(self.eid) + " " + str(e)
                print (self.error)
                break
    
    def get_values(self):
        if self.error:
            raise Exception(self.error)
        return [self.eid, self.message, self.doc_len]
    
    
class MissingExtractions(Thread):
    def __init__(self, eid=None, name=None, relation=None, rel_dict={}):
        Thread.__init__(self)
        self.missing = None
        self.u = Utils()
        self.relation = relation
        self.eid = eid
        self.message = name
        self.error = None
        if not eid:
            self.eid = self.u.get_id(name)
        if not name:
            self.message = self.u.id_to_name(eid)
        if eid in rel_dict:
            self.missing = rel_dict[eid]['Missing']
            return
        self.start()
    
    def run(self):
        while True:
            try:
                document = wikipedia.page(self.message).content
                pgt = set(self.u.ground_truth(self.relation, self.message))
                count = 0
                for item in pgt:
                    if document.find(item) == -1:
                        count += 1
                self.missing = count
                break
            except wikipedia.DisambiguationError as e:
                print(self.eid, self.message)
                nameclash = True
                for n in e.options:
                    if self.u.get_id(n) == self.eid:
                        if n == self.message:
                            pass
                        else:
                            self.message = n
                            nameclash = False
                            break
                if nameclash:
                    self.message = " "
            except wikipedia.exceptions.PageError as e:
                self.error = self.u.id_to_name(self.eid) + " " + str(e)
                print (self.error)
                break
    
    def get_values(self):
        if self.error:
            raise Exception(self.error)
        return [self.eid, self.message, self.missing]
    

class News(Thread):
    def __init__(self, lock=None, link=None, name=None, shared_df=None):
        Thread.__init__(self)
        self.df = pd.DataFrame()
        self.main_df = pd.DataFrame()
        self.u = Utils()
        self.lock = lock
        self.link = link
        self.eid = self.u.get_id(name)
        self.doc = ""
        self.message = name
        self.start()
        
        
    def run(self):
        a = Thread(target = self.get_link_text, args = ())
        a.start()
        a.join()

        
    def get_link_text(self):
        r = requests.get(self.link)
        content = r.text
        doc_summary = []
        soup = BeautifulSoup(content, "html.parser")
        paras = soup.findAll("p")
        for p in paras:
            doc_summary.append(p.text)
        s = " ".join(doc_summary)
        s = " ".join(s.split())
        self.doc = s
        ## Analysing the link text
        res,_ = self.u.Analyse(message=self.message, doc=self.doc, lock=self.lock)
        self.df = pd.DataFrame(res, columns=['Subject','Relationship','Object','Confidence'])
        self.main_df = self.df[self.df['Subject'].apply(lambda row: self.u.get_id(row)) == self.eid]
        self.main_df['Subject'] = self.message
    
    def get_df(self):
        return self.df
    
    def get_main_df(self):
        return self.main_df

In [3]:
query = "Ivanka"
N = 100
u = Utils()

### Extracting All Headlines

In [4]:
def make_headline_dict(query):
    ### Getting Custom Date Range ###
    try:
        d1,m1,y1 = input("Enter Start Date (dd/mm/yyyy) :").split('/')
    except:
        y1,m1,d1 = ('2014','01','01')
    try:
        d2,m2,y2 = input("Enter End Date (dd/mm/yyyy) :").split('/')
    except:
        y2,m2,d2 = str(date.today()).split('-')

    ### Setting up API ###
    headline_list = []
    headline_dict = {}

    options = Options()
    options.binary_location = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
    options.add_argument('--headless')
    options.add_argument('--window-size=1200x600')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')

    chromedriver = 'C:\\Users\\Bhavya\\Desktop\\Vaibhav\\chromedriver.exe'
    r = webdriver.Chrome(executable_path=chromedriver, options=options)

    ### Scraping From Google ###
    final_date = date(int(y2),int(m2),int(d2))
    while True:
        dd = int(d1)
        mm = int(m1) 
        yy = int(y1) + mm//12
        mm = mm % 12 + 1    

        temp_date = date(yy,mm,dd)   
        if temp_date > final_date:
            break  

        r.get("https://www.google.com/search?q={}&hl=en-US&gl=US&source=lnt&tbs=cdr%3A1%2Ccd_min%3A{}%2F{}%2F{}%2Ccd_max%3A{}%2F{}%2F{}&tbm=nws".format(\
                            query,m1,d1,y1,mm,dd,yy))
        soup = BeautifulSoup(r.page_source, 'lxml')
        all_headlines = soup.findAll("h3")
        for headline in all_headlines:
            headline_list.append(headline.text)
            headline_dict[headline.text] = headline.a['href']        
        d1,m1,y1 = dd,mm,yy
    
    ### Saving the file ###
    save_headline_dict(headline_dict)

In [19]:
def save_headline_dict(headline_dict):
    path = 'data/dumps/{}_headline_dict.pkl'.format(query)
    with open(path, 'wb') as fp:
        pickle.dump(headline_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print("headline_dict saved!")
        
def load_headline_dict(query):
    path = 'data/dumps/{}_headline_dict.pkl'.format(query)
    headline_dict = {}
    try:
        with open(path, 'rb') as fp:
            headline_dict = pickle.load(fp)
    except:
        print ("Creating a new Dictionary")
        headline_dict = {}
    return headline_dict

In [18]:
#make_headline_dict(query)

Enter Start Date (dd/mm/yyyy) :
Enter End Date (dd/mm/yyyy) :
headline_dict saved!


In [20]:
headline_dict = load_headline_dict(query)

In [21]:
top100 = dict(random.sample(headline_dict.items(), 100))
top100

{"Ivanka Trump's Opening a Store in Trump Tower": 'https://www.racked.com/2017/8/10/16126236/ivanka-trump-store-nyc',
 'Ivanka Should Quit': 'https://www.politico.com/magazine/story/2018/02/26/ivanka-should-quit-217089',
 "Ivanka Trump on the Hot Seat: Can Her Brand Survive Donald's ...": 'https://www.thewrap.com/ivanka-trump-on-the-hot-seat-can-her-brand-survive-donalds-outrageous-brand/',
 "'I'm so proud of her!' Ivanka Trump poses atop a mountain in Aspen ...": 'https://www.dailymail.co.uk/femail/article-2998904/Ivanka-Trump-poses-atop-mountain-Aspen-niece-Kai-marveling-seven-year-old-s-skiing-skills.html',
 "Was Ivanka Trump 'Complicit' in Halting Equal Pay Rule? Watchdog ...": 'https://www.newsweek.com/ivanka-trump-complicit-equal-pay-879571',
 'Millionaire Ivanka Trump says following these 6 negotiation rules can ...': 'https://www.businessinsider.com/ivanka-trumps-shares-top-negotiating-rules-2016-2',
 'Ivanka Trump Faces Backlash After Posting Photo of Her Toddler Son': 'https:

In [28]:
top100.keys()

TypeError: 'dict_keys' object does not support indexing

In [None]:
class News(Thread):
    def __init__(self, lock=None, link=None, name=None, shared_df=None):
        Thread.__init__(self)
        self.df = pd.DataFrame()
        self.main_df = pd.DataFrame()
        self.u = Utils()
        self.lock = lock
        self.link = link
        self.eid = self.u.get_id(name)
        self.doc = ""
        self.message = name
        self.start()
        
        
    def run(self):
        a = Thread(target = self.get_link_text, args = ())
        a.start()
        a.join()

        
    def get_link_text(self):
        r = requests.get(self.link)
        content = r.text
        doc_summary = []
        soup = BeautifulSoup(content, "html.parser")
        paras = soup.findAll("p")
        for p in paras:
            doc_summary.append(p.text)
        s = " ".join(doc_summary)
        s = " ".join(s.split())
        self.doc = s
        ## Analysing the link text
        res,_ = self.u.Analyse(message=self.message, doc=self.doc, lock=self.lock)
        self.df = pd.DataFrame(res, columns=['Subject','Relationship','Object','Confidence'])
        self.main_df = self.df[self.df['Subject'].apply(lambda row: self.u.get_id(row)) == self.eid]
        self.main_df['Subject'] = self.message
    
    def get_df(self):
        return self.df
    
    def get_main_df(self):
        return self.main_df

In [45]:
def get_news(link):
    ### Setting up API ###
    options = Options()
    options.binary_location = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
    options.add_argument('--headless')
    options.add_argument('--window-size=1200x600')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')

    chromedriver = 'C:\\Users\\Bhavya\\Desktop\\Vaibhav\\chromedriver.exe'
    r = webdriver.Chrome(executable_path=chromedriver, options=options)

    ### Scraping From URL ###
    r.get(link)
    doc_summary = []
    soup = BeautifulSoup(r.page_source, 'lxml')
    all_paragraphs = soup.findAll("p")
    for paragraph in all_paragraphs:
        doc_summary.append(paragraph.text)
    doc_summary = " ".join(doc_summary)
    doc_summary = " ".join(doc_summary.split())
    return doc_summary

In [46]:
get_news(link)

  self.parser.feed(markup)


"By Daily Mail Reporter Published: 22:52 GMT, 5 January 2014 | Updated: 10:03 GMT, 6 January 2014 2 View comments She's been staying at her father's Mar-a-lago Club in Palm Beach for more than a week. So it's no surprise she'd want to make the most of her last day at the beautiful Florida resort. Ivanka Trump hob nobbed with guests at the second annual Trump Invitational Grand Prix on Sunday. New mother: Ivanka Trump shared an image on her Instagram of herself, her daughter Arabella and Georgina Bloomberg at the second annual Trump Invitational in Florida The 32-year-old businesswoman and former model posted a snap of herself and Georgina Bloomberg, New York City Mayor Michael Bloomberg's daughter, at the equestrian event. 'With beautiful new mommy @GeorginaBloomberg at the Trump Invitational,' she captioned the pic. Ivanka, whose two-year-old daughter Arabella also appeared in the image, looked beautiful in a blue and red heart-printed halter dress and nude coloured sandals. Adorable 

### Others

In [183]:
link = list(headline_link.values())[1]
link

'https://www.dailymail.co.uk/tvshowbiz/article-2534295/Ivanka-Trump-poses-day-Florida-vacation-new-mother-Georgina-Bloomberg-Donalds-Trump-Invitational.html'

In [43]:
r = requests.get(link)
content = r.text
doc_summary = []
soup = BeautifulSoup(content, "html.parser")
paras = soup.findAll("p")
for p in paras:
    doc_summary.append(p.text)
s = " ".join(doc_summary)
s = " ".join(s.split())

  self._sock = None


In [44]:
s

"By Daily Mail Reporter Published: 22:52 GMT, 5 January 2014 | Updated: 10:03 GMT, 6 January 2014 2 View comments She's been staying at her father's Mar-a-lago Club in Palm Beach for more than a week. So it's no surprise she'd want to make the most of her last day at the beautiful Florida resort. Ivanka Trump hob nobbed with guests at the second annual Trump Invitational Grand Prix on Sunday. New mother: Ivanka Trump shared an image on her Instagram of herself, her daughter Arabella and Georgina Bloomberg at the second annual Trump Invitational in Florida The 32-year-old businesswoman and former model posted a snap of herself and Georgina Bloomberg, New York City Mayor Michael Bloomberg's daughter, at the equestrian event. 'With beautiful new mommy @GeorginaBloomberg at the Trump Invitational,' she captioned the pic. Ivanka, whose two-year-old daughter Arabella also appeared in the image, looked beautiful in a blue and red heart-printed halter dress and nude coloured sandals. Adorable 

In [None]:
u = Utils()
class News(Thread):
    def __init__(self, lock=None, link=None, name=None, shared_df=None):
        Thread.__init__(self)
        self.df = pd.DataFrame()
        self.main_df = pd.DataFrame()
        self.u = Utils()
        self.lock = lock
        self.link = link
        self.eid = self.u.get_id(name)
        self.doc = ""
        self.message = name
        self.start()
        
        
    def run(self):
        a = Thread(target = self.get_link_text, args = ())
        a.start()
        a.join()

        
    def get_link_text(self):
        r = requests.get(self.link)
        content = r.text
        doc_summary = []
        soup = BeautifulSoup(content, "html.parser")
        paras = soup.findAll("p")
        for p in paras:
            doc_summary.append(p.text)
        s = " ".join(doc_summary)
        s = " ".join(s.split())
        self.doc = s
        ## Analysing the link text
        res,_ = self.u.Analyse(message=self.message, doc=self.doc, lock=self.lock)
        self.df = pd.DataFrame(res, columns=['Subject','Relationship','Object','Confidence'])
        self.main_df = self.df[self.df['Subject'].apply(lambda row: self.u.get_id(row)) == self.eid]
        self.main_df['Subject'] = self.message
    
    def get_df(self):
        return self.df
    
    def get_main_df(self):
        return self.main_df

# Preprocess

### Exploring TopN Headlines

In [15]:
links = list(headline_link.values())[:N]
lock = Lock()

### Creating threads for each link
threads = []
for link in links:
    threads.append( News(link=link, name=query, lock=lock) )

### Waiting for each thread to complete
for t in threads:
    try:
        t.join()
    except Exception as e:
        pass

### Merging the dfs
DFs = []
for i,t in enumerate(threads):
    DFs.append(t.get_main_df())

MERGED_DF = pd.concat(DFs, ignore_index=True)

Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved


  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None


Saved


  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None


Saved


  self._sock = None


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None


Creating a new Dictionary
Saved


  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None


Creating a new Dictionary
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._sock = None
  self._sock = None


Saved
Saved


  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None
  self._sock = None


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved


  result = method(y)


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


  result = method(y)


Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved


  result = method(y)


Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Saved
Saved
Saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
dfs_copy = DFs.copy()

def retrieve():
    global dfs_copy
    return dfs_copy

# DFs = retrieve()

### Adding Dummy Relations

In [17]:
PERSON_RELATIONS = ['Educated at', 'Citizen of', 'Person Employee or Member of', 'Organization top employees^-1',\
                    'Person Current and Past Location of Residence', 'Person Parents', 'Person Parents^-1',\
                    'Person Place of Birth', 'Person Siblings', 'Person Spouse']
ORG_RELATION =     ['Organization Founded By', 'Organization Collaboration', 'Organization Collaboration^-1',\
                    'Organization Headquarters', 'Organization Subsidiary Of', 'Organization Subsidiary Of^-1',\
                    'Organization top employees', 'Person Employee or Member of^-1', 'Organization Acquired By^-1',\
                    'Organization Acquired By', 'Organization Provider To', 'Organization Provider To^-1']
COMMON_RELATION =  ['Organization Founded By^-1']

isPerson = False

def add_dummy(df, person=False):
    global isPerson
    rel = set(df['Relationship'])
    if person:
        isPerson = person
    else:
        isPerson = True if any(s in PERSON_RELATIONS for s in rel) else False
    if isPerson:
        dummy_rels = COMMON_RELATION + PERSON_RELATIONS
    else:
        dummy_rels = COMMON_RELATION + ORG_RELATION
    for r in rel:
        dummy_rels.remove(r)
    for r in dummy_rels:
        df = df.append({'Subject': query, 'Relationship':r, 'Object':''}, ignore_index=True)
    return df

def count_confidence(main_df):
    if (not main_df.empty):
        main_df = main_df.sort_values('Object', ascending=True).drop_duplicates().groupby(['Subject','Relationship']).agg(lambda x: list(x))
        main_df['Object'] = main_df['Object'].agg(lambda x: x if x != [''] else [])
        main_df['Count'] = main_df['Object'].apply(lambda x: len(x))
        #main_df = main_df[[c for c in main_df if c not in ['Confidence']] + ['Confidence']]
    return main_df

In [18]:
main_df = MERGED_DF.drop('Confidence',axis=1)
main_df = main_df.groupby(['Subject','Relationship','Object']).size().to_frame('c').reset_index()
main_df['Object'] = main_df.apply(lambda main_df: main_df['Object']+':'+str(main_df['c']), axis=1) 
main_df = main_df.drop('c',axis=1)
main_df = add_dummy(main_df)

main_df = count_confidence(main_df)

###### ADDING GROUND TRUTH ######
main_df = u.add_ground_truth(main_df)

###### ADDING RECALL SCORE ######
main_df = u.add_recall_score(main_df)

main_df.to_pickle("data/dumps/web_reality-{}-{}.pkl".format(N,query))
main_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Object,Count,Pseudo Ground Truth,Count_PGT,Recall Prediction
Subject,Relationship,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Donald Trump,Citizen of,"[American:2, Americans:1, Chinese:1, Czech:3, German village:3, Hanoi:1, Kallstadt:3, New Zealand:1, Puerto Rico:1, Russian:2, United States:2, Yemen:1]",12,[United States of America],1,0.81
Donald Trump,Educated at,"[Congress:2, Federal Aviation Administration:1, Foundations of American Jewish Liberalism:1, Office of Management and Budget:1, Prudential:2, Trump International Golf Course:1, University of Florida:1, Yale Law School:1]",8,"[University of Pennsylvania, Fordham University, The Wharton School, New York Military Academy, The Kew-Forest School]",5,0.71
Donald Trump,Organization Founded By^-1,"[Democrats:1, National Rifle Association:1]",2,"[Trump Model Management, Trump Entertainment Resorts, The Trump Entrepreneur Initiative, Trump Mortgage, Donald J. Trump Foundation, Trump Home, American Manufacturing Council]",7,0.62
Donald Trump,Organization top employees^-1,"[AP:1, Boeing:1, Department of Veterans Affairs:1, USA Today:1, Yale’s China Center:1]",5,[],0,0.74
Donald Trump,Person Current and Past Location of Residence,"[Jerusalem:2, United States:2, Washington, D.C.:1]",3,[White House],1,0.33
Donald Trump,Person Employee or Member of,"[AP:1, Boeing:1, CNN:1, Democratic:1, Democrats:1, Department of Veterans Affairs:1, Deutsche Bank and Investors Bank:1, Gallup poll:1, Justice Department:3, Marble Collegiate Church:3, Obama administration:1, Oval Office:1, Republican:5, Republicans:6, Senate:1, Twitter:1, USA Today:1, White House:2, Yale Law School:1, Yale’s China Center:1]",20,[The World's Billionaires],1,0.62
Donald Trump,Person Parents,[],0,"[Fred Trump, Mary Anne MacLeod, John G. Trump, Jared Kushner, Frederick Trump, Elizabeth Trump]",6,0.45
Donald Trump,Person Parents^-1,"[Jared Kushner:1, Marla Maples:3]",2,"[Ivanka Trump, Donald Trump Jr., Eric Trump, Tiffany Trump, Barron Trump]",5,0.85
Donald Trump,Person Place of Birth,"[Jamaica Hospital:3, New York City:3, Queens:3]",3,[Jamaica Hospital],1,0.73
Donald Trump,Person Siblings,[],0,"[Maryanne Trump Barry, Robert Trump, Fred Trump Jr., Elizabeth Trump Grau]",4,0.42


# Menu-Driven

### Web & Reality

In [19]:
main_df = pd.read_pickle("data/dumps/web_reality-{}-{}.pkl".format(N,query))
web = main_df[['Object','Count']]
web = web.rename(index=str, columns={"Object": "Web", "Count": "Web_Count"})
reality = main_df[['Pseudo Ground Truth', 'Count_PGT']]

### Doc Comparison

In [21]:
## Displaying News Headines
print('*******  MAIN-MENU  *******')
for i,t in enumerate(list(headline_link.keys())[:N]):
    print(i,": ",t)
ch = input("Choose an option ")

lock = Lock()
isPerson = True if ('Citizen of' in list(web.reset_index()['Relationship'])) else False
link = list(headline_link.values())[int(ch)]
doc = News(link=link, name=query, lock=lock)
doc.join()
df = doc.get_main_df().drop('Confidence',axis=1)
message_id = u.get_id(query)
df = add_dummy(df, person=isPerson)
df = count_confidence(df)
df = df.join(web)
df = df.join(reality)
pd.set_option('display.max_colwidth', -1)
df

*******  MAIN-MENU  *******
0 :  Donald Trump, White House condemn New Zealand mosque shooting in Christchurch
1 :  Trump denounces 'horrible massacre' in New Zealand
2 :  Trump tweets 'warmest sympathy and best wishes' to New Zealand
3 :  Trump offers condolences to New Zealand following mosque terror attacks
4 :  Donald Trump offers US assistance after 'horrible massacre' in New Zealand
5 :  Donald Trump stirs controversy with Breitbart interview about his 'tough' supporters
6 :  Trump says 'there should be no Mueller report'
7 :  16 minutes that explain the Trump presidency
8 :  Trump applauds 'Jexodus' movement, calls on Jewish voters to become Republicans
9 :  House panel opens sweeping probe of Trump, his associates
10 :  What GOP split on border emergency means for Donald Trump and 2020
11 :  Is America Becoming Trump’s Banana Republic?
12 :  Biden will be a bust, Trump aides assure their boss
13 :  How Donald Trump is making illegal immigration worse
14 :  Trump said he's 'surp

Unnamed: 0_level_0,Unnamed: 1_level_0,Object,Count,Web,Web_Count,Pseudo Ground Truth,Count_PGT
Subject,Relationship,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Donald Trump,Citizen of,[],0,"[American:2, Americans:1, Chinese:1, Czech:3, German village:3, Hanoi:1, Kallstadt:3, New Zealand:1, Puerto Rico:1, Russian:2, United States:2, Yemen:1]",12,[United States of America],1
Donald Trump,Educated at,[],0,"[Congress:2, Federal Aviation Administration:1, Foundations of American Jewish Liberalism:1, Office of Management and Budget:1, Prudential:2, Trump International Golf Course:1, University of Florida:1, Yale Law School:1]",8,"[University of Pennsylvania, Fordham University, The Wharton School, New York Military Academy, The Kew-Forest School]",5
Donald Trump,Organization Founded By^-1,[],0,"[Democrats:1, National Rifle Association:1]",2,"[Trump Model Management, Trump Entertainment Resorts, The Trump Entrepreneur Initiative, Trump Mortgage, Donald J. Trump Foundation, Trump Home, American Manufacturing Council]",7
Donald Trump,Organization top employees^-1,[],0,"[AP:1, Boeing:1, Department of Veterans Affairs:1, USA Today:1, Yale’s China Center:1]",5,[],0
Donald Trump,Person Current and Past Location of Residence,[],0,"[Jerusalem:2, United States:2, Washington, D.C.:1]",3,[White House],1
Donald Trump,Person Employee or Member of,[],0,"[AP:1, Boeing:1, CNN:1, Democratic:1, Democrats:1, Department of Veterans Affairs:1, Deutsche Bank and Investors Bank:1, Gallup poll:1, Justice Department:3, Marble Collegiate Church:3, Obama administration:1, Oval Office:1, Republican:5, Republicans:6, Senate:1, Twitter:1, USA Today:1, White House:2, Yale Law School:1, Yale’s China Center:1]",20,[The World's Billionaires],1
Donald Trump,Person Parents,[],0,[],0,"[Fred Trump, Mary Anne MacLeod, John G. Trump, Jared Kushner, Frederick Trump, Elizabeth Trump]",6
Donald Trump,Person Parents^-1,[],0,"[Jared Kushner:1, Marla Maples:3]",2,"[Ivanka Trump, Donald Trump Jr., Eric Trump, Tiffany Trump, Barron Trump]",5
Donald Trump,Person Place of Birth,[],0,"[Jamaica Hospital:3, New York City:3, Queens:3]",3,[Jamaica Hospital],1
Donald Trump,Person Siblings,[],0,[],0,"[Maryanne Trump Barry, Robert Trump, Fred Trump Jr., Elizabeth Trump Grau]",4


### Extras+

In [None]:
message_id = u.get_id(query)

for i,df in enumerate(DFs):
    df = add_dummy(df, person=isPerson)
    df = count_confidence(df)
    df = df.join(web)
    DFs[i] = df.join(reality)

In [174]:
main_df = MERGED_DF.drop('Confidence',axis=1)
main_df = main_df.groupby(['Subject','Relationship','Object']).size().to_frame('c').reset_index()
main_df['Object'] = main_df.apply(lambda main_df: main_df['Object']+':'+str(main_df['c']), axis=1) 
main_df = main_df.drop('c',axis=1)
main_df = add_dummy(main_df)

In [175]:
main_df

Unnamed: 0,Subject,Relationship,Object
0,Donald Trump,Citizen of,New Zealand:1
1,Donald Trump,Citizen of,Russian:1
2,Donald Trump,Citizen of,United States:1
3,Donald Trump,Citizen of,Yemen:1
4,Donald Trump,Organization Founded By^-1,Democrats:1
5,Donald Trump,Organization Founded By^-1,National Rifle Association:1
6,Donald Trump,Person Employee or Member of,Democratic:1
7,Donald Trump,Person Employee or Member of,Democrats:1
8,Donald Trump,Person Employee or Member of,Oval Office:1
9,Donald Trump,Person Employee or Member of,Republican:1
