<a href="https://colab.research.google.com/github/Supriya090/Knowledge-Graph-Task/blob/master/KnowledgeGraphTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web Scraping using Beautiful Soup




In [43]:
# importing the necessary libaries
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# preparing the csv file
news_data_csv = open('news_data.csv', 'w')
fieldnames = ['TITLE', 'CONTENT']
writer = csv.DictWriter(news_data_csv, fieldnames=fieldnames)
writer.writeheader()

In [4]:
# getting the headings and content of article
def get_content(url):
  page = requests.get(url)
  page_soup = BeautifulSoup(page.content, 'html.parser')     # parsing code in HTML

  #finding the heading
  news_title = page_soup.find_all('h1')
  print(news_title[0].get_text())

  #finding the content
  article_text=''
  news_content = page_soup.find('div', class_="post-content-wrap").findAll('p')   #getting only the content of <p> tag
  for element in news_content:
    article_text += '\n' + ''.join(element.findAll(text = True))
  print(article_text)

  #writing the title and content to CSV
  writer.writerow({'TITLE':news_title[0].get_text(), 'CONTENT':article_text})

In [5]:
# getting the headings of article
def get_links(url):
  page = requests.get(url)
  page_soup = BeautifulSoup(page.content, 'html.parser')     # parsing code in HTML
  news_headings = page_soup.find('div',class_="listical-news-big").find_all('h2')
  link_with_href = []
  link_list = []
  for heads in news_headings:
    link_with_href.append(heads.find('a', href=True))
    # print(link_with_href['href'])
  #   link_with_href.append()
  for i in range(20):
    link_list.append(link_with_href[i]['href'])
  return link_list

In [None]:
links = get_links("https://english.onlinekhabar.com/category/business")
for i in range(20):
  get_content(links[i])

news_data_csv.close()
print("CSV has been generated")

In [7]:
#reading the generated csv file
news_info = pd.read_csv('/content/news_data.csv')
news_info.head()

Unnamed: 0,TITLE,CONTENT
0,\nGovt collects Rs 92.78 billion in fuel taxes...,"\nKathmandu, May 24\nAs the fuel prices keep o..."
1,\n2nd fuel price hike in 8 days: Petrol Rs 180...,"\nKathmandu, May 23\nThe government-run fuel d..."
2,\nGovt reducing airport infrastructure budget ...,"\nKathmandu, May 22\nThe governing is reducing..."
3,\nNepal imported gold worth Rs 35.77 billion i...,"\nKathmandu, May 22\nWhile there are reports t..."
4,\nFood delivery services are booming in Kathma...,"\nA couple of weeks back, Bintika Kafle of Kat..."


In [8]:
#replacing the newline as blank space
news_info = news_info.replace(r'\n',' ', regex=True) 
#changing content column to lowercase
news_info['CONTENT'] = news_info['CONTENT'].str.lower()
#removing commas from content column
news_info['CONTENT'] = news_info['CONTENT'].str.replace(',','')
news_info.head()

Unnamed: 0,TITLE,CONTENT
0,Govt collects Rs 92.78 billion in fuel taxes ...,kathmandu may 24 as the fuel prices keep on i...
1,"2nd fuel price hike in 8 days: Petrol Rs 180,...",kathmandu may 23 the government-run fuel dist...
2,Govt reducing airport infrastructure budget t...,kathmandu may 22 the governing is reducing it...
3,Nepal imported gold worth Rs 35.77 billion in...,kathmandu may 22 while there are reports that...
4,Food delivery services are booming in Kathman...,a couple of weeks back bintika kafle of kathm...


In [9]:
#removing the date and place of report
news_info["CONTENT"] = news_info["CONTENT"].str.split().str[3:].str.join(sep=" ")
print(news_info.head())

                                               TITLE  \
0   Govt collects Rs 92.78 billion in fuel taxes ...   
1   2nd fuel price hike in 8 days: Petrol Rs 180,...   
2   Govt reducing airport infrastructure budget t...   
3   Nepal imported gold worth Rs 35.77 billion in...   
4   Food delivery services are booming in Kathman...   

                                             CONTENT  
0  as the fuel prices keep on increasing you migh...  
1  the government-run fuel distribution monopoly ...  
2  the governing is reducing its airport infrastr...  
3  while there are reports that the country’s eco...  
4  weeks back bintika kafle of kathmandu was home...  


## Sentence Segmentation with spaCy

In [None]:
!python -m spacy download en_core_web_lg

In [44]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

#Segmenting the news content into sentences and placing them in the same list
news = news_info["CONTENT"]
sentences = []
for each_news in news:
  indv_sentences = [i for i in nlp(each_news).sents]
  sentences.extend(indv_sentences)
print(sentences)

[as the fuel prices keep on increasing you might have already understood that one of the reasons is the high fuel taxes levied by the government., apparently the fuel prices are quite high as the government collected rs 92.78 billion in the past 10 months., following the recent fuel price hike on sunday the nepal oil corporation the government-run fuel distribution monopoly has to pay rs 63.23 in taxes to the government for a litre of petrol., likewise the corporation has to pay in taxes rs 45.91 for a litre of diesel and rs 13.66 for a litre of kerosene., meanwhile the new fuel tax rate for a cylinder of cooking gas is rs 327.27., whereas the government is under the pressure from consumer rights activists and other stakeholders to reduce tax rates the ministry of finance is not ready yet according to an official., the government-run fuel distribution monopoly nepal oil corporation (noc) has increased fuel prices once again for the second time in the past eight days., of late the corpo

The ideal case is one subject and one object but due to complex sentences in Online Khabar, it is diffult to obtain that.

In [42]:
# Checking the syntatic structure and parts of speech in the sentence
doc = nlp("apparently the fuel prices are quite high as the government collected rs 92.78 billion in the past 10 months")

for tok in doc:
  print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

apparently --> advmod --> ADV
the --> det --> DET
fuel --> compound --> NOUN
prices --> nsubj --> NOUN
are --> ROOT --> AUX
quite --> advmod --> ADV
high --> acomp --> ADJ
as --> mark --> SCONJ
the --> det --> DET
government --> nsubj --> NOUN
collected --> advcl --> VERB
rs --> quantmod --> NOUN
92.78 --> compound --> NUM
billion --> dobj --> NUM
in --> prep --> ADP
the --> det --> DET
past --> amod --> ADJ
10 --> nummod --> NUM
months --> pobj --> NOUN


## Entity Pair Extraction

In [20]:
# getting the entities
def get_entities(entry):
  ent1 = ""
  ent2 = ""

  prev_tok_dep = ""     # dependency tag of previous token in the sentence
  prev_tok_text = ""    # previous token in the sentence

  # holds the text associated with the subject or the object
  prefix = ""
  modifier = ""


  for tok in nlp(entry):
    # if token is punctuation, move on to the next token
    if tok.dep_ != "punct":
      # checking if the token is a compound word
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a compound, then adding the current word to it
        if prev_tok_dep == "compound":
          prefix = prev_tok_text + " "+ tok.text

      # checking if the token is a modifier
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a compound, then adding the current word to it
        if prev_tok_dep == "compound":
          modifier = prev_tok_text + " "+ tok.text

      # capturing the token as first entity if it is a subject
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        # resetting the variables
        prefix = ""
        modifier = ""
        prev_tok_dep = ""
        prev_tok_text = ""

      # capturing the token as second entity if it is an object
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix + " "+ tok.text
        # resetting the variables
        prefix = ""
        modifier = ""
        prev_tok_dep = ""
        prev_tok_text = ""

      # updating previous tokens and dependencies
      prev_tok_dep = tok.dep_
      prev_tok_text = tok.text

  # returning the entities
  return [ent1.strip(), ent2.strip()]

In [23]:
# testing the get_entities function
get_entities("the decision comes into effect on wednesday")   #using a simple sentence for better functionality

['decision', 'wednesday']

In [37]:
# type conversion
print(type(sentences[0]))
print(type(sentences[0].text))
get_entities(sentences[0].text)

<class 'spacy.tokens.span.Span'>
<class 'str'>


['already  one', 'high fuel government']

In [40]:
#Extracting entity pairs for all the sentences
entity_pairs = []

for each_sentence in sentences:
  entity_pairs.append(get_entities(each_sentence.text))

print(entity_pairs)

[['already  one', 'high fuel government'], ['quite  government', '10  months'], ['run fuel distribution monopoly', 'petrol'], ['likewise  corporation', 'kerosene'], ['new fuel tax rate', 'cooking  gas'], ['ministry', 'yet  official'], ['run nepal oil corporation', 'eight  days'], ['late  corporation', 'fuel prices'], ['fortnight', 'decision'], ['statement', 'gas petrol'], ['that', 'july'], ['amount', 'already  operation'], ['amount', 'new  airport'], ['time  authority', 'smaller  amount'], ['government', 'domestic airport infrastructures'], ['big  chunk', 'this'], ['10 billion 10 which', 'tribhuvan international tribhuvan airport'], ['government data', 'gold'], ['also luxury item', '10  months'], ['nepal', 'fiscal  year'], ['country', 'same  period'], ['country', '10  months'], ['year  volume', 'imports'], ['', ''], ['', 'period'], ['government', 'revenues'], ['back bintika kafle', 'kathmandu'], ['she', ''], ['it', 'doorstep'], ['kafle', 'too  food'], ['which', 'food delivery services'

## Relation/Predicate Extraction using spaCy's Rule-based Matching

Assuming relation/predicate as the main verb in the sentence

In [49]:
# getting relations from sentences
def get_relation(entry):

  # creating a spaCy object
  doc = nlp(entry)

  # creating a matcher class object 
  matcher = Matcher(nlp.vocab)

  # defining the pattern 
  pattern = [{'DEP':'ROOT'},            # finds root word in the sentence
            {'DEP':'prep','OP':"?"},    # checks if it is followed by preposition
            {'DEP':'agent','OP':"?"},   # checks if it is followed by agent
            {'POS':'ADJ','OP':"?"}]     # checks adjective part of speech 

  # adding rule to the matcher
  matcher.add("match_1", None, pattern)

  matches = matcher(doc)
  k = len(matches) - 1
  span = doc[matches[k][1]:matches[k][2]] 

  return (span.text)

In [50]:
# testing get_relation function
get_relation("the decision comes into effect on wednesday")

'comes into'

In [53]:
# getting relations from all the sentences
relations = []

for each_sentence in sentences:
  relations.append(get_relation(each_sentence.text))

print(relations)

['understood', 'are', 'has', 'has', 'is', 'is under', 'increased', 'increasing', 'delayed by', 'is at', 'reducing', 'completed', 'is in', 'says', 'allocated', 'is', 'expected', 'are', 'rose', 'says', 'is', 'imported', 'was', 'billion', 'had', 'earned', 'was', 'was hungry', 'was', 'decided', 'chose', 'ordered', 'turned into', 'says', 'was', 'stopped', 'talks to', 'booming at', 'gone through similar', 'ordered', 'supposed', 'took', 'says', 'booming', 'dissatisfied with', 'taking', 'ordering from', 'takes', 'is', 'add more', 'follows many', 'lost', 'came', 'reads', 'showing', 'given', 'called', 'was', 'deleted', 'hamal', 'says', 'makes', 'adds', 'takes', 'is', 'says', 'stuck', 'adds', 'take', 'are', 'says', 'start', 'warns', 'echoes', 'wants', 'tried', 'hiked inter', 'is', 'said', 'made', 'comes into', 'find', 'says', 'is', 'means', 'blamed for poor', 'allocated', 'scaled', 'yet in', 'informs', 'is', 'meet', 'increased', 'increasing', 'delayed due', 'was in', 'made', 'is in', 'is', 'flyin

In [56]:
# looking at the most frequent relations
pd.Series(relations).value_counts()[:20]

says         29
is           14
was           5
states        3
say           3
said          3
are           3
been          3
takes         2
start         2
preparing     2
adds          2
made          2
given         2
stopped       2
ordered       2
wants         2
projected     2
is at         2
has           2
dtype: int64