In [0]:
# Importing Libraries
import requests
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
%matplotlib inline
from warnings import filterwarnings
filterwarnings('ignore')

# ***Data Scraping***
Scraping News Articles for Data Retrieval

In [0]:
# Seed URLs of the Inshorts category pages to scrape (technology, sports, world)
seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

In [0]:
def build_dataset(seed_urls, timeout=30):
  """Scrape headline/article pairs from each seed URL into one DataFrame.

  Args:
    seed_urls: iterable of category page URLs. The last path segment of each
      URL (e.g. ``technology``) is used as the article's category label.
    timeout: seconds to wait for each HTTP response before giving up, so a
      stalled connection cannot hang the notebook indefinitely.

  Returns:
    A DataFrame with the columns ``news_headline``, ``news_article`` and
    ``news_category`` (in that order). The columns are present even when no
    article was found, so downstream cells do not fail with a KeyError.

  Raises:
    requests.HTTPError: if a page responds with an HTTP error status.
    requests.RequestException: on connection problems or timeouts.
  """
  columns = ['news_headline', 'news_article', 'news_category']
  news_data = []
  for url in seed_urls:
    # Category label = last path segment; tolerate a trailing slash.
    news_category = url.rstrip('/').split('/')[-1]
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    headline_cards = soup.find_all('div', class_=["news-card-title news-right-box"])
    article_cards = soup.find_all('div', class_=["news-card-content news-right-box"])
    # Headline and content cards are paired purely by their order on the page.
    for headline, article in zip(headline_cards, article_cards):
      news_data.append({
          'news_headline': headline.find('span', attrs={"itemprop": "headline"}).string,
          'news_article': article.find('div', attrs={"itemprop": "articleBody"}).string,
          'news_category': news_category,
      })

  # Passing `columns` fixes the column order and keeps the schema when empty.
  return pd.DataFrame(news_data, columns=columns)

In [8]:
# Scrape every seed category into a single DataFrame and preview the first rows
news_df = build_dataset(seed_urls)
news_df.head()

Unnamed: 0,news_headline,news_article,news_category
0,"I've resigned from Reddit board, urged to fill...",Reddit Co-founder Alexis Ohanian resigned from...,technology
1,Jio Platforms raises extra $602 mn from Silver...,Reliance Industries has announced that US priv...,technology
2,US startup Kitty Hawk ends its 'flying car' pr...,"US-based aviation startup Kitty Hawk, which is...",technology
3,I don't agree: Bezos to user emailing against ...,Amazon CEO Jeff Bezos on Instagram posted his ...,technology
4,Airtel denies reports of Amazon looking to bu...,Airtel has denied reports that Amazon is in ea...,technology


In [9]:
# Count the scraped articles per category
news_df.news_category.value_counts()

sports        25
technology    25
world         25
Name: news_category, dtype: int64

# **Text Wrangling & Pre-processing**


In [15]:
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata
# Fetch the NLTK stopword corpus (read later through nltk.corpus.stopwords)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Load the small English spaCy pipeline. The legacy `parse`/`tag`/`entity`
# keyword arguments are dropped: spaCy v3's `spacy.load` rejects unknown
# keywords with a TypeError, and the pipeline components are enabled by default.
nlp = spacy.load('en_core_web_sm')
tokenizer = ToktokTokenizer()
# English stopwords minus the negations 'no' and 'not', so that negations are
# not filtered out by stopword removal. Filtering (instead of list.remove)
# cannot raise if a word is missing from the downloaded list.
stopword_list = [word for word in nltk.corpus.stopwords.words('english')
                 if word not in ('no', 'not')]

In [17]:
# Strip HTML tags from scraped data
def strip_html_tags(text):
  """Return the text content of `text` with all HTML markup removed."""
  return BeautifulSoup(text, 'html.parser').get_text()

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

In [18]:
# Remove accented characters from scraped data
def remove_accented_chars(text):
  """Fold accented characters in `text` to their plain ASCII base letters.

  The text is NFKD-decomposed so that accents become separate combining
  marks, then every character that cannot be encoded as ASCII is dropped.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sōṁē āccēntēd tēxt')

'Some accented text'

In [20]:
# Remove special characters from scraped data
def remove_special_chars(text,remove_digits=False):
  """Delete every character that is not an ASCII letter or whitespace.

  Digits are kept unless `remove_digits` is true.
  """
  if remove_digits:
    unwanted = r'[^a-zA-Z\s]'
  else:
    unwanted = r'[^a-zA-Z0-9\s]'
  return re.sub(unwanted, '', text)

remove_special_chars('Well this was fun! What do you think? 123#@!',remove_digits=True)

'Well this was fun What do you think '

In [21]:
# Stemming technique
# Simple stemmer built on nltk's PorterStemmer
def simple_stemmer(text):
  """Stem every whitespace-separated word of `text` and re-join with spaces."""
  stemmer = nltk.porter.PorterStemmer()
  stems = map(stemmer.stem, text.split())
  return ' '.join(stems)

simple_stemmer('My system keeps crashing his crashed yesterday, ours crashes daily')

'My system keep crash hi crash yesterday, our crash daili'

In [25]:
# Lemmatization technique
def lemmatize_text(text):
  """Replace each spaCy token of `text` by its lemma, joined with spaces.

  Tokens whose lemma is the '-PRON-' placeholder keep their original text,
  so the placeholder never ends up in the output.
  """
  lemmas = []
  for token in nlp(text):
    lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
  return ' '.join(lemmas)

lemmatize_text('My system keeps crashing his crashed yesterday, ours crashes daily')

'My system keep crash his crash yesterday , ours crash daily'

In [27]:
# Remove stopwords
def remove_stopwords(text, is_lower_case=False):
  """Drop every token of `text` that is in the module-level `stopword_list`.

  Args:
    text: the string to filter; it is split with the module-level Toktok
      `tokenizer`.
    is_lower_case: kept for backward compatibility only. Tokens are always
      lower-cased for the stopword lookup, so the flag does not change the
      result (the original if/else branches were identical).

  Returns:
    The surviving tokens, in their original casing, joined by single spaces.
  """
  # Build the set once per call: O(1) membership tests instead of list scans.
  stopwords = set(stopword_list)
  tokens = (token.strip() for token in tokenizer.tokenize(text))
  kept_tokens = [token for token in tokens if token.lower() not in stopwords]
  return ' '.join(kept_tokens)
remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

In [0]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lowercase=True,
                     text_lemmatization=True,stopwords_removal=True,
                     remove_digits=True,special_chars_removal=True):
  """Run the text-cleaning pipeline over every document of `corpus`.

  Steps, in order, each controlled by its flag: HTML stripping, accent
  removal, lower-casing, newline collapsing (always), lemmatization,
  stopword removal, special-character removal, and whitespace squeezing
  (always).

  Args:
    corpus: iterable of document strings.
    html_stripping: strip HTML markup with `strip_html_tags`.
    contraction_expansion: accepted for interface compatibility but currently
      unused; no contraction-expansion step is performed.
    accented_char_removal: fold accents with `remove_accented_chars`.
    text_lowercase: lower-case the document.
    text_lemmatization: lemmatize with `lemmatize_text`.
    stopwords_removal: filter stopwords with `remove_stopwords`.
    remove_digits: also drop digits during special-character removal.
    special_chars_removal: drop non-alphanumeric characters with
      `remove_special_chars`.

  Returns:
    A list of normalized strings, one per input document, without leading
    or trailing spaces.
  """
  # Compiled once: these patterns do not depend on the document.
  # Only CR/LF are matched; the old class `[\r|\n|\r\n]` also matched a
  # literal '|'.
  newline_pattern = re.compile(r'[\r\n]+')
  # The hyphen is escaped; the old `(-)` inside the class was a range from
  # '(' to ')' and therefore never matched '-'.
  special_char_pattern = re.compile(r'([{.(\-)!}])')
  multi_space_pattern = re.compile(' +')

  normalized_corpus = []

  for doc in corpus:
    if html_stripping:
      doc = strip_html_tags(doc)
    if accented_char_removal:
      doc = remove_accented_chars(doc)
    if text_lowercase:
      doc = doc.lower()
    doc = newline_pattern.sub(' ', doc)
    if text_lemmatization:
      doc = lemmatize_text(doc)
    if stopwords_removal:
      doc = remove_stopwords(doc,is_lower_case=text_lowercase)
    if special_chars_removal:
      # Pad these characters with spaces first so the words they join
      # (e.g. 'co-investors') stay separate once the characters are removed.
      doc = special_char_pattern.sub(" \\1 ", doc)
      doc = remove_special_chars(doc, remove_digits=remove_digits)
    # Squeeze runs of spaces and trim the ends left behind by removals.
    doc = multi_space_pattern.sub(' ', doc).strip()

    normalized_corpus.append(doc)

  return normalized_corpus


In [39]:
# Combine each headline and its article body into one field: '<headline>. <article>'
news_df['full_text'] = news_df['news_headline'].map(str) + '. ' + news_df['news_article']
# Normalize the combined text with the pipeline's default settings
news_df['clean_text'] = normalize_corpus(news_df['full_text'])

# Keep the cleaned documents as a plain Python list
norm_corpus = list(news_df['clean_text'])
# Show raw vs. cleaned text of the second article as a quick sanity check
news_df.iloc[1][['full_text', 'clean_text']].to_dict()

{'clean_text': 'jio platforms raise extra mn silver lake th deal week reliance industry announce us private equity fund silver lake co investor invest additional million jio platform investment come top million silver lake commit jio platform reliance sell nearly stake jio platform seven fundraising deal six week raise billion ',
 'full_text': 'Jio Platforms raises extra $602 mn from Silver Lake, its 7th deal in 6 weeks. Reliance Industries has announced that US private equity fund Silver Lake and its co-investors will invest an additional $602 million in Jio Platforms. The investment comes on top of the $749 million Silver Lake committed to Jio Platforms. Reliance has now sold a nearly 20% stake in Jio Platforms in seven fundraising deals in six weeks, raising $12.2 billion.'}

In [0]:
# Persist the scraped and cleaned articles to news.csv (index column omitted)
news_df.to_csv('news.csv',index=False,encoding='utf-8')

# *Understanding Language Syntax and Structure*
sentence → clauses → phrases → words

In [45]:
# Fetch the averaged-perceptron tagger data that nltk.pos_tag relies on
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [48]:
# Tagging Parts of Speech
# Create a basic preprocessed corpus; don't lowercase, to keep POS context
corpus = normalize_corpus(news_df['full_text'], text_lowercase=False,text_lemmatization=False,special_chars_removal=False)

# Demo for POS tagging of news headlines
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)

# POS tagging with spaCy
spacy_pos_tagged = [(word,word.tag_,word.pos_) for word in sentence_nlp]
# This table is not the cell's last expression, so it has to be displayed
# explicitly; otherwise it is built and silently discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS Tag', 'Tag Type']))

# POS tagging with nltk (whitespace tokens, so punctuation stays attached)
nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag
0,Jio,NNP
1,Platforms,NNP
2,raises,VBZ
3,extra,JJ
4,$602,NNP
5,mn,NN
6,from,IN
7,Silver,NNP
8,"Lake,",NNP
9,its,PRP$


In [55]:
# Fetch the CoNLL-2000 corpus read through nltk.corpus.conll2000
nltk.download('conll2000')

[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.


True

In [56]:
from nltk.corpus import conll2000

# Chunk-annotated sentences of the CoNLL-2000 corpus
data = conll2000.chunked_sents()
# First 10,900 sentences for training, the remainder held out for testing
train_data = data[:10900]
test_data = data[10900:]
print(len(train_data), len(test_data))
# Print one training sentence to show the chunk-tree format
print(train_data[1]) 

10900 48
(S
  Chancellor/NNP
  (PP of/IN)
  (NP the/DT Exchequer/NNP)
  (NP Nigel/NNP Lawson/NNP)
  (NP 's/POS restated/VBN commitment/NN)
  (PP to/TO)
  (NP a/DT firm/NN monetary/JJ policy/NN)
  (VP has/VBZ helped/VBN to/TO prevent/VB)
  (NP a/DT freefall/NN)
  (PP in/IN)
  (NP sterling/NN)
  (PP over/IN)
  (NP the/DT past/JJ week/NN)
  ./.)


In [82]:
import emoji
# `use_aliases=True` was removed in emoji 2.0 and raises a TypeError there;
# `language='alias'` is its replacement for alias shortcodes like :thumbsup:.
print(emoji.emojize('NLP is :thumbsup:', language='alias'))

NLP is 👍
