# Web scraping

Web Scraping:
    Document loading -> parsing -> extraction -> transformation

Python packages commonly used for web scraping include:

1. Pattern

2. Scrapy

3. Mechanize

4. Beautiful Soup

5. Requests
    
    

In [1]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

# IMDb search results: titles released in 2017, sorted by number of votes
# (descending), first page.
my_url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

# Download the raw HTML. The context manager closes the connection even when
# read() raises, which the previous manual close() call did not guarantee.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# Parse the downloaded page with Python's built-in HTML parser.
page_soup = soup(page_html, "html.parser")

In [15]:
# Collect every <div class="lister-item mode-advanced"> on the page; each one
# wraps a single movie entry. find_all is the current Beautiful Soup spelling
# (findAll is a deprecated alias kept only for backward compatibility).
movie_containers = page_soup.find_all("div", class_='lister-item mode-advanced')

In [16]:
# Number of movie containers found on this results page.
print(len(movie_containers))

50


Now we’ll loop over every container and extract, in turn, each item of interest:

The name of the movie.

The year of release.

The IMDB rating.

The Metascore.

The number of votes.

In [17]:
# Parallel lists: position i in every list describes the same movie.
names, years, imdb_ratings, metascores, votes = [], [], [], [], []

for container in movie_containers:
    # Entries without a Metascore block are skipped entirely.
    if container.find('div', class_ = 'ratings-metascore') is None:
        continue

    header = container.h3

    # Title: text of the link inside the <h3> header.
    names.append(header.a.text)

    # Release year, stored exactly as displayed on the page.
    years.append(header.find('span', class_ = 'lister-item-year').text)

    # IMDB rating: text of the first <strong> element, as a float.
    imdb_ratings.append(float(container.strong.text))

    # Metascore, as an integer.
    metascore_text = container.find('span', class_ = 'metascore').text
    metascores.append(int(metascore_text))

    # Vote count: read from the data-value attribute of the span named "nv".
    vote_count = container.find('span', attrs = {'name':'nv'})['data-value']
    votes.append(int(vote_count))

In [18]:
import pandas as pd
# Pair each column label with the list scraped for it, then build the table.
test_df = pd.DataFrame(dict(zip(
    ['movie', 'year', 'imdb', 'metascore', 'votes'],
    [names, years, imdb_ratings, metascores, votes],
)))
# info() prints its own report and returns None, so wrapping it in print()
# also emits a trailing "None" line.
print(test_df.info())
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movie      42 non-null     object 
 1   year       42 non-null     object 
 2   imdb       42 non-null     float64
 3   metascore  42 non-null     int64  
 4   votes      42 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,(2017),8.1,77,619911
1,Thor: Ragnarok,(2017),7.9,74,556060
2,Guardians of the Galaxy Vol. 2,(2017),7.6,67,543184
3,Star Wars: The Last Jedi,(2017),7.0,84,542263
4,Wonder Woman,(2017),7.4,76,530226
5,Dunkirk,(2017),7.9,94,521806
6,Spider-Man: Homecoming,(2017),7.4,73,487738
7,Get Out,(I) (2017),7.7,85,464274
8,It,(I) (2017),7.3,69,436751
9,Blade Runner 2049,(2017),8.0,81,433837


In [20]:
# Same scraped lists as before, assembled under the name used for the rest of
# the analysis.
rating_columns = {
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
}
movie_ratings = pd.DataFrame(rating_columns)
# info() prints its report itself and returns None; print() then shows "None".
print(movie_ratings.info())
# Preview only the first ten rows.
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movie      42 non-null     object 
 1   year       42 non-null     object 
 2   imdb       42 non-null     float64
 3   metascore  42 non-null     int64  
 4   votes      42 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,(2017),8.1,77,619911
1,Thor: Ragnarok,(2017),7.9,74,556060
2,Guardians of the Galaxy Vol. 2,(2017),7.6,67,543184
3,Star Wars: The Last Jedi,(2017),7.0,84,542263
4,Wonder Woman,(2017),7.4,76,530226
5,Dunkirk,(2017),7.9,94,521806
6,Spider-Man: Homecoming,(2017),7.4,73,487738
7,Get Out,(I) (2017),7.7,85,464274
8,It,(I) (2017),7.3,69,436751
9,Blade Runner 2049,(2017),8.0,81,433837


# cleaning the data

In [21]:
# Select the columns in an explicit, fixed order (all rows kept).
movie_ratings = movie_ratings.loc[:, ['movie', 'year', 'imdb', 'metascore', 'votes']]
# Preview the first five rows.
movie_ratings.head(5)

Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,(2017),8.1,77,619911
1,Thor: Ragnarok,(2017),7.9,74,556060
2,Guardians of the Galaxy Vol. 2,(2017),7.6,67,543184
3,Star Wars: The Last Jedi,(2017),7.0,84,542263
4,Wonder Woman,(2017),7.4,76,530226


In [22]:
# Distinct values present in the year column.
movie_ratings.loc[:, 'year'].unique()


array(['(2017)', '(I) (2017)'], dtype=object)

In [4]:
# Pretty-print the HTML of the first movie container to inspect its structure.
# The scraped entries live in `movie_containers`; `containers` is never
# assigned in this notebook, so the old call raised NameError on a fresh kernel.
print(movie_containers[0].prettify())

<div class="lister-item mode-advanced">
 <div class="lister-top-right">
  <div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342">
  </div>
 </div>
 <div class="lister-item-image float-left">
  <a href="/title/tt3315342/">
   <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="67"/>
  </a>
 </div>
 <div class="lister-item-content">
  <h3 class="lister-item-header">
   <span class="lister-item-index unbold text-primary">
    1.
   </span>
   <a href="/title/tt3315342/">
    Logan
   </a>
   <span class="lister-item-year text-muted unbold">
    (2017)
   </span>
  </h3>
  <p class="text-muted">
   <span class="certificate">
    A
   </span>
   <span class="ghost">
    |
   </span>
   <spa

In [9]:
# Keep the first movie entry around for ad-hoc inspection. The scraped entries
# are stored in `movie_containers`; `containers` is never assigned anywhere.
container = movie_containers[0]
# Example lookup, left disabled: the poster <img> carries the title in its
# alt attribute. The first <div> of a container holds no <img>, so search the
# whole container instead of going through container.div.
#print(container.find('img')["alt"])

# Named Entity Recognition (NER)

In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline from its installed package; `nlp` is
# the callable used by the cells below to turn text into a parsed document.
nlp = en_core_web_sm.load()

# Entity

In [15]:
from pprint import pprint

# Run the pipeline on one example sentence.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
)
# Show each detected entity as a (text, label) pair.
pprint([(entity.text, entity.label_) for entity in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


# Token

In [16]:
# For every token of the example sentence: the token, its IOB tag
# (B = begins an entity, I = inside one, O = outside) and its entity type.
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])


[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [7]:
pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: html5lib

Successfully installed html5lib-1.1


# Extracting named entities from an article


In [5]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        Text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by one space.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server responds with a 4xx or 5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page as
    # if it were the article.
    res.raise_for_status()
    # Named `page` rather than `soup` so it does not shadow the module-level
    # `soup` alias imported at the top of the notebook.
    page = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose contents should not appear in the extracted text.
    for element in page(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', page.get_text()))
# Fetch the article text and run it through the spaCy pipeline.
ny_bb = url_to_string(
    'https://towardsdatascience.com/nlp-for-indian-languages-310d1d8a10b6'
)
article = nlp(ny_bb)

# Number of named entities found in the whole article.
len(article.ents)


251

There are 251 entities present in the article

In [6]:
# Entity-type label of every entity found in the article.
labels = [entity.label_ for entity in article.ents]

# How many entities carry each label.
Counter(labels)

Counter({'NORP': 29,
         'PRODUCT': 4,
         'ORG': 79,
         'CARDINAL': 20,
         'LANGUAGE': 15,
         'GPE': 40,
         'PERCENT': 2,
         'MONEY': 2,
         'PERSON': 37,
         'ORDINAL': 1,
         'LOC': 1,
         'DATE': 14,
         'EVENT': 3,
         'WORK_OF_ART': 3,
         'QUANTITY': 1})

In [7]:
# The three most frequent entity texts in the article.

items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Indian', 20), ('English', 20), ('India', 18)]

In [8]:
# Materialise the article's sentence spans so they can be indexed.
sentences = list(article.sents)
# Show the sentence at index 20, used as the running example below.
print(sentences[20])

However, it’s worth noting that majority of the Indian population in India is still based in rural areas where teaching and learning would be in local languages, where communities are literate, but still are not familiar with English.


In [9]:
# Re-parse the example sentence on its own and highlight its named entities.
displacy.render(
    nlp(str(sentences[20])),
    style='ent',
    jupyter=True,
)


In [10]:
# Draw the dependency parse of the example sentence; 'distance' is passed
# through to displaCy as a rendering option.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)


# Extracting parts of speech

In [11]:
# (text, part-of-speech, lemma) for each token of the example sentence,
# leaving out stop words and punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]


[('’s', 'PROPN', '’s'),
 ('worth', 'ADJ', 'worth'),
 ('noting', 'VERB', 'note'),
 ('majority', 'NOUN', 'majority'),
 ('Indian', 'ADJ', 'indian'),
 ('population', 'NOUN', 'population'),
 ('India', 'PROPN', 'India'),
 ('based', 'VERB', 'base'),
 ('rural', 'ADJ', 'rural'),
 ('areas', 'NOUN', 'area'),
 ('teaching', 'NOUN', 'teaching'),
 ('learning', 'NOUN', 'learning'),
 ('local', 'ADJ', 'local'),
 ('languages', 'NOUN', 'language'),
 ('communities', 'NOUN', 'community'),
 ('literate', 'ADJ', 'literate'),
 ('familiar', 'ADJ', 'familiar'),
 ('English', 'PROPN', 'English')]

In [12]:
# Map each entity text in the example sentence to its entity label.
{str(entity): entity.label_ for entity in nlp(str(sentences[20])).ents}


{'Indian': 'NORP', 'India': 'GPE', 'English': 'LANGUAGE'}

In [13]:
# Token-level view of the example sentence: token, IOB tag and entity type.
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in sentences[20]
])


[(However, 'O', ''), (,, 'O', ''), (it, 'O', ''), (’s, 'O', ''), (worth, 'O', ''), (noting, 'O', ''), (that, 'O', ''), (majority, 'O', ''), (of, 'O', ''), (the, 'O', ''), (Indian, 'B', 'NORP'), (population, 'O', ''), (in, 'O', ''), (India, 'B', 'GPE'), (is, 'O', ''), (still, 'O', ''), (based, 'O', ''), (in, 'O', ''), (rural, 'O', ''), (areas, 'O', ''), (where, 'O', ''), (teaching, 'O', ''), (and, 'O', ''), (learning, 'O', ''), (would, 'O', ''), (be, 'O', ''), (in, 'O', ''), (local, 'O', ''), (languages, 'O', ''), (,, 'O', ''), (where, 'O', ''), (communities, 'O', ''), (are, 'O', ''), (literate, 'O', ''), (,, 'O', ''), (but, 'O', ''), (still, 'O', ''), (are, 'O', ''), (not, 'O', ''), (familiar, 'O', ''), (with, 'O', ''), (English, 'B', 'LANGUAGE'), (., 'O', '')]


# Visualizing the entities of the entire article

In [14]:
# Highlight every named entity across the full article text.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)
