In [0]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [0]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [3]:
pip install mathematicians

[31mERROR: Could not find a version that satisfies the requirement mathematicians (from versions: none)[0m
[31mERROR: No matching distribution found for mathematicians[0m


In [0]:
###from mathematicians import simple_get ##dont have to import that

In [0]:
raw_html = simple_get('https://realpython.com/blog/')

In [6]:
len(raw_html)

39199

In [0]:
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')

In [8]:
no_html is None

True

In [0]:
raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')


In [0]:
html = BeautifulSoup(raw_html, 'html.parser')


In [11]:
for i, li in enumerate(html.select('li')):
        print(i, li.text)

0  Isaac Newton
 Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

1  Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

2  Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

3  Leonhard Euler
 Bernhard Riemann

4  Bernhard Riemann

5  Henri Poincaré
 Joseph-Louis Lagrange
 Euclid  of Alexandria
 David Hilbert
 Gottfried W. Leibniz

6  Joseph-Louis Lagrange
 Euclid  of Alexandria
 David Hilbert
 Gottfried W. Leibniz

7  Euclid  of Alexandria
 David Hilbert
 Gottfried W. Leibniz

8  David Hilbert
 Gottfried W. Leibniz

9  Gottfried W. Leibniz

10  Alexandre Grothendieck
 Pierre de Fermat
 Évariste Galois
 John von Neumann
 René Descartes

11  Pierre de Fermat
 Évariste Galois
 John von Neumann
 René Descartes

12  Évariste Galois
 John von Neumann
 René Descartes

13  John von Neumann
 René Descartes

14  René Descartes

15  Karl W. T. Weierstrass
 Srinivasa Ramanujan
 Hermann K. H. Weyl
 Peter G. L. Dirichlet
 Niels Abel

16  Srinivasa Ramanujan
 Hermann K. H. Weyl
 

In [0]:
def get_names():
  
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select('li'):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))

In [0]:
def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the 
    last 60 days, as an `int`
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get('https://xtools.wmflabs.org/articleinfo/en.wikipedia.org/{}')

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]

        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except:
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None

In [14]:
if __name__ == '__main__':
    print('Getting the list of names....')
    names = get_names()
    print(len(names))
    print('... done.\n')

    results = []

    print('Getting stats for each name....')

    for name in names:
        try:
            hits = get_hits_on_name(name)
            if hits is None:
                hits = -1
            results.append((hits, name))
        except:
            results.append((-1, name))
            log_error('error encountered while processing '
                      '{}, skipping'.format(name))

    print('... done.\n')

    results.sort()
    results.reverse()

    if len(results) > 5:
        top_marks = results[:5]
    else:
        top_marks = results

    print('\nThe most popular mathematicians are:\n')
    for (mark, mathematician) in top_marks:
        print('{} with {} pageviews'.format(mathematician, mark))

    no_results = len([res for res in results if res[0] == -1])
    print('\nBut we did not find results for '
          '{} mathematicians on the list'.format(no_results))

Getting the list of names....
100
... done.

Getting stats for each name....
No pageviews found for F. L. Gottlob Frege
No pageviews found for Andrey N. Kolmogorov
No pageviews found for F. Gotthold Eisenstein
No pageviews found for Diophantus  of Alexandria
No pageviews found for Godfrey H. Hardy
No pageviews found for Isaac Newton
No pageviews found for Brahmagupta
No pageviews found for Christiaan Huygens
No pageviews found for Johann H. Lambert
No pageviews found for F.E.J. Émile Borel
No pageviews found for Jakob Steiner
No pageviews found for Gaspard Monge
No pageviews found for Carl F. Gauss
No pageviews found for John von Neumann
No pageviews found for Alhazen ibn al-Haytham
No pageviews found for Ernst E. Kummer
No pageviews found for Jean-Victor Poncelet
No pageviews found for Évariste Galois
No pageviews found for Bonaventura Cavalieri
No pageviews found for George D. Birkhoff
No pageviews found for Girolamo Cardano
No pageviews found for Joseph-Louis Lagrange
No pageviews f

As stated

*We’ve received an email from an XTools maintainer informing us that scraping XTools is harmful and that automation APIs should be used instead:This article on your site is essentially a guide to scraping XTools […] This is not necessary, and it’s causing problems for us. We have APIs that should be used for automation, and furthermore, for pageviews specifically folks should be using the official pageviews API.The example code in the article was modified to no longer make requests to the XTools website. The web scraping techniques demonstrated here are still valid, but please do not use them on web pages of the XTools project. Use the provided automation API instead.*


In [15]:
pip install wikipedia

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-cp36-none-any.whl size=11686 sha256=23a705f565716d097d67d3c30da4466f82d97da1c4592bdfff9257f819276b7a
  Stored in directory: /root/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [0]:
import wikipedia

In [17]:
print(wikipedia.search("Issac newton"))

['Isaac Newton', 'Religious views of Isaac Newton', 'Early life of Isaac Newton', "Isaac Newton's occult studies", 'Later life of Isaac Newton', 'Isaac Newton Van Nuys', 'Christine King Farris', 'Institute of Physics Isaac Newton Medal', 'Isaac Newton Institute', 'Isaac Newton Group of Telescopes']


In [18]:
print(wikipedia.summary("Isaac Newton",sentences =3))

Sir Isaac Newton  (25 December 1642 – 20 March 1726/27) was an English mathematician, physicist, astronomer, theologian, and author (described in his own day as a "natural philosopher") who is widely recognised as one of the most influential scientists of all time and as a key figure in the scientific revolution. His book Philosophiæ Naturalis Principia Mathematica (Mathematical Principles of Natural Philosophy), first published in 1687, laid the foundations of classical mechanics. Newton also made seminal contributions to optics, and shares credit with Gottfried Wilhelm Leibniz for developing the infinitesimal calculus.


In [19]:
print(wikipedia.page("Isaac Newton").content)

Sir Isaac Newton  (25 December 1642 – 20 March 1726/27) was an English mathematician, physicist, astronomer, theologian, and author (described in his own day as a "natural philosopher") who is widely recognised as one of the most influential scientists of all time and as a key figure in the scientific revolution. His book Philosophiæ Naturalis Principia Mathematica (Mathematical Principles of Natural Philosophy), first published in 1687, laid the foundations of classical mechanics. Newton also made seminal contributions to optics, and shares credit with Gottfried Wilhelm Leibniz for developing the infinitesimal calculus.
In Principia, Newton formulated the laws of motion and universal gravitation that formed the dominant scientific viewpoint until it was superseded by the theory of relativity. Newton used his mathematical description of gravity to prove Kepler's laws of planetary motion, account for tides, the trajectories of comets, the precession of the equinoxes and other phenomena,

In [20]:
print(wikipedia.page("Isaac Newton").references)

['http://www.businessinsider.com.au/isaac-newton-lost-a-fortune-on-englands-hottest-stock-2016-1', 'http://www.historychannel.com.au/classroom/day-in-history/553/isaac-newton-is-knighted', 'http://cantic.bnc.cat/registres/CUCId/a10433399', 'http://data.rero.ch/02-A003638065', 'http://www.bncatalogo.cl/F?func=direct&local_base=red10&doc_number=000039641', 'http://www.amazon.com/dp/B0000CIHG7', 'http://www.astro.com/astro-databank/Newton,_Isaac', "http://www.christianpost.com/article/20070619/28049_Papers_Show_Isaac_Newton's_Religious_Side,_Predict_Date_of_Apocalypse.htm", 'http://www.earlymoderntexts.com/', 'http://news.nationalgeographic.com/2016/04/160404-isaac-newton-alchemy-mercury-recipe-chemistry-science/', 'http://www.online-literature.com/charlotte-yonge/john-keble/6/', 'http://www.oxforddnb.com/view/article/1541', 'http://www.phaser.com/modules/historic/newton/index.html', 'http://www.pierre-marteau.com/editions/1701-25-mint-reports/report-1717-09-25.html', 'http://www.tunablel

In [21]:
print(wikipedia.page("Isaac Newton").title)

Isaac Newton


In [22]:
print(wikipedia.page("Isaac Newton").categories)

['1642 births', '1727 deaths', '17th-century English mathematicians', '17th-century English writers', '17th-century Latin-language writers', '17th-century alchemists', '17th-century apocalypticists', '17th-century male writers', '18th-century British scientists', '18th-century English mathematicians', '18th-century English writers', '18th-century Latin-language writers', '18th-century alchemists', '18th-century apocalypticists', '18th-century male writers', 'AC with 29 elements', 'All articles with dead external links', 'All articles with specifically marked weasel-worded phrases', 'All articles with unsourced statements', 'All articles with vague or ambiguous time', 'Alumni of Trinity College, Cambridge', 'Antitrinitarians', 'Articles containing French-language text', 'Articles containing Hebrew-language text', 'Articles containing Latin-language text', 'Articles prone to spam from December 2018', 'Articles with Encyclopædia Britannica links', 'Articles with Internet Archive links', '

## github pageview-api
https://github.com/Commonists/pageview-api

In [23]:
pip install git+https://github.com/Commonists/pageview-api.git


Collecting git+https://github.com/Commonists/pageview-api.git
  Cloning https://github.com/Commonists/pageview-api.git to /tmp/pip-req-build-zssaggoc
  Running command git clone -q https://github.com/Commonists/pageview-api.git /tmp/pip-req-build-zssaggoc
Collecting attrdict
  Downloading https://files.pythonhosted.org/packages/ef/97/28fe7e68bc7adfce67d4339756e85e9fcf3c6fd7f0c0781695352b70472c/attrdict-2.0.1-py2.py3-none-any.whl
Building wheels for collected packages: pageviewapi
  Building wheel for pageviewapi (setup.py) ... [?25l[?25hdone
  Created wheel for pageviewapi: filename=pageviewapi-Undefined-cp36-none-any.whl size=5298 sha256=52273db3d261e575c65dfc15da3b656e158fefdd0960a243de027c3340e607a8
  Stored in directory: /tmp/pip-ephem-wheel-cache-5w0qceaq/wheels/72/11/91/779b84efbca7fdf9f0582e678083e54cf77ec3b299ba77d14c
Successfully built pageviewapi
Installing collected packages: attrdict, pageviewapi
Successfully installed attrdict-2.0.1 pageviewapi-Undefined


In [0]:
import pageviewapi

In [0]:
import pageviewapi.period

In [26]:
pageviewapi.period.sum_last('en.wikipedia', 'Bernhard Riemann', last=30,
                            access='all-access', agent='all-agents')

12761

In [27]:
pageviewapi.period.sum_last('fr.wikipedia','Taylor Swift', last=30,
                            access='all-access', agent='all-agents')

28258

In [28]:
if __name__ == '__main__':
    print('Getting the list of names....')
    names = get_names()
    print(len(names))
    print('... done.\n')

    results = []

    print('Getting stats for each name....')

    for name in names:
        try:


            hits = pageviewapi.period.sum_last('en.wikipedia', name, last=60,
                            access='all-access', agent='all-agents')
            if hits is None:
                hits = -1
            results.append((hits, name))
        except:
            results.append((-1, name))
            log_error('error encountered while processing '
                      '{}, skipping'.format(name))

    print('... done.\n')

    results.sort()
    results.reverse()

    if len(results) > 5:
        top_marks = results[:5]
    else:
        top_marks = results

    print('\nThe most popular mathematicians are:\n')
    for (mark, mathematician) in top_marks:
        print('{} with {} pageviews'.format(mathematician, mark))

    no_results = len([res for res in results if res[0] == -1])
    print('\nBut we did not find results for '
          '{} mathematicians on the list'.format(no_results))

Getting the list of names....
100
... done.

Getting stats for each name....
error encountered while processing F. L. Gottlob Frege, skipping
error encountered while processing F. Gotthold Eisenstein, skipping
error encountered while processing Diophantus  of Alexandria, skipping
error encountered while processing F.E.J. Émile Borel, skipping
error encountered while processing Alhazen ibn al-Haytham, skipping
error encountered while processing Ernst E. Kummer, skipping
error encountered while processing Euclid  of Alexandria, skipping
error encountered while processing Adrien M. Legendre, skipping
error encountered while processing Gottfried W. Leibniz, skipping
error encountered while processing Panini  of Shalatula, skipping
error encountered while processing Bháscara (II) Áchárya, skipping
error encountered while processing Archytas  of Tarentum, skipping
error encountered while processing James J. Sylvester, skipping
error encountered while processing Omar al-Khayyám, skipping
erro



The most popular mathematicians are:

1.  Albert Einstein with 1222398 pageviews
2.  Isaac Newton with 507906 pageviews
3.  Aristotle with 342921 pageviews
4.  Srinivasa Ramanujan with 320293 pageviews
5.  Galileo Galilei with 313525 pageviews

But we did not find results for 27 mathematicians on the list

In [29]:
pageviewapi.per_article('en.wikipedia', 'Yu_Shuxin', '20200601', '20200607',
                        access='all-access', agent='all-agents', granularity='daily')

AttrDict({'items': [{'project': 'en.wikipedia', 'article': 'Yu_Shuxin', 'granularity': 'daily', 'timestamp': '2020060100', 'access': 'all-access', 'agent': 'all-agents', 'views': 1541}, {'project': 'en.wikipedia', 'article': 'Yu_Shuxin', 'granularity': 'daily', 'timestamp': '2020060200', 'access': 'all-access', 'agent': 'all-agents', 'views': 1202}, {'project': 'en.wikipedia', 'article': 'Yu_Shuxin', 'granularity': 'daily', 'timestamp': '2020060300', 'access': 'all-access', 'agent': 'all-agents', 'views': 1110}, {'project': 'en.wikipedia', 'article': 'Yu_Shuxin', 'granularity': 'daily', 'timestamp': '2020060400', 'access': 'all-access', 'agent': 'all-agents', 'views': 1028}, {'project': 'en.wikipedia', 'article': 'Yu_Shuxin', 'granularity': 'daily', 'timestamp': '2020060500', 'access': 'all-access', 'agent': 'all-agents', 'views': 819}, {'project': 'en.wikipedia', 'article': 'Yu_Shuxin', 'granularity': 'daily', 'timestamp': '2020060600', 'access': 'all-access', 'agent': 'all-agents', '

In [30]:
pageviewapi.period.sum_last('en.wikipedia', 'Yu Shuxin', last=60,
                            access='all-access', agent='all-agents')

80077

In [31]:
pageviewapi.period.sum_last('en.wikipedia', 'Liu Yuxin', last=60,
                            access='all-access', agent='all-agents')

13165

In [32]:
pageviewapi.period.sum_last('en.wikipedia', 'Yu Yan', last=60,
                            access='all-access', agent='all-agents')

95

In [33]:
pageviewapi.period.avg_last('fr.wikipedia', 'Paris', last=30)

4138.433333333333

In [34]:
pageviewapi.legacy_pagecounts('fr.wikipedia', '2010010100', '2011010100', granularity='monthly')


AttrDict({'items': [{'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010010100', 'count': 516194646}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010020100', 'count': 487973789}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010030100', 'count': 509995947}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010040100', 'count': 443366887}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010050100', 'count': 452233260}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010060100', 'count': 333644715}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity': 'monthly', 'timestamp': '2010070100', 'count': 333580000}, {'project': 'fr.wikipedia', 'access-site': 'all-sites', 'granularity':

##  Tutorial: Python Web Scraping Using BeautifulSoup

In [0]:
import requests
from bs4 import BeautifulSoup



In [36]:
page = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = BeautifulSoup(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Mostly clear, with a low around 57. West wind around 14 mph, with gusts as high as 26 mph. " class="forecast-icon" src="newimages/medium/nfew.png" title="Tonight: Mostly clear, with a low around 57. West wind around 14 mph, with gusts as high as 26 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Clear
 </p>
 <p class="temp temp-low">
  Low: 57 °F
 </p>
</div>


In [37]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()
print(period)
print(short_desc)
print(temp)

Tonight
Mostly Clear
Low: 57 °F


In [38]:
img = tonight.find("img")
desc = img['title']
print(desc)

Tonight: Mostly clear, with a low around 57. West wind around 14 mph, with gusts as high as 26 mph. 


In [39]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Tonight',
 'Tuesday',
 'TuesdayNight',
 'Wednesday',
 'WednesdayNight',
 'Thursday',
 'ThursdayNight',
 'Friday',
 'FridayNight']

In [0]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]


In [41]:
print(short_descs)
print(temps)
print(descs)

['Mostly Clear', 'Sunny', 'Clear', 'Sunny', 'Clear', 'Sunny', 'Mostly Clear', 'Sunny', 'Mostly Clear']
['Low: 57 °F', 'High: 70 °F', 'Low: 57 °F', 'High: 78 °F', 'Low: 57 °F', 'High: 80 °F', 'Low: 57 °F', 'High: 76 °F', 'Low: 57 °F']
['Tonight: Mostly clear, with a low around 57. West wind around 14 mph, with gusts as high as 26 mph. ', 'Tuesday: Sunny, with a high near 70. West wind 10 to 17 mph, with gusts as high as 24 mph. ', 'Tuesday Night: Clear, with a low around 57. West wind 10 to 15 mph becoming light. Winds could gust as high as 21 mph. ', 'Wednesday: Sunny, with a high near 78. West wind 7 to 13 mph, with gusts as high as 23 mph. ', 'Wednesday Night: Clear, with a low around 57. West wind 9 to 14 mph becoming light west southwest  after midnight. ', 'Thursday: Sunny, with a high near 80.', 'Thursday Night: Mostly clear, with a low around 57.', 'Friday: Sunny, with a high near 76.', 'Friday Night: Mostly clear, with a low around 57.']


In [42]:
 import pandas as pd
weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
    "desc":descs
})
weather

Unnamed: 0,period,short_desc,temp,desc
0,Tonight,Mostly Clear,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. W..."
1,Tuesday,Sunny,High: 70 °F,"Tuesday: Sunny, with a high near 70. West wind..."
2,TuesdayNight,Clear,Low: 57 °F,"Tuesday Night: Clear, with a low around 57. We..."
3,Wednesday,Sunny,High: 78 °F,"Wednesday: Sunny, with a high near 78. West wi..."
4,WednesdayNight,Clear,Low: 57 °F,"Wednesday Night: Clear, with a low around 57. ..."
5,Thursday,Sunny,High: 80 °F,"Thursday: Sunny, with a high near 80."
6,ThursdayNight,Mostly Clear,Low: 57 °F,"Thursday Night: Mostly clear, with a low aroun..."
7,Friday,Sunny,High: 76 °F,"Friday: Sunny, with a high near 76."
8,FridayNight,Mostly Clear,Low: 57 °F,"Friday Night: Mostly clear, with a low around 57."


In [43]:
temp_nums = weather["temp"].str.extract("(?P<temp_num>\d+)", expand=False)
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    57
1    70
2    57
3    78
4    57
5    80
6    57
7    76
8    57
Name: temp_num, dtype: object

In [44]:
weather["temp_num"].mean()


65.44444444444444

In [45]:
is_night = weather["temp"].str.contains("Low")
weather["is_night"] = is_night
is_night

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8     True
Name: temp, dtype: bool

In [46]:
weather[is_night]


Unnamed: 0,period,short_desc,temp,desc,temp_num,is_night
0,Tonight,Mostly Clear,Low: 57 °F,"Tonight: Mostly clear, with a low around 57. W...",57,True
2,TuesdayNight,Clear,Low: 57 °F,"Tuesday Night: Clear, with a low around 57. We...",57,True
4,WednesdayNight,Clear,Low: 57 °F,"Wednesday Night: Clear, with a low around 57. ...",57,True
6,ThursdayNight,Mostly Clear,Low: 57 °F,"Thursday Night: Mostly clear, with a low aroun...",57,True
8,FridayNight,Mostly Clear,Low: 57 °F,"Friday Night: Mostly clear, with a low around 57.",57,True


## Scrapy



Here we try to use the scrapy, which has 37.4k stars at github

In [48]:
pip install scrapy

Collecting scrapy
[?25l  Downloading https://files.pythonhosted.org/packages/9a/d3/5af102af577f57f706fcb302ea47d40e09355778488de904b3594d4e48d2/Scrapy-2.1.0-py2.py3-none-any.whl (239kB)
[K     |█▍                              | 10kB 17.5MB/s eta 0:00:01[K     |██▊                             | 20kB 2.2MB/s eta 0:00:01[K     |████                            | 30kB 2.7MB/s eta 0:00:01[K     |█████▌                          | 40kB 2.6MB/s eta 0:00:01[K     |██████▉                         | 51kB 2.5MB/s eta 0:00:01[K     |████████▏                       | 61kB 2.7MB/s eta 0:00:01[K     |█████████▋                      | 71kB 3.0MB/s eta 0:00:01[K     |███████████                     | 81kB 3.2MB/s eta 0:00:01[K     |████████████▎                   | 92kB 3.4MB/s eta 0:00:01[K     |█████████████▋                  | 102kB 3.4MB/s eta 0:00:01[K     |███████████████                 | 112kB 3.4MB/s eta 0:00:01[K     |████████████████▍               | 122kB 3.4MB/s eta 

In [0]:
import re
import csv
import requests
from bs4 import BeautifulSoup

# Code to scrap the weather data
from urllib.request import urlopen
import pandas as pd

In [0]:
url = 'https://www.timeanddate.com/weather/china/taiyuan/historic'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', attrs={'id': 'wt-his'})

tablehr = table.find('thead').find_all('tr')
tablebr = table.find('tbody').find_all('tr')

In [0]:
data = []
tableheaders = []

wind_dir= []
all_td = table.find_all('td')
for row in all_td:
    for cols in row.find_all('span'):
        wind_dir.append(cols.get('title'))
        
df_wd = pd.DataFrame({'Wind':wind_dir})

for col in tablehr[1].find_all('th'):
    tableheaders.append(col.text.strip())
    

for row in tablebr:
    tabledata = []
    for cols in row.find_all(['td', 'th']):
        tabledata.append(cols.text.strip())
    data.append(tabledata)

In [0]:
WeatherData = pd.DataFrame(data, columns = ['Time', 'Fake', 'Temp', 'Weather', 'Wind', 'Direction', 'Humidity', 'Barometer', 'Visibility'])

del WeatherData['Time']
del WeatherData['Fake']

WeatherData['Direction'] = df_wd

In [12]:
WeatherData.to_csv('weatherdata_pune.csv',index=False)
WeatherData.shape

(48, 7)

In [13]:
WeatherData.size


336

In [14]:
WeatherData


Unnamed: 0,Temp,Weather,Wind,Direction,Humidity,Barometer,Visibility
0,68 °F,Passing clouds.,3 mph,Wind blowing from 210° South-southwest to Nort...,69%,"29.80 ""Hg",
1,68 °F,Passing clouds.,3 mph,Wind blowing from 210° South-southwest to Nort...,69%,"29.80 ""Hg",
2,68 °F,Passing clouds.,6 mph,Wind blowing from 200° South-southwest to Nort...,69%,"29.80 ""Hg",
3,68 °F,Passing clouds.,3 mph,Wind blowing from 220° Southwest to Northeast,69%,"29.80 ""Hg",
4,70 °F,Passing clouds.,6 mph,Wind blowing from 200° South-southwest to Nort...,64%,"29.80 ""Hg",
5,70 °F,Clear.,6 mph,Wind blowing from 220° Southwest to Northeast,60%,"29.83 ""Hg",6 mi
6,70 °F,Light rain. Clear.,3 mph,Wind blowing from 210° South-southwest to Nort...,60%,"29.80 ""Hg",6 mi
7,70 °F,Clear.,6 mph,Wind blowing from 220° Southwest to Northeast,64%,"29.83 ""Hg",4 mi
8,70 °F,Clear.,3 mph,Wind blowing from 220° Southwest to Northeast,64%,"29.83 ""Hg",5 mi
9,70 °F,Clear.,6 mph,Wind blowing from 250° West-southwest to East-...,60%,"29.83 ""Hg",4 mi
