In [19]:
# IMPORTANT - not all asyncio features work with Python < 3.7

# IMPORTANT 2 - this script was put together quickly, solely to try out asyncio
# and the difference in execution time (89 s vs 795 s) - it should not be treated as a good example

# IMPORTANT 3 - with a resource monitor running (mainly for the network) you can see
# when the script does most of its work and when it sits idle - there is room for improvement here;
# I will change this if I find a spare moment

In [2]:
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession

import sqlite3
from sqlite3 import Error

import os
import time
import pandas as pd

In [3]:
def createConnection(db_file):
    """Open a SQLite connection to ``db_file``.

    Returns the ``sqlite3.Connection`` on success. If ``sqlite3.connect``
    raises ``sqlite3.Error`` the error is printed and ``None`` is returned,
    so callers must be prepared for a ``None`` result.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as connectError:
        print(connectError)
        return None

In [4]:
def createTable(conn, create_table_sql):
    """Execute the DDL statement ``create_table_sql`` on ``conn``.

    A ``sqlite3.Error`` raised while executing the statement is printed
    instead of being propagated. Nothing is returned.
    """
    try:
        conn.cursor().execute(create_table_sql)
    except Error as sqlError:
        print(sqlError)

In [5]:
def createEntry(conn, entry):
    """Insert one row into the ``articles`` table.

    ``entry`` supplies the four bound values in the order
    (articleUrl, category, publicationTime, articleContent).
    The insert is not committed here; that is left to the caller.
    A ``sqlite3.Error`` is printed instead of being propagated.
    Always returns ``None``.
    """
    insertSql = ''' INSERT INTO articles(articleUrl, category, publicationTime, articleContent)
                  VALUES(?,?,?,?) '''
    try:
        conn.cursor().execute(insertSql, entry)
    except Error as sqlError:
        print(sqlError)

In [6]:
async def fetchSite(siteUrl, session, retryDelays=(5, 30)):
    """Download ``siteUrl`` through ``session`` and return the response body as text.

    Parameters
    ----------
    siteUrl : str
        Address to request.
    session :
        Object whose ``get(url)`` can be awaited and yields a response
        exposing an awaitable ``text()`` (an ``aiohttp.ClientSession`` in
        this notebook).
    retryDelays : sequence of numbers, optional
        Seconds to sleep before each retry. The request is attempted
        ``len(retryDelays) + 1`` times in total; the default ``(5, 30)``
        keeps the original schedule of three attempts.

    Raises
    ------
    Exception
        Whatever the last ``session.get`` attempt raised, once all retries
        are used up. Errors raised by ``text()`` are not retried.
    """
    failedAttempts = 0
    while True:
        try:
            htmlFile = await session.get(siteUrl)
            break
        except Exception as e:
            if failedAttempts >= len(retryDelays):
                # Out of retries: say so (the old message still claimed a retry) and re-raise.
                print("Exception {} @ {} - giving up after {} attempts.".format(str(e), siteUrl, failedAttempts + 1))
                raise
            delay = retryDelays[failedAttempts]
            failedAttempts += 1
            # Report the delay actually used (the old message always said 5 seconds).
            print("Exception {} @ {} - waiting for {} seconds to retry...".format(str(e), siteUrl, delay))
            await asyncio.sleep(delay)

    return await htmlFile.text()

In [7]:
async def makeAsyncioSoup(urlList):
    """Fetch every URL in ``urlList`` concurrently and parse each page.

    A single ``ClientSession`` is shared by all requests and is closed
    before parsing starts. Returns one ``BeautifulSoup`` object (built with
    the "lxml" parser) per URL, in the same order as ``urlList``.
    """
    async with ClientSession() as session:
        pages = await asyncio.gather(*(fetchSite(url, session) for url in urlList))

    return [BeautifulSoup(page, "lxml") for page in pages]

In [8]:
async def extractLinks(siteUrl, sitePrefix):
    """Collect article links from one listing page.

    Downloads ``siteUrl`` and, for every ``<div class="contDetails">``,
    takes the ``href`` of the first ``<a>`` inside its first ``<h3>``.

    Parameters
    ----------
    siteUrl : str
        Listing page to download.
    sitePrefix : str
        String prepended to every extracted ``href``.

    Returns
    -------
    list of str
        ``sitePrefix + href`` for each teaser, in page order. Teasers
        without an ``<h3>``, an ``<a>`` or an ``href`` are skipped so a
        single malformed teaser does not abort the whole scrape.
    """
    soup = (await makeAsyncioSoup([siteUrl]))[0]

    links = []
    for container in soup.find_all("div", {"class":"contDetails"}):
        heading = container.find("h3")
        anchor = heading.find("a") if heading is not None else None
        href = anchor.get("href") if anchor is not None else None
        if href:
            links.append(sitePrefix + href)
    return links

In [9]:
async def drillCategory(baseUrl, sitePrefix, minArticlesNo = 100):
    """Walk a category's listing pages until enough article URLs are collected.

    Page 1 is ``baseUrl``; page N (N >= 2) is ``baseUrl + ",nPack," + str(N)``.
    Pages are fetched one after another via ``extractLinks``.

    Parameters
    ----------
    baseUrl : str
        URL of the first listing page of the category.
    sitePrefix : str
        Passed through to ``extractLinks``.
    minArticlesNo : int, optional
        Paging stops as soon as at least this many URLs have been collected.

    Returns
    -------
    list of str
        Collected article URLs. This may be more than ``minArticlesNo``
        because whole pages are appended, or fewer when a page yields no
        links before the target is reached.
    """
    articlesUrls = []

    currentUrl = baseUrl
    pageCounter = 1
    # Strict '<': stop once the minimum is reached instead of fetching one more page.
    while len(articlesUrls) < minArticlesNo:
        pageLinks = await extractLinks(currentUrl, sitePrefix)
        if not pageLinks:
            # An empty page adds nothing; without this break the loop would never end.
            print("No article links found on page {} - stopping at {} urls.".format(pageCounter, len(articlesUrls)))
            break
        articlesUrls.extend(pageLinks)

        print("So far, extracted {} urls on {} pages...".format(len(articlesUrls), pageCounter))

        pageCounter += 1
        currentUrl = baseUrl + ",nPack," + str(pageCounter)

    return articlesUrls

In [10]:
def _extractArticleFields(article, url):
    """Return ``(publicationTime, articleContent)`` parsed from one article page.

    Either value falls back to "" (with a printed notice naming ``url``)
    when the expected markup is absent.
    """
    # A missing tag makes find() return None, so absent markup shows up as
    # AttributeError (None.find), TypeError (None["content"]) or KeyError
    # (tag without the attribute). Catching only these lets KeyboardInterrupt
    # and task cancellation propagate, unlike a bare except.
    missingMarkup = (AttributeError, KeyError, TypeError)

    try:
        publicationTime = article.find("div", {"class":"article-date"}).find("meta")["content"]
    except missingMarkup:
        print("E: publicationTime is missing @ {}".format(url))
        publicationTime = ""

    try:
        articleContent = " ".join(p.text for p in article.find("div", {"class":"articleContent"}).find_all("p"))
    except missingMarkup:
        print("E: articleContent is missing @ {}".format(url))
        articleContent = ""

    return publicationTime, articleContent


async def runScraper(articlesSource, conn, minArticlesNo = 20, sitePrefix = ""):
    """Scrape every category in ``articlesSource`` and store the articles via ``conn``.

    For each category the article URLs are gathered with ``drillCategory``,
    all article pages are downloaded with ``makeAsyncioSoup``, and one row
    per article is written with ``createEntry``.

    Parameters
    ----------
    articlesSource : dict
        Maps category name to the URL of the category's first listing page.
    conn :
        Database connection handed to ``createEntry``; ``conn.commit()`` is
        called after each category so rows already scraped survive a
        failure in a later category.
    minArticlesNo : int, optional
        Forwarded to ``drillCategory``.
    sitePrefix : str, optional
        Forwarded to ``drillCategory``.
    """
    for category, categoryUrl in articlesSource.items():
        print("\n--- Category: {}\n".format(category))

        # Listing pages are walked sequentially; the article pages are fetched concurrently.
        categoryArticlesUrls = await drillCategory(categoryUrl, sitePrefix, minArticlesNo)
        categoryArticlesSoup = await makeAsyncioSoup(categoryArticlesUrls)

        articlesCounter = 0
        for url, article in zip(categoryArticlesUrls, categoryArticlesSoup):
            publicationTime, articleContent = _extractArticleFields(article, url)
            createEntry(conn, (url, category, publicationTime, articleContent))

            articlesCounter += 1
            if articlesCounter % 10 == 0:
                print("Wow! Already have {} articles in category '{}'!".format(articlesCounter, category))

        conn.commit()
        print("Total number of articles in category '{}': {}".format(category, articlesCounter))

In [11]:
# measuring execution time of the whole script
executionTime = time.time()

# setting up source
articlesSource = {"kraj":"https://www.rmf24.pl/fakty/polska/",
                  "świat":"https://www.rmf24.pl/fakty/swiat/", 
                  "kultura":"https://www.rmf24.pl/kultura/", 
                  "ekonomia":"https://www.rmf24.pl/ekonomia/", 
                  "sport":"https://www.rmf24.pl/sport/", 
                  "nauka":"https://www.rmf24.pl/nauka/"}
minArticlesNo = 100
sitePrefix = "https://www.rmf24.pl"

# database connection setup
if not os.path.exists("../data"):
    os.mkdir("../data")
conn = createConnection("../data/newsDatabaseAsync.db")

# creating table for articles
sqlCreateArticlesTable = """ CREATE TABLE IF NOT EXISTS articles (
                                        id integer PRIMARY KEY,
                                        articleUrl text,
                                        category text,
                                        publicationTime text,
                                        articleContent text
                                    ); """
createTable(conn, sqlCreateArticlesTable)

# main scraper
await runScraper(articlesSource, conn, minArticlesNo, sitePrefix)

# closing database connection
conn.commit()
conn.close()

executionTime = time.time() - executionTime
print("\nTotal execution time = {} sec.".format(executionTime))


--- Category: kraj

So far, extracted 15 urls on 1 pages...
So far, extracted 30 urls on 2 pages...
So far, extracted 45 urls on 3 pages...
So far, extracted 60 urls on 4 pages...
So far, extracted 75 urls on 5 pages...
So far, extracted 90 urls on 6 pages...
So far, extracted 105 urls on 7 pages...
Wow! Already have 10 articles in category 'kraj'!
Wow! Already have 20 articles in category 'kraj'!
Wow! Already have 30 articles in category 'kraj'!
Wow! Already have 40 articles in category 'kraj'!
Wow! Already have 50 articles in category 'kraj'!
Wow! Already have 60 articles in category 'kraj'!
Wow! Already have 70 articles in category 'kraj'!
Wow! Already have 80 articles in category 'kraj'!
Wow! Already have 90 articles in category 'kraj'!
Wow! Already have 100 articles in category 'kraj'!
Total number of articles in category 'kraj': 105

--- Category: świat

So far, extracted 15 urls on 1 pages...
So far, extracted 30 urls on 2 pages...
So far, extracted 45 urls on 3 pages...
So far

In [12]:
# reading in data

In [15]:
# Re-open the database file; the scraper cell closes its own connection when it finishes.
conn = createConnection("../data/newsDatabaseAsync.db")

In [16]:
# Load the whole articles table into a DataFrame, using the table's "id" column as the index.
articlesData = pd.read_sql_query("SELECT * FROM articles", conn, index_col = "id")

In [17]:
# Sanity check: row count, column dtypes and non-null counts of the scraped data.
articlesData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 630 entries, 1 to 630
Data columns (total 4 columns):
articleUrl         630 non-null object
category           630 non-null object
publicationTime    630 non-null object
articleContent     630 non-null object
dtypes: object(4)
memory usage: 24.6+ KB


In [18]:
# Preview the first rows (up to 20) to eyeball URLs, timestamps and article text.
articlesData.head(20)

Unnamed: 0_level_0,articleUrl,category,publicationTime,articleContent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://www.rmf24.pl/fakty/news-1000-zl-doplat...,kraj,2019-08-25T14:18:00,Zdecydowaliśmy się o dopłacie do hektara w prz...
2,https://www.rmf24.pl/goraca-linia/wasze-fakty/...,kraj,2019-08-25T13:54:15,Jadę S3 na północ od Gorzowa Wielkopolskiego. ...
3,https://www.rmf24.pl/fakty/polska/news-lodzkie...,kraj,2019-08-25T11:58:11,Do zdarzenia doszło wczoraj po godz. 14 na par...
4,https://www.rmf24.pl/fakty/polska/news-szczers...,kraj,2019-08-25T11:18:26,Na briefingu prasowym przed Pałacem Prezydenck...
5,https://www.rmf24.pl/fakty/polska/news-czwartk...,kraj,2019-08-25T09:57:44,Jak poinformowała rzeczniczka prasowa wojewody...
6,https://www.rmf24.pl/fakty/polska/news-slaskie...,kraj,2019-08-25T09:50:55,Do zdarzenia doszło w sobotę. Poszkodowany 85-...
7,https://www.rmf24.pl/fakty/polska/news-strazak...,kraj,2019-08-25T09:28:00,Cel podczas pierwszej wizyty wielkopolskiego s...
8,https://www.rmf24.pl/fakty/polska/news-swiadek...,kraj,2019-08-24T18:14:00,Pan Marcin wybrał się felernego dnia na wyciec...
9,https://www.rmf24.pl/fakty/polska/news-dziela-...,kraj,2019-08-24T16:47:00,Naukowcy z Muzeum Narodowego w Krakowie będą m...
10,https://www.rmf24.pl/fakty/polska/news-byk-prz...,kraj,2019-08-24T15:40:00,Jak poinformowała w sobotę PAP asp. sztab. Mon...
