In [None]:
from bs4 import BeautifulSoup
import requests

import sqlite3
from sqlite3 import Error

import os
import time
import pandas as pd

In [None]:
def createConnection(db_file):
    """Open a SQLite connection to ``db_file``.

    Parameters
    ----------
    db_file : path (or ``":memory:"``) passed straight to ``sqlite3.connect``.

    Returns
    -------
    The opened ``sqlite3.Connection``, or ``None`` when ``sqlite3.connect``
    raises ``sqlite3.Error`` (the error is printed, not re-raised).
    """
    try:
        return sqlite3.connect(db_file)
    except Error as connectionError:
        # Best-effort: report the problem and let the caller deal with None.
        print(connectionError)
        return None

In [None]:
def createTable(conn, create_table_sql):
    """Execute ``create_table_sql`` on ``conn``.

    Any ``sqlite3.Error`` raised while creating the cursor or executing the
    statement is printed instead of being re-raised. Always returns ``None``.
    """
    try:
        conn.cursor().execute(create_table_sql)
    except Error as tableError:
        print(tableError)

In [None]:
def createEntry(conn, entry):
    """Insert one article row into the ``articles`` table and commit it.

    Parameters
    ----------
    conn : open ``sqlite3.Connection``.
    entry : sequence of four values bound, in order, to
        ``articleUrl, category, publicationTime, articleContent``.

    Returns
    -------
    None. A ``sqlite3.Error`` raised by the insert is printed rather than
    re-raised, and the failed transaction is rolled back.
    """
    try:
        sql = ''' INSERT INTO articles(articleUrl, category, publicationTime, articleContent)
                  VALUES(?,?,?,?) '''
        # The connection context manager commits on success and rolls back on
        # error. Without a commit the inserted rows are only visible through
        # this connection and are lost once it goes away.
        with conn:
            conn.execute(sql, entry)
    except Error as e:
        print(e)
    return

In [None]:
def fetchSite(siteUrl, timeout = 30):
    """Download ``siteUrl`` and return it parsed as a ``BeautifulSoup`` tree.

    Parameters
    ----------
    siteUrl : URL to request.
    timeout : seconds passed to ``requests.get`` as its ``timeout``; without
        one a stalled server would block the scrape indefinitely.

    Returns
    -------
    ``BeautifulSoup`` object built from the response body with the ``lxml``
    parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the single retry (after a 5 second pause) fails as well.
    """
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',})

    try:
        htmlFile = requests.get(siteUrl, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only network-level failures trigger the retry; a bare ``except``
        # would also trap KeyboardInterrupt and make the run hard to stop.
        print("Some connection problem @ {} - waiting for 5 seconds to retry...".format(siteUrl))
        time.sleep(5)
        htmlFile = requests.get(siteUrl, headers=headers, timeout=timeout)
    return BeautifulSoup(htmlFile.content, "lxml")

In [None]:
def extractLinks(siteUrl, sitePrefix):
    """Collect article URLs from one listing page.

    For every ``<div class="contDetails">`` on the page fetched from
    ``siteUrl``, takes the ``href`` of the first ``<a>`` inside its first
    ``<h3>`` and prepends ``sitePrefix``.

    Parameters
    ----------
    siteUrl : URL of the listing page.
    sitePrefix : string concatenated in front of every extracted ``href``.

    Returns
    -------
    List of URL strings in page order. Teasers without an ``<h3>``, an
    ``<a>`` or an ``href`` are skipped instead of raising, so one malformed
    teaser does not abort the whole scrape.
    """
    links = []
    for teaser in fetchSite(siteUrl).find_all("div", {"class":"contDetails"}):
        heading = teaser.find("h3")
        anchor = heading.find("a") if heading is not None else None
        if anchor is None or not anchor.get("href"):
            continue
        links.append(sitePrefix + anchor["href"])
    return links

In [None]:
def drillCategory(baseUrl, sitePrefix, minArticlesNo = 100):
    """Walk a category's listing pages until enough article URLs are found.

    Page 1 is ``baseUrl``; page ``k`` (k >= 2) is ``baseUrl + ",nPack," + str(k)``.

    Parameters
    ----------
    baseUrl : URL of the first listing page of the category.
    sitePrefix : forwarded to ``extractLinks``.
    minArticlesNo : stop as soon as at least this many distinct URLs have
        been collected.

    Returns
    -------
    List of distinct article URLs in discovery order. It may be shorter than
    ``minArticlesNo`` when a page contributes no new URL, which is treated as
    the end of the category (otherwise the loop would never terminate).
    """
    articlesUrls = []
    seenUrls = set()

    currentUrl = baseUrl
    pageCounter = 1
    while len(articlesUrls) < minArticlesNo:
        newUrlsCount = 0
        for url in extractLinks(currentUrl, sitePrefix):
            # The same article can appear on more than one page; keep only
            # the first occurrence.
            if url not in seenUrls:
                seenUrls.add(url)
                articlesUrls.append(url)
                newUrlsCount += 1

        print("So far, extracted {} urls on {} pages...".format(len(articlesUrls), pageCounter))

        if newUrlsCount == 0:
            print("No new urls @ {} - stopping early.".format(currentUrl))
            break

        pageCounter += 1
        currentUrl = baseUrl + ",nPack," + str(pageCounter)

    return articlesUrls

In [None]:
def extractArticleDetails(articleUrl):
    """Fetch one article page and pull out its timestamp and body text.

    Returns
    -------
    ``(publicationTime, articleContent)`` where ``publicationTime`` is the
    ``content`` attribute of the first ``<meta>`` inside
    ``<div class="article-date">`` and ``articleContent`` is the text of all
    ``<p>`` elements inside ``<div class="articleContent">`` joined by single
    spaces.
    """
    soup = fetchSite(articleUrl)

    dateBox = soup.find("div", {"class":"article-date"})
    publicationTime = dateBox.find("meta")["content"]

    contentBox = soup.find("div", {"class":"articleContent"})
    paragraphTexts = [paragraph.text for paragraph in contentBox.find_all("p")]
    articleContent = " ".join(paragraphTexts)

    return publicationTime, articleContent

In [None]:
def runScraper(articlesSource, conn, minArticlesNo = 20, sitePrefix = ""):
    """Scrape every category in ``articlesSource`` and store the articles.

    Parameters
    ----------
    articlesSource : dict mapping a category name to the URL of its first
        listing page.
    conn : open ``sqlite3.Connection`` handed to ``createEntry``.
    minArticlesNo : forwarded to ``drillCategory`` for each category.
    sitePrefix : forwarded to ``drillCategory`` for each category.

    Returns
    -------
    None. Progress is reported with ``print``. The per-category total counts
    the articles handed to ``createEntry``.
    """
    for category, categoryUrl in articlesSource.items():
        print("\n--- Category: {}\n".format(category))

        articlesCounter = 0
        try:
            for articleUrl in drillCategory(categoryUrl, sitePrefix, minArticlesNo):
                try:
                    publicationTime, articleContent = extractArticleDetails(articleUrl)
                except (AttributeError, KeyError, TypeError) as e:
                    # These are what the extraction raises when an expected
                    # element or attribute is missing from the page; skip that
                    # article instead of aborting the whole run.
                    print("Skipping {} - could not parse article ({!r})".format(articleUrl, e))
                    continue

                createEntry(conn, (articleUrl, category, publicationTime, articleContent))

                articlesCounter += 1
                if articlesCounter % 10 == 0:
                    print("Wow! Already have {} articles in category '{}'!".format(articlesCounter, category))
        finally:
            # Persist whatever was inserted for this category, even when an
            # error interrupts the loop; uncommitted rows would be lost.
            conn.commit()

        print("Total number of articles in category '{}': {}".format(category, articlesCounter))

In [None]:
# Scraper configuration: category name -> first listing page of that category.
articlesSource = {"kraj":"https://www.rmf24.pl/fakty/polska/", 
                  "świat":"https://www.rmf24.pl/fakty/swiat/", 
                  "kultura":"https://www.rmf24.pl/kultura/", 
                  "ekonomia":"https://www.rmf24.pl/ekonomia/", 
                  "sport":"https://www.rmf24.pl/sport/", 
                  "nauka":"https://www.rmf24.pl/nauka/"}
# Minimum number of article URLs to collect per category.
minArticlesNo = 1000
# Prepended to the hrefs extracted from the listing pages.
sitePrefix = "https://www.rmf24.pl"

# Database connection setup. exist_ok makes re-running this cell safe.
os.makedirs("../data", exist_ok=True)
conn = createConnection("../data/newsDatabase.db")
# createConnection returns None on failure; stop here with a clear message
# instead of failing later with an unrelated AttributeError.
if conn is None:
    raise RuntimeError("Could not open database ../data/newsDatabase.db")

# Create the table for articles (no-op when it already exists).
sqlCreateArticlesTable = """ CREATE TABLE IF NOT EXISTS articles (
                                        id integer PRIMARY KEY,
                                        articleUrl text,
                                        category text,
                                        publicationTime text,
                                        articleContent text
                                    ); """
createTable(conn, sqlCreateArticlesTable)

# Main part: long-running, performs one HTTP request per listing page and per article.
runScraper(articlesSource, conn, minArticlesNo, sitePrefix)

In [None]:
# Load every row of the articles table into a DataFrame indexed by its id column.
articlesData = pd.read_sql_query(
    sql = "SELECT * FROM articles",
    con = conn,
    index_col = "id",
)

In [None]:
# Print a summary of the loaded articles (columns, dtypes, non-null counts).
articlesData.info()

In [None]:
# Preview the first 20 rows of the loaded articles.
articlesData.head(20)