In [1]:
from bs4 import BeautifulSoup
import requests

import sqlite3
from sqlite3 import Error

import os
import time
import pandas as pd

In [2]:
def createConnection(db_file):
    """Open a SQLite connection to ``db_file``.

    Returns the ``sqlite3.Connection`` on success. If ``sqlite3`` raises an
    ``Error`` while connecting, the error is printed and ``None`` is returned
    instead of propagating the exception.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as err:
        print(err)
        return None

In [3]:
def createTable(conn, create_table_sql):
    """Execute the statement ``create_table_sql`` on ``conn``.

    Any ``sqlite3.Error`` raised while obtaining the cursor or running the
    statement is printed rather than propagated. Nothing is returned.
    """
    try:
        conn.cursor().execute(create_table_sql)
    except Error as err:
        print(err)

In [4]:
def createEntry(conn, entry):
    """Insert one row into the ``articles`` table and commit it.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database that already contains ``articles``.
    entry : sequence
        Four values, in this order: articleUrl, category, publicationTime,
        articleContent. They are bound to the ``?`` placeholders.

    Returns
    -------
    None

    Notes
    -----
    A ``sqlite3.Error`` (missing table, wrong number of values, ...) is
    printed instead of raised, so one bad row does not abort the caller.
    """
    sql = ''' INSERT INTO articles(articleUrl, category, publicationTime, articleContent)
              VALUES(?,?,?,?) '''
    try:
        # An INSERT opens an implicit transaction. Using the connection as a
        # context manager commits it on success and rolls it back on error,
        # so the row is actually persisted and visible to other connections.
        with conn:
            conn.execute(sql, entry)
    except Error as e:
        print(e)
    return

In [5]:
def fetchSite(siteUrl, retryDelays=(5, 30), timeout=30):
    """Download ``siteUrl`` and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    siteUrl : str
        Address of the page to download.
    retryDelays : sequence of numbers, optional
        Seconds to wait after each failed attempt before trying again. The
        default ``(5, 30)`` gives three attempts in total.
    timeout : number, optional
        Timeout in seconds passed to ``requests.get`` so that a stalled
        connection fails (and is retried) instead of blocking forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the final attempt fails as well.
    """
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',})

    for delay in retryDelays:
        try:
            htmlFile = requests.get(siteUrl, headers=headers, timeout=timeout)
            break
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming errors propagate immediately.
            print("Some connection problem @ {} - waiting for {} seconds to retry...".format(siteUrl, delay))
            time.sleep(delay)
    else:
        # Every delayed retry has been used up: make the last attempt and let
        # its exception, if any, reach the caller.
        htmlFile = requests.get(siteUrl, headers=headers, timeout=timeout)
    return BeautifulSoup(htmlFile.content, "lxml")

In [6]:
def extractLinks(siteUrl, sitePrefix):
    """Return the article URLs listed on the page at ``siteUrl``.

    For every ``div`` with class ``contDetails`` on the fetched page, the
    ``href`` of the anchor inside its ``h3`` heading is taken and prefixed
    with ``sitePrefix``. The URLs are returned as a list in page order.
    """
    page = fetchSite(siteUrl)
    headings = []
    for block in page.find_all("div", {"class":"contDetails"}):
        headings.append(block.find("h3"))

    urls = []
    for heading in headings:
        anchor = heading.find("a")
        urls.append(sitePrefix + anchor["href"])
    return urls

In [7]:
def drillCategory(baseUrl, sitePrefix, minArticlesNo = 100):
    """Collect article URLs from consecutive listing pages of one category.

    The first page is ``baseUrl`` itself; page ``n`` (n >= 2) is requested as
    ``baseUrl + ",nPack," + str(n)``. Pages are read until at least
    ``minArticlesNo`` distinct URLs have been gathered, or until a page
    contributes no URL that was not already seen.

    Parameters
    ----------
    baseUrl : str
        Address of the first listing page of the category.
    sitePrefix : str
        Prefix forwarded to ``extractLinks`` and put in front of every link.
    minArticlesNo : int, optional
        Minimum number of URLs to collect before stopping.

    Returns
    -------
    list of str
        Distinct article URLs in the order they were first encountered.
    """
    articlesUrls = []
    seenUrls = set()

    currentUrl = baseUrl
    pageCounter = 1
    # Strict '<': once exactly minArticlesNo URLs are collected the minimum is
    # met, so no further page is requested.
    while len(articlesUrls) < minArticlesNo:
        # Keep only URLs not collected on an earlier page, so the same article
        # is not returned (and later stored) more than once.
        newUrls = []
        for url in extractLinks(currentUrl, sitePrefix):
            if url not in seenUrls:
                seenUrls.add(url)
                newUrls.append(url)
        articlesUrls.extend(newUrls)

        print("So far, extracted {} urls on {} pages...".format(len(articlesUrls), pageCounter))

        if not newUrls:
            # Nothing new on this page: requesting more pages would loop
            # forever without ever reaching minArticlesNo.
            print("No new urls found on page {} - stopping early.".format(pageCounter))
            break

        pageCounter += 1
        currentUrl = baseUrl + ",nPack," + str(pageCounter)

    return articlesUrls

In [8]:
def extractArticleDetails(articleUrl):
    """Fetch one article page and pull out its publication time and text.

    Parameters
    ----------
    articleUrl : str
        Address of the article page.

    Returns
    -------
    tuple
        ``(publicationTime, articleContent)``. ``publicationTime`` is the
        ``content`` attribute of the ``meta`` tag inside the ``div`` with
        class ``article-date``; ``articleContent`` is the text of all ``p``
        tags inside the ``div`` with class ``articleContent`` joined with
        single spaces. A part that cannot be located is reported with a
        printed message and returned as an empty string.
    """
    # Failures a missing element can cause: ``find`` returning None leads to
    # AttributeError (method call on None) or TypeError (subscripting None),
    # and a tag without the attribute raises KeyError. Anything else (for
    # example KeyboardInterrupt) is allowed to propagate.
    lookupErrors = (AttributeError, TypeError, KeyError)

    article = fetchSite(articleUrl)

    try:
        publicationTime = article.find("div", {"class":"article-date"}).find("meta")["content"]
    except lookupErrors:
        print("E: publicationTime is missing @ {}".format(articleUrl))
        publicationTime = ""
    try:
        articleContent = " ".join([p.text for p in article.find("div", {"class":"articleContent"}).find_all("p")])
    except lookupErrors:
        print("E: articleContent is missing @ {}".format(articleUrl))
        articleContent = ""

    return publicationTime, articleContent

In [9]:
def runScraper(articlesSource, conn, minArticlesNo = 20, sitePrefix = ""):
    """Scrape every category in ``articlesSource`` and store the articles.

    ``articlesSource`` maps a category name to the URL of its first listing
    page. For each category the article URLs are gathered with
    ``drillCategory``, each article is downloaded with
    ``extractArticleDetails`` and handed to ``createEntry`` together with its
    URL and category. Progress is printed after every tenth article and a
    total is printed per category. Nothing is returned.
    """
    for category, categoryUrl in articlesSource.items():
        print("\n--- Category: {}\n".format(category))

        categoryUrls = drillCategory(categoryUrl, sitePrefix, minArticlesNo)

        # Stays 0 when the category yields no URLs at all.
        articlesCounter = 0
        for articlesCounter, articleUrl in enumerate(categoryUrls, start=1):
            details = extractArticleDetails(articleUrl)
            publicationTime, articleContent = details

            row = (articleUrl, category, publicationTime, articleContent)
            createEntry(conn, row)

            if not articlesCounter % 10:
                print("Wow! Already have {} articles in category '{}'!".format(articlesCounter, category))

        print("Total number of articles in category '{}': {}".format(category, articlesCounter))

In [11]:
# setting up source: category name -> first listing page of that category
articlesSource = {"kraj":"https://www.rmf24.pl/fakty/polska/", 
                  "świat":"https://www.rmf24.pl/fakty/swiat/", 
                  "kultura":"https://www.rmf24.pl/kultura/", 
                  "ekonomia":"https://www.rmf24.pl/ekonomia/", 
                  "sport":"https://www.rmf24.pl/sport/", 
                  "nauka":"https://www.rmf24.pl/nauka/"}
minArticlesNo = 100
sitePrefix = "https://www.rmf24.pl"

# database connection setup
# makedirs(..., exist_ok=True) replaces the separate exists()/mkdir() pair:
# it does not fail when the directory is already there.
os.makedirs("../data", exist_ok=True)
conn = createConnection("../data/newsDatabase.db")

# creating table for articles
sqlCreateArticlesTable = """ CREATE TABLE IF NOT EXISTS articles (
                                        id integer PRIMARY KEY,
                                        articleUrl text,
                                        category text,
                                        publicationTime text,
                                        articleContent text
                                    ); """
createTable(conn, sqlCreateArticlesTable)

# main part
try:
    runScraper(articlesSource, conn, minArticlesNo, sitePrefix)
finally:
    # Commit whatever is still pending so the scraped rows reach the database
    # file even if the run stops part-way through.
    conn.commit()


--- Category: kraj

So far, extracted 15 urls on 1 pages...
So far, extracted 30 urls on 2 pages...
So far, extracted 45 urls on 3 pages...
So far, extracted 60 urls on 4 pages...
So far, extracted 75 urls on 5 pages...
So far, extracted 90 urls on 6 pages...
So far, extracted 105 urls on 7 pages...
Wow! Already have 10 articles in category 'kraj'!
Wow! Already have 20 articles in category 'kraj'!
Wow! Already have 30 articles in category 'kraj'!
Wow! Already have 40 articles in category 'kraj'!
Wow! Already have 50 articles in category 'kraj'!
Wow! Already have 60 articles in category 'kraj'!
Wow! Already have 70 articles in category 'kraj'!
Wow! Already have 80 articles in category 'kraj'!
Wow! Already have 90 articles in category 'kraj'!
Wow! Already have 100 articles in category 'kraj'!
Some connection problem @ https://www.rmf24.pl/fakty/polska/news-naczelnik-topr-o-akcji-w-jaskini-musimy-sie-przygotowac-na-d,nId,3155396 - waiting for 5 seconds to retry...
Total number of article

In [16]:
# Load every row of the articles table into a DataFrame indexed by the "id" column.
articlesData = pd.read_sql_query("SELECT * FROM articles", conn, index_col = "id")

In [13]:
# Summary of the loaded frame: row count, per-column non-null counts and dtypes.
articlesData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 630 entries, 1 to 630
Data columns (total 4 columns):
articleUrl         630 non-null object
category           630 non-null object
publicationTime    630 non-null object
articleContent     630 non-null object
dtypes: object(4)
memory usage: 24.6+ KB


In [15]:
# Preview the first 20 stored articles.
articlesData.head(20)

Unnamed: 0_level_0,articleUrl,category,publicationTime,articleContent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://www.rmf24.pl/fakty/polska/news-wspolpr...,kraj,2019-08-08T18:21:00,Według danych serwisu Fixly.pl aż 66% Polaków ...
2,https://www.rmf24.pl/fakty/polska/news-akcja-r...,kraj,2019-08-21T14:06:59,Akcja poszukiwawcza dwójki grotołazów trwa już...
3,https://www.rmf24.pl/fakty/news-wyciek-danych-...,kraj,2019-08-21T13:19:00,"""Oczekujemy niezwłocznego zabezpieczenia sprzę..."
4,https://www.rmf24.pl/fakty/polska/news-na-moto...,kraj,2019-08-21T13:04:30,Do zdarzenia doszło w minioną sobotę. Policjan...
5,https://www.rmf24.pl/fakty/polska/news-jacht-z...,kraj,2019-08-21T12:50:47,Żeglarz jest już bezpieczny. Jak usłyszał od s...
6,https://www.rmf24.pl/fakty/polska/news-kuria-o...,kraj,2019-08-21T12:33:56,Europoseł PiS Ryszard Czarnecki udostępnił na ...
7,https://www.rmf24.pl/fakty/polska/news-schetyn...,kraj,2019-08-21T12:05:00,Schetyna odnosząc się do publikacji portalu On...
8,https://www.rmf24.pl/fakty/polska/news-bil-kam...,kraj,2019-08-21T11:56:07,57-latek z Serocka na Lubelszczyźnie oskarżył ...
9,https://www.rmf24.pl/fakty/polska/news-bialyst...,kraj,2019-08-21T11:04:55,"Policjanci z Białegostoku zatrzymali 20-latka,..."
10,https://www.rmf24.pl/fakty/polska/news-gwaltow...,kraj,2019-08-21T10:47:00,"Jak relacjonuje Frątczak, Straż Pożarna prowad..."
