## WIKINEWS SCRAPER

In [None]:
import numpy as np
import pandas as pd

import datefinder
import requests

from bs4 import BeautifulSoup as bs

In [None]:
def create_soup(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze the whole scrape.

    Returns:
        A BeautifulSoup tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    site = requests.get(url, timeout=timeout)
    return bs(site.text, "html.parser")

# Parsed first page of the category listing; the crawl starts from here.
soup = create_soup(
    "https://en.wikinews.org/wiki/Category:Politics_and_conflicts"
)

# Initial letters of the article titles this notebook keeps. The 23-letter
# alphabet below (it has no Q, X or Y) is doubled so that a 10-letter window
# starting near its end wraps around to the beginning.
_alphabet = "ABCDEFGHIJKLMNOPRSTUVWZ"
_offset = 21 % len(_alphabet)
assigned_letters = (_alphabet * 2)[_offset:_offset + 10]

In [None]:
def valid_links(soup):
    """Collect article URLs from one category listing page.

    Only anchors inside the ``mw-pages`` / ``mw-category`` listing whose
    ``title`` attribute starts with one of the module-level
    ``assigned_letters`` are kept.

    Args:
        soup: Parsed category page.

    Returns:
        A list of absolute ``https://en.wikinews.org`` URLs. The list is
        empty when the page has no listing section.
    """
    def is_valid(tag):
        if tag.name != 'a':
            return False
        title = tag.get('title')
        # Anchors with a missing or empty title cannot be matched to a
        # letter; indexing them would raise TypeError / IndexError.
        return bool(title) and title[0] in assigned_letters

    section = soup.find('div', {'id': "mw-pages"})
    if section is None:
        return []
    listing = section.find('div', {'class': "mw-category mw-category-columns"})
    if listing is None:
        return []

    # Skip anchors without an href: there is nothing to build a URL from.
    return [
        "https://en.wikinews.org" + link.get('href')
        for link in listing.find_all(is_valid)
        if link.get('href')
    ]

def next_page_url(soup):
    """Return the absolute URL behind the page's "next page" anchor.

    The anchor is looked up by its exact text; callers are expected to
    check ``no_more_pages`` first, since a page without that anchor makes
    this function raise AttributeError.
    """
    anchor = soup.find('a', string="next page")
    relative_path = anchor.get('href')
    return "https://en.wikinews.org" + relative_path
    
def no_more_pages(soup):
    """Return True when the page contains no anchor labelled "next page"."""
    next_anchor = soup.find('a', string="next page")
    return next_anchor is None

def get_links(soup, links):
    """Gather article links from a category page and all pages after it.

    Follows the "next page" anchor until a page no longer has one. This is
    done with a loop rather than one recursive call per page, so the number
    of listing pages is not bounded by the interpreter's recursion limit.

    Args:
        soup: Parsed first category page to start from.
        links: List to extend; it is modified in place.

    Returns:
        The same ``links`` list, extended with every matching URL found.
    """
    while True:
        links += valid_links(soup)
        if no_more_pages(soup):
            return links
        soup = create_soup(next_page_url(soup))

In [None]:
# Walk every listing page starting from the first one, accumulating into a
# fresh list, then show how many article links were collected.
links = get_links(soup, links=[])
len(links)

In [None]:
def content_parser(soup):
    """Extract the article body text and its publication date.

    Walks the ``<p>`` and ``<dl>`` elements of the article body in document
    order and stops at the first one whose text starts with a known
    boilerplate phrase.

    Args:
        soup: Parsed article page.

    Returns:
        A ``(content, publish_date)`` tuple of strings. ``publish_date`` is
        the ``title`` attribute of the span inside the
        ``<strong class="published">`` element, or ``""`` if none is found.
        Both are ``""`` when the page has no article body container.
    """
    terminators = ('Have an opinion on this story', 'Share this', 'This article has passed through community review',
        'This page is archived')

    body = soup.find('div', {'id': "bodyContent"})
    article = None if body is None else body.find('div', {'class': "mw-parser-output"})
    if article is None:
        return "", ""

    publish_date = ""
    pieces = []

    for tag in article.find_all(['p', 'dl']):
        datetag = tag.find('strong', {'class': "published"})
        text = tag.text

        if text.startswith(terminators):
            break

        if datetag is None:
            pieces.append(text + "\n")
            continue

        span = datetag.span
        if span is not None:
            publish_date = span.get('title', publish_date)
        # Drop the human-readable date from this paragraph only. Slicing the
        # accumulated content instead would eat earlier text whenever the
        # dateline is not the very first paragraph.
        pieces.append(text.replace(datetag.text, "", 1))

    return "".join(pieces), publish_date

def footer_parser(soup):
    """Return the page's last-modified date as ``YYYY-MM-DD``.

    The date is read from the ``footer-info-lastmod`` list item; the first
    date that ``datefinder`` recognises in its text is used. Returns ``""``
    when the item is missing or contains no recognisable date.
    """
    footer_item = soup.find('li', {'id': "footer-info-lastmod"})
    if footer_item is None:
        return ""

    found_dates = list(datefinder.find_dates(footer_item.text))
    if not found_dates:
        return ""

    return found_dates[0].strftime('%Y-%m-%d')

def source_parser(soup):
    """Return the italicised text of every ``sourceTemplate`` span.

    For each ``<span class="sourceTemplate">`` the first ``<i>`` element is
    taken; spans without one are skipped.
    """
    italics = (
        span.find('i')
        for span in soup.find_all('span', {'class': "sourceTemplate"})
    )
    return [node.text for node in italics if node is not None]

def categories_parser(soup):
    """Return the article's categories, minus the ones common to all pages.

    Args:
        soup: Parsed article page.

    Returns:
        A list of category names taken from the ``mw-normal-catlinks``
        block. Entries in the exclusion list are dropped, and so is the
        first remaining entry if ``datefinder`` finds a date in it. The
        list is empty when the page has no category block or nothing is
        left after filtering.
    """
    # Remove common categories
    excluded = ['', 'Published', 'Archived', 'Politics and conflicts']

    catlinks = soup.find('div', {'id': "mw-normal-catlinks"})
    if catlinks is None:
        return []

    categories = [
        item.text for item in catlinks.find_all('li')
        if item.text not in excluded
    ]

    # Remove date category. The emptiness check matters: indexing [0] on an
    # article whose categories were all excluded would raise IndexError.
    if categories and list(datefinder.find_dates(categories[0])):
        categories = categories[1:]

    return categories
    

def scrape(url):
    """Fetch one article and extract its fields.

    Returns:
        A tuple ``(title, publish_date, modified_date, sources, categories,
        content)`` where the title is the text of the ``firstHeading``
        element and the rest comes from the dedicated parser functions.
    """
    page = create_soup(url)

    heading = page.find('h1', {'id': "firstHeading"})
    title = heading.text

    body_text, published = content_parser(page)

    return (
        title,
        published,
        footer_parser(page),
        source_parser(page),
        categories_parser(page),
        body_text,
    )

In [None]:
#scraped_data = pd.DataFrame(map(scrape, links), columns = ['title', 'publish_date', 'modified_date', 'sources', 'categories', 'content'])

In [None]:
#scraped_data.to_csv('wikinews.csv', encoding='utf-8')