In [70]:
import os
import json
import re
import string
from pprint import pprint
from pathlib import Path
from collections import defaultdict

In [93]:
# Root folder of the scraped data; each news source has its own sub-folder.
data_root = os.path.join(*("naija_highlights", "data", "bronze"))
punch_root, sun_root = (
    os.path.join(data_root, source_dir)
    for source_dir in ("punchng", "Sunnewsonline")
)

def get_data_paths(scraped_data_root):
    """Index the files of every directory under a root that holds ``items.json``.

    Args:
        scraped_data_root: Directory that is walked recursively.

    Returns:
        A ``defaultdict(list)`` mapping a directory's own name (for example
        ``"day=20"``) to the paths of all files in that directory, sorted by
        file name. Directories without an ``items.json`` are ignored.
        Directories that share a name are merged under one key, in sorted
        traversal order.
    """
    data_paths = defaultdict(list)
    for root, dirs, files in os.walk(scraped_data_root):
        # Sorting in place makes os.walk descend in a fixed order instead of
        # whatever order the file system happens to report.
        dirs.sort()
        if "items.json" not in files:
            continue
        # .name rather than .stem: .stem would cut "batch.v2" down to "batch".
        key = Path(root).name
        data_paths[key].extend(os.path.join(root, f) for f in sorted(files))
    return data_paths


def read_data(path):
    """Read a JSON Lines file (one JSON document per line) into a list.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded documents, in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: without it open() uses the locale's encoding, so the
    # same file can decode differently (or fail) from one machine to another.
    with open(path, "r", encoding="utf-8") as f:
        # json.loads rejects empty input, so blank lines are filtered out.
        return [json.loads(line) for line in f if line.strip()]

In [250]:
# Index the scraped files of each news source; get_data_paths keys them by
# the name of the folder that holds them.
punch_data_paths = get_data_paths(punch_root)
sun_data_paths = get_data_paths(sun_root)
# Bare last expression: shows the Sun index as the cell output.
sun_data_paths

defaultdict(list,
            {'day=20': ['naija_highlights\\data\\bronze\\Sunnewsonline\\year=2023\\month=3\\day=20\\items.json'],
             'day=21': ['naija_highlights\\data\\bronze\\Sunnewsonline\\year=2023\\month=3\\day=21\\items.json'],
             'day=22': ['naija_highlights\\data\\bronze\\Sunnewsonline\\year=2023\\month=3\\day=22\\items.json'],
             'day=27': ['naija_highlights\\data\\bronze\\Sunnewsonline\\year=2023\\month=3\\day=27\\items.json'],
             'day=28': ['naija_highlights\\data\\bronze\\Sunnewsonline\\year=2023\\month=3\\day=28\\items.json'],
             'day=29': ['naija_highlights\\data\\bronze\\Sunnewsonline\\year=2023\\month=3\\day=29\\items.json']})

## Punch

In [254]:
# Inspect one Punch article as stored on disk: item `sample_number` of the
# first file listed under the "day=29" key.
sample_number = 3
sample_data = read_data(punch_data_paths["day=29"][0])[sample_number]
sample_data

{'weblink': 'https://punchng.com/nysc-trust-fund-will-improve-staff-corpers-welfare-minister/',
 'title': ' NYSC trust fund will improve staff, corpers’ welfare – Minister ',
 'postdate': [29, 3, 2023],
 'thumbnaillink': 'https://cdn.punchng.com/wp-content/uploads/2021/10/22112508/SUNDAY-DARE-2.jpg',
 'author': 'Nathaniel Shaibu',
 'body': ['The Minister for Youth and Sports Development Sunday Dare has disclosed that a reform bill which has been submitted to the President Major General Muhammadu Buhari retd. for assent will see the injection of N14bn into the National Youth Service Corps.',
  'The minister on Tuesday made this known during his welcome address at the opening ceremony of the 2023 Annual Management Conference held in Abuja',
  'The theme of the conference was “Optimising the operations of the NYSC at 50.”',
  'Dare who congratulated the management of the NYSC ahead of the scheme’s 50th anniversary said discussions on reforms had to top the agenda of the conference adding 

## Sun Paper

In [228]:
# Characters clean_words deletes: the non-breaking space, the newline and all
# ASCII punctuation except the sentence marks . ? - !
_CLEAN_WORDS_DROP_TABLE = str.maketrans(
    "", "", "\xa0\n" + "".join(ch for ch in string.punctuation if ch not in ".?-!")
)


def clean_words(words):
    """Normalise a piece of scraped text.

    ``&amp`` is replaced by ``and``; non-breaking spaces, newlines and every
    character of ``string.punctuation`` other than ``. ? - !`` are deleted;
    finally leading and trailing spaces are stripped.

    Args:
        words: The text to clean.

    Returns:
        The cleaned text.
    """
    words = words.replace("&amp", "and")
    # A translation table instead of a regex character class: the class was
    # built from unescaped punctuation, so its "\]" pair was read as an
    # escaped bracket and backslashes were never removed.
    return words.translate(_CLEAN_WORDS_DROP_TABLE).strip(" ")

def clean_html(words):
    """Remove HTML tags from a string, keeping the text between them.

    Args:
        words: Text that may contain ``<...>`` tags.

    Returns:
        The text with every ``<...>`` span deleted.
    """
    # [^>]* instead of .*? so that a tag wrapped over several lines is removed
    # as well ("." does not match a newline).
    return re.sub(r"<[^>]*>", "", words)

def preprocess_author_and_body(body):
    """Split scraped body lines into cleaned paragraphs and the author's name.

    The byline is the first line whose tag-free text in front of any ``<em>``
    contains ``"By"`` or ``"From"`` (checked in that order). The author is the
    cleaned text that follows the last occurrence of that keyword. The byline
    is left out of the returned body; every other line is stripped of HTML,
    cleaned with ``clean_words`` and kept, including later lines that happen
    to contain one of the keywords.

    Args:
        body: Iterable of raw body strings. It is not modified. ``None`` or an
            empty iterable yields an empty body.

    Returns:
        A ``(body_content, author)`` tuple: the list of cleaned lines and the
        author's name, which is ``"Anonymous"`` when no byline was found or
        the byline named nobody.
    """
    key_words = ["By", "From"]
    body_content = []
    author = None

    # Iterate without popping: removing items from the list being iterated
    # skipped the paragraph after the byline and altered the caller's list.
    for line in body or []:
        if author is None:
            # Only the text ahead of the first <em> is searched for a byline.
            byline = clean_html(line.split("<em>")[0])
            # NOTE(review): plain substring match, so a word such as "Byron"
            # also counts as a keyword; confirm against the scraped data.
            kw = next((k for k in key_words if k in byline), None)
            if kw is not None:
                author = clean_words(byline.split(kw)[-1])
                continue
        body_content.append(clean_words(clean_html(line)))

    return body_content, author or "Anonymous"


In [248]:
# Inspect one Sun article as stored on disk: item `sample_number` of the
# first file listed under the "day=27" key.
# NOTE(review): rebinds `sample_number` and `sample_data`, which the Punch
# cell also sets, so their values depend on which cell ran last.
sample_number = 8
sample_data = read_data(sun_data_paths["day=27"][0])[sample_number]
sample_data

{'weblink': 'https://sunnewsonline.com/apc-professionals-council-warns-opposition-against-unsavoury-inciting-utterances-2/',
 'title': 'APC Professionals Council warns opposition against unsavoury, inciting utterances',
 'postdate': [27, 3, 2023],
 'thumbnaillink': 'https://assets.sunnewsonline.com/wp-content/uploads/2023/03/APC.jpg',
 'author': 'Lukman Olabiyi',
 'body': ['The council in a statement signed by National Director General Hon. Seyi Bamigbade said the inflammatory comments of the opposition who are threatening to disrupt swearing-in of the president-elect Bola Tinubu on May 29 could be a recipe to stir crisis.',
  'The council appealed to parties who are aggrieved about the Feb. 25 presidential and National Assembly elections to utilize legal means to seek redress rather than heating up the polity with unguarded utterances.',
  '“We have watched with utmost concern how the opposition parties particularly the Labour Party and the PDP have consistently made efforts to underm