In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

# Import os module
import os

# Folder (relative to the working directory) that receives downloaded images
media_folder = 'media'

# Create the folder up front; an already existing folder is left untouched
os.makedirs(name=media_folder, exist_ok=True)

# # Function to generate a valid filename from the title
# def generate_valid_filename(title):
#     # Remove invalid characters
#     valid_filename = re.sub(r'[\\/*?:"<>|]', "", title)
#     # Limit the filename length and add extension
#     return valid_filename[:100] + '.jpg'

# Function to download image
def download_img(image_url, local_image_path, timeout=30):
    """Download ``image_url`` and write the raw bytes to ``local_image_path``.

    The download is best-effort: network, HTTP and file-system failures are
    reported on stdout and signalled through the return value instead of
    being raised, so a single bad image never stops a scraping run.

    Args:
        image_url: URL of the image. An empty value or ``None`` is skipped
            without issuing a request.
        local_image_path: Destination file path, opened in binary write mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled host.

    Returns:
        ``True`` when the file was written, ``False`` otherwise.
    """
    if not image_url:
        print("Error downloading image: no image URL provided")
        return False

    try:
        response = requests.get(image_url, timeout=timeout)
        # Reject 4xx/5xx answers so an HTML error page is never saved as an image.
        response.raise_for_status()
        # The file is opened only after a successful download, so a failed
        # request does not leave an empty file behind.
        with open(local_image_path, 'wb') as handler:
            handler.write(response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading image: {e}")
        return False
    return True

# Define your custom function
def save_to_dataframe(title, link, image_link, local_image_path, current_time, source, df):
    try:
        recent = pd.DataFrame({
            'title': [title],
            'link': [link],
            'image_link': [image_link],
            'local_image_path': [local_image_path],
            'source': [source],
            'time_stamp': [current_time]
        })
        df = pd.concat([df, recent], ignore_index=True)
        
        # Saving image to local storage
        download_img(image_link, local_image_path)

        # Optionally save the DataFrame to a CSV file
        # df.to_csv('ominiscopia_db.csv', index=False)
        return df
    except Exception as e:
        print(f"Error saving to DataFrame: {e}")
        return df

# Start with a zero-row DataFrame that already declares every result column
df = pd.DataFrame(
    columns=[
        'title',
        'link',
        'image_link',
        'local_image_path',
        'source',
        'time_stamp',
    ],
)

# Scraping Daily Trust
try:
    url = 'https://dailytrust.com/topics/news/'
    # A timeout keeps one unresponsive site from hanging the whole run.
    page = requests.get(url, timeout=30)
    # Report 4xx/5xx responses as errors instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    content = soup.find_all('li', class_='list_list')
    print(f"Daily Trust articles found: {len(content)}")
    source = 'dailytrust'

    for insight in content:
        # Each item is parsed in isolation so that one malformed entry does
        # not discard the remaining articles of this source.
        try:
            title = insight.a.text.strip()
            link = insight.a['href']
        except (AttributeError, KeyError, TypeError) as e:
            print(f"Skipping malformed Daily Trust article: {e}")
            continue

        # The image URL is read from data-src; an item without it is still
        # recorded, with an empty image link.
        img_tag = insight.find('img')
        image_link = img_tag.get('data-src', '') if img_tag is not None else ''

        current_time = pd.Timestamp.now()
        filename = title
        local_image_path = os.path.join(media_folder, filename)

        df = save_to_dataframe(title, link, image_link, local_image_path, current_time, source, df)
except Exception as e:
    # Boundary handler: a failure here must not stop the other sources.
    print(f"Error scraping Daily Trust: {e}")

# Scraping Vanguard
try:
    url = 'https://www.vanguardngr.com/latest-news/'
    # A timeout keeps one unresponsive site from hanging the whole run.
    page = requests.get(url, timeout=30)
    # Report 4xx/5xx responses as errors instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    articles = soup.find_all('article', class_='entry entry-list-card entry-card-larger')
    print(f"Vanguard articles found: {len(articles)}")
    source = 'vanguardngr'

    for article in articles:
        img_tag = article.find('img')
        if img_tag is None:
            # Cards without an image carry neither title nor picture here.
            continue

        # The headline is taken from the image's alt text. A missing alt
        # becomes '' rather than None, which os.path.join would reject and
        # thereby abort every remaining article.
        title = (img_tag.get('alt') or '').strip()
        image_link = img_tag.get('src') or ''
        link_tag = article.find('a', href=True)
        link = link_tag['href'] if link_tag else ''
        current_time = pd.Timestamp.now()
        filename = title
        local_image_path = os.path.join(media_folder, filename)

        df = save_to_dataframe(title, link, image_link, local_image_path, current_time, source, df)
except Exception as e:
    # Boundary handler: a failure here must not stop the other sources.
    print(f"Error scraping Vanguard: {e}")

# Scraping ThisDayLive
try:
    url = 'https://www.thisdaylive.com/index.php/category/nigeria/'
    # A timeout keeps one unresponsive site from hanging the whole run.
    page = requests.get(url, timeout=30)
    # Report 4xx/5xx responses as errors instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    key = soup.find_all('div', class_='col-xs-12 col-md-4')
    print(f"ThisDayLive sections found: {len(key)}")
    source = 'thisdaylive'

    for k in key:
        articles = k.find_all('article', class_='typography horizontal')
        for a in articles:
            # Each article is parsed in isolation so that one malformed
            # entry does not discard the remaining sections.
            try:
                title = a.h2.a.text.strip()
                link = a.h2.a['href']
            except (AttributeError, KeyError, TypeError) as e:
                print(f"Skipping malformed ThisDayLive article: {e}")
                continue

            # An article without an image is still recorded, with an empty
            # image link.
            img_tag = a.find('img')
            image = img_tag.get('src', '') if img_tag is not None else ''

            current_time = pd.Timestamp.now()
            filename = title
            local_image_path = os.path.join(media_folder, filename)

            df = save_to_dataframe(title, link, image, local_image_path, current_time, source, df)
except Exception as e:
    # Boundary handler: a failure here must not stop the other sources.
    print(f"Error scraping ThisDayLive: {e}")

# Scraping Punch
try:
    url = 'https://punchng.com/all-posts/'
    # A timeout keeps one unresponsive site from hanging the whole run.
    page = requests.get(url, timeout=30)
    # Report 4xx/5xx responses as errors instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    news = soup.find_all('article')
    # Same progress line the other sources print.
    print(f"Punch articles found: {len(news)}")
    source = 'punchng'

    # The first <article> is skipped — presumably a featured item with
    # different markup; TODO confirm against the live page.
    for n in news[1:]:
        # Each article is parsed in isolation so that one malformed entry
        # does not discard the remaining ones.
        try:
            title = n.h1.a.text.strip()
            link = n.h1.a['href']
        except (AttributeError, KeyError, TypeError) as e:
            print(f"Skipping malformed Punch article: {e}")
            continue

        img_tag = n.find('img')
        image_link = img_tag.get('data-src', '') if img_tag is not None else ''
        current_time = pd.Timestamp.now()
        filename = title
        local_image_path = os.path.join(media_folder, filename)

        df = save_to_dataframe(title, link, image_link, local_image_path, current_time, source, df)
except Exception as e:
    # Boundary handler: a failure here must not stop the other sources.
    print(f"Error scraping Punch: {e}")

# Scraping Tribune
try:
    url = 'https://tribuneonlineng.com/category/latest-news/'
    # A timeout keeps one unresponsive site from hanging the whole run.
    page = requests.get(url, timeout=30)
    # Report 4xx/5xx responses as errors instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    nt = soup.find('div', class_='jeg_posts jeg_load_more_flag')
    if nt:
        articles = nt.find_all('article')
        print(f"Tribune articles found: {len(articles)}")
        source = 'tribuneonlineng'

        for article in articles:
            # Each article is parsed in isolation so that one malformed
            # entry does not discard the remaining ones.
            try:
                news_link = article.div.a['href']
                news_title = article.h3.a.text.strip()
            except (AttributeError, KeyError, TypeError) as e:
                print(f"Skipping malformed Tribune article: {e}")
                continue

            img_tag = article.find('img')
            image_link = img_tag.get('data-src', '') if img_tag is not None else ''
            current_time = pd.Timestamp.now()
            filename = news_title
            local_image_path = os.path.join(media_folder, filename)

            df = save_to_dataframe(news_title, news_link, image_link, local_image_path, current_time, source, df)
    else:
        print('No articles found in Tribune')
except Exception as e:
    # Boundary handler: a failure here must not stop the remaining cell.
    print(f"Error scraping Tribune: {e}")

# Renumber the rows 0..n-1 (dropping the old index) and display the result
df = df.reset_index(drop=True)
print(df)


Daily Trust articles found: 5


  df = pd.concat([df, recent], ignore_index=True)


Vanguard articles found: 2
ThisDayLive sections found: 57
Error downloading image: [Errno 22] Invalid argument: 'media\\VIDEO: Solidarity in action: Police hand out water to protesters in Lagos'
Tribune articles found: 10
                                                 title  \
0    Nigeria needs independence from state capture ...   
1    64th Independence: Makinde expands small scale...   
2                   Oct 1: Kano residents shun protest   
3                   Police teargas protesters in Abuja   
4    El-Rufai: I’m ready to swear with Quran I didn...   
..                                                 ...   
98   Nigeria@64: Gov Ododo, Bago call for unity, peace   
99   ‘We must evolve’: Natasha calls for sociopolit...   
100  Bodija market traders resolve lingering leader...   
101  Nigeria@64: See peace, unity as collective res...   
102  Keep believing, better days ahead, Tinubu assu...   

                                                  link  \
0    https://dailytrust

In [3]:
df

Unnamed: 0,title,link,image_link,local_image_path,source,time_stamp
0,Nigeria needs independence from state capture ...,https://dailytrust.com/nigeria-needs-independe...,https://dailytrust.com/wp-content/uploads/2023...,media\Nigeria needs independence from state ca...,dailytrust,2024-10-02 00:35:49.875486
1,64th Independence: Makinde expands small scale...,https://dailytrust.com/64th-independence-makin...,https://dailytrust.com/wp-content/uploads/2024...,media\64th Independence: Makinde expands small...,dailytrust,2024-10-02 00:35:51.640544
2,Oct 1: Kano residents shun protest,https://dailytrust.com/oct-1-kano-residents-sh...,https://dailytrust.com/wp-content/uploads/2024...,media\Oct 1: Kano residents shun protest,dailytrust,2024-10-02 00:35:53.342693
3,Police teargas protesters in Abuja,https://dailytrust.com/police-teargas-proteste...,https://dailytrust.com/wp-content/uploads/2022...,media\Police teargas protesters in Abuja,dailytrust,2024-10-02 00:35:55.043632
4,El-Rufai: I’m ready to swear with Quran I didn...,https://dailytrust.com/el-rufai-im-ready-to-sw...,https://dailytrust.com/wp-content/uploads/2024...,media\El-Rufai: I’m ready to swear with Quran ...,dailytrust,2024-10-02 00:35:56.504191
...,...,...,...,...,...,...
98,"Nigeria@64: Gov Ododo, Bago call for unity, peace",https://tribuneonlineng.com/nigeria64-gov-odod...,https://tribuneonlineng.com/wp-content/uploads...,"media\Nigeria@64: Gov Ododo, Bago call for uni...",tribuneonlineng,2024-10-02 00:38:45.825351
99,‘We must evolve’: Natasha calls for sociopolit...,https://tribuneonlineng.com/we-must-evolve-nat...,https://tribuneonlineng.com/wp-content/uploads...,media\‘We must evolve’: Natasha calls for soci...,tribuneonlineng,2024-10-02 00:38:47.067346
100,Bodija market traders resolve lingering leader...,https://tribuneonlineng.com/bodija-market-trad...,https://tribuneonlineng.com/wp-content/uploads...,media\Bodija market traders resolve lingering ...,tribuneonlineng,2024-10-02 00:38:48.491530
101,"Nigeria@64: See peace, unity as collective res...",https://tribuneonlineng.com/nigeria64-see-peac...,https://tribuneonlineng.com/wp-content/uploads...,"media\Nigeria@64: See peace, unity as collecti...",tribuneonlineng,2024-10-02 00:38:50.115774
