<a href="https://colab.research.google.com/github/Uchebuzz/wazobia_scrapper/blob/main/Wazobia_scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Scrapper

In [40]:
# Wazobia FM Article Scraper
# =========================

import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
import re
from dataclasses import dataclass, asdict
import pandas as pd
import yaml

print("✅ Libraries imported successfully!")

# ## Load Configuration

def load_config(config_path: str = "config.yml") -> dict:
    """Load configuration from YAML file"""
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except FileNotFoundError:
        print(f"❌ Configuration file {config_path} not found")
        raise
    except yaml.YAMLError as e:
        print(f"❌ Invalid YAML in configuration file: {e}")
        raise

CONFIG = load_config()
print(f"✅ Configuration loaded for {CONFIG['site_config']['name']}")

# ## Article Data Structure

@dataclass
class ArticleData:
    """Data structure for article information"""
    title: str = ""
    content: str = ""
    author: str = ""
    date: str = ""
    url: str = ""
    tags: List[str] = None
    images: List[str] = None
    scraped_at: str = ""
    content_length: int = 0
    category: str = ""

    def __post_init__(self):
        if self.tags is None:
            self.tags = []
        if self.images is None:
            self.images = []
        if not self.scraped_at:
            self.scraped_at = datetime.now().isoformat()
        self.content_length = len(self.content)

        # Extract category from URL
        if self.url and not self.category:
            url_parts = self.url.split('/')
            categories = CONFIG['wazobia_fm_urls']['content_categories']
            for part in url_parts:
                if part in categories:
                    self.category = part
                    break

    def to_dict(self):
        return asdict(self)

# ## Scraper Class

class WazobiaScraper:
    """Wazobia FM article scraper"""

    def __init__(self, config: dict = None):
        self.config = config or CONFIG
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.config['site_config']['user_agent']
        })
        self.articles = []
        self.scraped_urls = set()

    def log_progress(self, message: str):
        print(f"📊 {message}")

    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch and parse a web page"""
        try:
            response = self.session.get(url, timeout=self.config['site_config']['timeout'])
            response.raise_for_status()
            time.sleep(self.config['site_config']['delay_between_requests'])
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            print(f"❌ Error fetching {url}: {e}")
            return None

    def get_nested_value(self, data, path):
        """Get nested value from dictionary using dot notation"""
        keys = path.split('.')
        value = data

        for key in keys:
            if isinstance(value, dict) and key in value:
                value = value[key]
            elif isinstance(value, list) and value and isinstance(value[0], dict) and key in value[0]:
                value = value[0][key]
            else:
                return None
        return value

    def extract_from_json_ld(self, soup: BeautifulSoup) -> Optional[ArticleData]:
        """Extract article data from JSON-LD structured data"""
        json_scripts = soup.find_all('script', {'type': 'application/ld+json'})

        for script in json_scripts:
            try:
                json_content = script.string
                if not json_content:
                    continue

                data = json.loads(json_content)
                if isinstance(data, list):
                    data = data[0] if data else {}

                article_types = self.config['extraction_methods']['json_ld']['article_types']
                if data.get('@type') not in article_types:
                    continue

                article_data = ArticleData()
                field_mapping = self.config['extraction_methods']['json_ld']['field_mapping']

                # Extract title
                for path in field_mapping['title']:
                    title = self.get_nested_value(data, path)
                    if title:
                        article_data.title = self.clean_text(str(title))
                        break

                # Extract content
                for path in field_mapping['content']:
                    content = self.get_nested_value(data, path)
                    if content:
                        article_data.content = self.clean_text(str(content))
                        break

                # Extract author
                for path in field_mapping['author']:
                    author = self.get_nested_value(data, path)
                    if author:
                        if isinstance(author, dict) and 'name' in author:
                            article_data.author = self.clean_text(str(author['name']))
                        else:
                            article_data.author = self.clean_text(str(author))
                        break

                # Extract date
                for path in field_mapping['date']:
                    date = self.get_nested_value(data, path)
                    if date:
                        article_data.date = self.clean_text(str(date))
                        break

                # Extract images
                for path in field_mapping['image']:
                    image = self.get_nested_value(data, path)
                    if image:
                        if isinstance(image, list):
                            article_data.images.extend([str(img) for img in image])
                        else:
                            article_data.images.append(str(image))
                        break

                # Extract tags
                for path in field_mapping['tags']:
                    tags = self.get_nested_value(data, path)
                    if tags:
                        if isinstance(tags, list):
                            article_data.tags.extend([self.clean_text(str(tag)) for tag in tags])
                        elif isinstance(tags, str):
                            article_data.tags.extend([self.clean_text(tag.strip()) for tag in tags.split(',')])
                        break

                if article_data.title and article_data.content:
                    return article_data

            except (json.JSONDecodeError, KeyError, AttributeError):
                continue

        return None

    def find_element_by_selectors(self, soup: BeautifulSoup, selectors: List[str]):
        """Find element using multiple CSS selectors"""
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    return element
            except Exception:
                continue
        return None

    def find_elements_by_selectors(self, soup: BeautifulSoup, selectors: List[str]):
        """Find elements using multiple CSS selectors"""
        elements = []
        for selector in selectors:
            try:
                found = soup.select(selector)
                if found:
                    elements.extend(found)
            except Exception:
                continue
        return elements

    def extract_from_html_selectors(self, soup: BeautifulSoup) -> Optional[ArticleData]:
        """Extract article data using HTML selectors"""
        # Remove unwanted elements
        for selector in self.config['content_filters']['exclude_selectors']:
            for element in soup.select(selector):
                element.decompose()

        article_data = ArticleData()

        # Extract title
        title_element = self.find_element_by_selectors(soup, self.config['article_selectors']['title'])
        if title_element:
            article_data.title = self.clean_text(title_element.get_text())

        # Extract content
        content_elements = self.find_elements_by_selectors(soup, self.config['article_selectors']['content'])
        content_parts = []
        for element in content_elements:
            text = self.clean_text(element.get_text())
            if text and len(text) > 20:
                content_parts.append(text)
        article_data.content = ' '.join(content_parts)

        # Extract author
        author_element = self.find_element_by_selectors(soup, self.config['article_selectors']['author'])
        if author_element:
            article_data.author = self.clean_text(author_element.get_text())

        # Extract date
        date_element = self.find_element_by_selectors(soup, self.config['article_selectors']['date'])
        if date_element:
            article_data.date = self.clean_text(date_element.get_text())

        # Extract tags
        tag_elements = self.find_elements_by_selectors(soup, self.config['article_selectors']['tags'])
        article_data.tags = [self.clean_text(tag.get_text()) for tag in tag_elements if tag.get_text().strip()]

        # Extract images
        img_elements = self.find_elements_by_selectors(soup, self.config['article_selectors']['images'])
        for img in img_elements:
            src = img.get('src')
            if src:
                full_img_url = urljoin(article_data.url or '', src)
                article_data.images.append(full_img_url)

        return article_data if article_data.title or article_data.content else None

    def clean_text(self, text: str) -> str:
        """Clean and normalize text content"""
        if not text:
            return ""

        text = ' '.join(text.split())
        text = text.replace('&nbsp;', ' ').replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\']+', '', text)
        return text.strip()

    def extract_article_links(self, soup: BeautifulSoup) -> List[str]:
        """Extract article links from page"""
        links = []
        link_elements = self.find_elements_by_selectors(soup, self.config['article_selectors']['article_links'])

        for link in link_elements:
            href = link.get('href')
            if href:
                full_url = urljoin(self.config['site_config']['base_url'], href)
                if self.is_valid_article_url(full_url):
                    links.append(full_url)

        return list(set(links))

    def is_valid_article_url(self, url: str) -> bool:
        """Check if URL is a valid article URL"""
        try:
            parsed = urlparse(url)
            base_domain = urlparse(self.config['site_config']['base_url']).netloc

            if parsed.netloc != base_domain:
                return False

            # Skip unwanted patterns
            if any(pattern in url.lower() for pattern in self.config['content_filters']['skip_patterns']):
                return False

            # Only include valid content patterns
            is_valid_content = any(pattern in url.lower() for pattern in self.config['content_filters']['valid_patterns'])

            return is_valid_content and url not in self.scraped_urls
        except:
            return False

    def get_news_sections(self) -> List[str]:
        """Get all news section URLs"""
        base_url = self.config['site_config']['base_url']
        return [urljoin(base_url, section) for section in self.config['wazobia_fm_urls']['news_sections']]

    def extract_article_data(self, url: str) -> Optional[ArticleData]:
        """Extract article data using multiple methods"""
        soup = self.get_page(url)
        if not soup:
            return None

        # Try extraction methods in priority order
        for method in self.config['extraction_methods']['priority']:
            if method == 'json_ld':
                article_data = self.extract_from_json_ld(soup)
                if article_data:
                    print(f"✅ JSON-LD: {article_data.title[:50]}...")
                    break
            elif method == 'html_selectors':
                article_data = self.extract_from_html_selectors(soup)
                if article_data and (article_data.title or len(article_data.content) > 100):
                    print(f"✅ HTML: {article_data.title[:50]}...")
                    break
            elif method == 'fallback':
                title = soup.find('title')
                if title:
                    article_data = ArticleData()
                    article_data.title = self.clean_text(title.get_text())
                    paragraphs = soup.find_all('p')
                    content_parts = [self.clean_text(p.get_text()) for p in paragraphs if len(p.get_text().strip()) > 50]
                    article_data.content = ' '.join(content_parts)
                    if article_data.content:
                        print(f"⚠️  Fallback: {article_data.title[:50]}...")
                        break

        if not article_data:
            return None

        article_data.url = url
        content_length = len(article_data.content)
        min_length = self.config['content_filters']['min_content_length']

        if content_length < min_length:
            print(f"⚠️  Content too short ({content_length} chars), skipping...")
            return None

        return article_data

    def scrape_articles(self, max_articles: int = 20, scrape_all_sections: bool = True) -> List[ArticleData]:
        """Main scraping method"""

        if scrape_all_sections:
            self.log_progress("Scraping from all Wazobia FM news sections...")
            all_links = []

            news_sections = self.get_news_sections()
            for section_url in news_sections[:5]:
                self.log_progress(f"Checking section: {section_url}")
                soup = self.get_page(section_url)
                if soup:
                    section_links = self.extract_article_links(soup)
                    all_links.extend(section_links)
                    self.log_progress(f"Found {len(section_links)} articles in this section")
        else:
            start_url = self.config['site_config']['base_url']
            self.log_progress(f"Starting scrape from {start_url}")
            soup = self.get_page(start_url)
            if not soup:
                print(f"❌ Failed to fetch page: {start_url}")
                return []
            all_links = self.extract_article_links(soup)

        # Remove duplicates and limit
        unique_links = list(set(all_links))
        self.log_progress(f"Found {len(unique_links)} unique article links")

        if len(unique_links) > max_articles:
            unique_links = unique_links[:max_articles]

        # Scrape individual articles
        scraped_count = 0
        for i, link in enumerate(unique_links, 1):
            if scraped_count >= max_articles:
                break

            if link in self.scraped_urls:
                continue

            self.log_progress(f"Scraping article {i}/{len(unique_links)}")

            article_data = self.extract_article_data(link)
            if article_data and article_data.title:
                self.articles.append(article_data)
                self.scraped_urls.add(link)
                scraped_count += 1
            else:
                print(f"⚠️  Skipped: No valid content found")

        self.log_progress(f"Scraping completed! Successfully scraped {scraped_count} articles")
        return self.articles

    def get_dataframe(self) -> pd.DataFrame:
        """Convert articles to DataFrame"""
        if not self.articles:
            return pd.DataFrame()

        data = [article.to_dict() for article in self.articles]
        df = pd.DataFrame(data)
        df['scraped_at'] = pd.to_datetime(df['scraped_at'])
        return df

    def save_to_json(self, filename: str = None) -> str:
        """Save articles to JSON file"""
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"wazobia_fm_articles_{timestamp}.json"

        data = [article.to_dict() for article in self.articles]

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)

        print(f"✅ Data exported to {filename}")
        return filename

    def save_to_csv(self, filename: str = None) -> str:
        """Save articles to CSV file"""
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"wazobia_fm_articles_{timestamp}.csv"

        df = self.get_dataframe()

        # Convert list columns to strings
        df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if x else '')
        df['images'] = df['images'].apply(lambda x: ', '.join(x) if x else '')

        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"✅ Data exported to {filename}")
        return filename

print("✅ WazobiaScraper class ready!")


# ## Ready to Use!
print("\n🎉 Wazobia FM Scraper Ready!")
print("📖 Usage:")
print("   scraper = scrape_wazobia_fm(max_articles=20)")
print("   scraper.save_to_json()")
print("   scraper.save_to_csv()")

✅ Libraries imported successfully!
✅ Configuration loaded for Wazobia FM
✅ WazobiaScraper class ready!

🎉 Wazobia FM Scraper Ready!
📖 Usage:
   scraper = scrape_wazobia_fm(max_articles=20)
   scraper.save_to_json()
   scraper.save_to_csv()


In [None]:
#main_funtion

In [41]:
def scrape_wazobia_fm(max_articles=15):
    """Scrape articles from Wazobia FM"""
    print("📻 Starting Wazobia FM Scraping")
    print("=" * 50)

    scraper = WazobiaScraper()
    articles = scraper.scrape_articles(max_articles=max_articles)

    if articles:
        print(f"\n📋 Successfully scraped {len(articles)} articles")

        # Show sample articles
        for i, article in enumerate(articles[:3], 1):
            print(f"\n{i}. {article.title}")
            print(f"   Author: {article.author or 'Not specified'}")
            print(f"   Date: {article.date or 'Not specified'}")
            print(f"   Category: {article.category or 'General'}")
            print(f"   Content: {article.content[:150]}...")

        csv_file = scraper.save_to_csv()
        print(f"\n💾 Articles automatically saved to: {csv_file}")

        return scraper
    else:
        print("❌ No articles found")
        return None


In [None]:
#call the function

In [51]:
 scrape_wazobia_fm( max_articles= 10)

📻 Starting Wazobia FM Scraping
📊 Scraping from all Wazobia FM news sections...
📊 Checking section: https://www.wazobiafm.com/lagos/news/
📊 Found 28 articles in this section
📊 Checking section: https://www.wazobiafm.com/abuja/news/
📊 Found 28 articles in this section
📊 Checking section: https://www.wazobiafm.com/kano/news/
📊 Found 28 articles in this section
📊 Checking section: https://www.wazobiafm.com/onitsha/news/
📊 Found 28 articles in this section
📊 Checking section: https://www.wazobiafm.com/port-harcourt/news/
📊 Found 28 articles in this section
📊 Found 117 unique article links
📊 Scraping article 1/10
✅ JSON-LD: PWAN Vs. Scott Iguma: Influencer Faces Trial Over ...
📊 Scraping article 2/10
✅ JSON-LD: Chelsea Beat PSG 30 to Win Club World Cup...
📊 Scraping article 3/10
✅ JSON-LD: Legendary Super Eagles Goalkeeper, Peter Rufai, Do...
📊 Scraping article 4/10
✅ HTML: Fact Check...
📊 Scraping article 5/10
✅ HTML: News...
📊 Scraping article 6/10
✅ JSON-LD: Yusuf Buhari Thank Nigerians f

<__main__.WazobiaScraper at 0x7b396c4f6790>

#Clean the data
The data that is gotten from Wazobia is in a Jsonld, so its difficult to see the content. Here you just apply the function to the column

In [52]:
import pandas as pd
df = pd.read_csv("/content/wazobia_fm_articles_20250723_140635.csv")


In [53]:
df

Unnamed: 0,title,content,author,date,url,tags,images,scraped_at,content_length,category
0,PWAN Vs. Scott Iguma: Influencer Faces Trial O...,"pp data-end""438"" data-start""216""Iguma begin tr...",Emeka Ezem,2025-07-17T14:59:1700:00,https://www.wazobiafm.com/onitsha/news/hottori...,,https://mmo.aiircdn.com/370/68790fbc02649.jpg,2025-07-23 14:06:05.204906,0,
1,Chelsea Beat PSG 30 to Win Club World Cup,"pp data-end""387"" data-start""302""Di Premier Lea...",Sheriff Quadry,2025-07-14T12:24:2100:00,https://www.wazobiafm.com/onitsha/news/footbal...,,https://mmo.aiircdn.com/370/6874f6cca3f75.jpg,2025-07-23 14:06:08.553973,0,
2,"Legendary Super Eagles Goalkeeper, Peter Rufai...","pp data-end""214"" data-start""152""The Nigeria Fo...",Emeka Ezem,2025-07-03T19:33:1200:00,https://www.wazobiafm.com/onitsha/news/footbal...,,https://mmo.aiircdn.com/370/6866daefb5915.jpg,2025-07-23 14:06:11.328019,0,
3,Fact Check,Claim: Chris Ngige bin advise APC make dem no ...,,,https://www.wazobiafm.com/abuja/news/fact-check/,,"https://mmo.aiircdn.com/370/60e424d614caa.png,...",2025-07-23 14:06:14.313269,0,
4,News,Claim: Chris Ngige bin advise APC make dem no ...,,,https://www.wazobiafm.com/abuja/news/,,"https://mmo.aiircdn.com/370/60e424d614caa.png,...",2025-07-23 14:06:16.398214,0,
5,Yusuf Buhari Thank Nigerians for FEC Meeting a...,"pp data-end""677"" data-start""481""The meeting we...",Emeka Ezem,2025-07-17T20:15:5100:00,https://www.wazobiafm.com/abuja/news/hottori/y...,,https://mmo.aiircdn.com/370/6879599bb42e7.jpg,2025-07-23 14:06:19.683471,0,
6,House of Reps Wan Pass Bill Wey Go Stop Public...,"pp data-end""119"" data-start""0""The bill wey Rep...",Emeka Ezem,2025-07-23T11:16:1500:00,https://www.wazobiafm.com/abuja/news/hottori/h...,,https://mmo.aiircdn.com/370/6880c4657ba7c.jpg,2025-07-23 14:06:23.932016,0,
7,Liverpool player Diogo Jota Don Die for Car Ac...,"pspan style""font-size:12pt""span style""font-fam...",Emeka Ezem,2025-07-03T18:18:4200:00,https://www.wazobiafm.com/kano/news/footballre...,,https://mmo.aiircdn.com/370/6866c9742cd2b.jpg,2025-07-23 14:06:27.288200,0,


In [54]:
df['content']

Unnamed: 0,content
0,"pp data-end""438"" data-start""216""Iguma begin tr..."
1,"pp data-end""387"" data-start""302""Di Premier Lea..."
2,"pp data-end""214"" data-start""152""The Nigeria Fo..."
3,Claim: Chris Ngige bin advise APC make dem no ...
4,Claim: Chris Ngige bin advise APC make dem no ...
5,"pp data-end""677"" data-start""481""The meeting we..."
6,"pp data-end""119"" data-start""0""The bill wey Rep..."
7,"pspan style""font-size:12pt""span style""font-fam..."


In [55]:
def clean_content(text):
    if pd.isnull(text):
        return ""

    # Remove all data-start and data-end tags
    text = re.sub(r'data-(start|end)"\d+"', '', text)

    # Remove all standalone "p" or "pp" tags
    text = re.sub(r'\bpp?\b', '', text)

    # Replace common HTML character entities and malformed characters
    html_replacements = {
        'ndash;': '–',
        'mdash;': '—',
        'Joatilde;o': 'João',
        'Saacute;nchez': 'Sánchez',
    }
    for code, replacement in html_replacements.items():
        text = text.replace(code, replacement)

    # Clean extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [56]:
df['content_clean'] = df['content'].apply(clean_content)

In [58]:
clean_data = df['content_clean']

In [59]:
clean_data

Unnamed: 0,content_clean
0,Iguma begin trend on social media since strong...
1,Di Premier League team start di match well and...
2,The Nigeria Football Federation (NFF) confirme...
3,Claim: Chris Ngige bin advise APC make dem no ...
4,Claim: Chris Ngige bin advise APC make dem no ...
5,The meeting wey happen for Aso Rock on Wednesd...
6,The bill wey Rep. Amobi Ogah (LP–Abia) sponsor...
7,"pspan style""font-size:12pt""span style""font-fam..."
