## Build a News ETL Data Pipeline Using Python and SQLite

### Task 1: Import Libraries and Connect to the News API

In [4]:
import logging
import os
import sqlite3

import newsapi
import pandas as pd
from newsapi import NewsApiClient

In [5]:
# Prefer a key supplied through the NEWS_API_KEY environment variable so the
# secret does not have to live in the notebook. The literal is kept only as a
# backward-compatible fallback: it is exposed in this file, so it should be
# rotated and the fallback removed once the environment variable is in place.
news_api = NewsApiClient(
    os.environ.get("NEWS_API_KEY", "2acc3d3e79894baf9b48cc843c0de778")
)

### Task 2: Retrieve and Print News Articles

In [6]:
def extract_news_data(query="AI", language="en", sort_by="publishedAt"):
    """Fetch articles from the News API through the module-level ``news_api`` client.

    Args:
        query: Search phrase forwarded as ``q``. Defaults to ``"AI"``.
        language: Value forwarded as ``language``. Defaults to ``"en"``.
        sort_by: Value forwarded as ``sort_by``. Defaults to ``"publishedAt"``.

    Returns:
        The value stored under the ``"articles"`` key of the response, or
        ``None`` if the request (or the key lookup) raised an exception.

    Note:
        The success message is logged at INFO level, so it is only visible
        when logging has been configured to emit INFO records.
    """
    try:
        res = news_api.get_everything(q=query, language=language, sort_by=sort_by)
        logging.info("Connection is successful.")
        return res["articles"]
    except Exception as e:
        # Best-effort by design: the failure is logged and signalled to the
        # caller with None instead of propagating the exception.
        logging.error(f"Connection is unsuccessful. Error: {e}")
        return None

# extract_news_data() returns None when the request fails. Fall back to an
# empty list so the preview slice below, and any later step that iterates over
# `articles`, does not raise a TypeError.
articles = extract_news_data() or []

print(articles[:3])

[{'source': {'id': None, 'name': 'Pypi.org'}, 'author': 'michael.bommarito@gmail.com', 'title': 'pyenvsearch 0.1.0', 'description': 'Python library navigation and AI-powered analysis tool for developers and AI agents. Combines traditional code search with LLM-powered package insights.', 'url': 'https://pypi.org/project/pyenvsearch/0.1.0/', 'urlToImage': None, 'publishedAt': '2025-08-25T01:15:20Z', 'content': 'A required part of this site couldnt load. This may be due to a browser\r\n extension, network issues, or browser settings. Please check your\r\n connection, disable any ad blockers, or try using a diffe… [+12 chars]'}, {'source': {'id': None, 'name': 'Pypi.org'}, 'author': 'michael.bommarito@gmail.com', 'title': 'pyenvsearch added to PyPI', 'description': 'Python library navigation and AI-powered analysis tool for developers and AI agents. Combines traditional code search with LLM-powered package insights.', 'url': 'https://pypi.org/project/pyenvsearch/', 'urlToImage': None, 'pub

### Task 3: Clean Author Column

In [7]:
def clean_author_column(text):
    """Return the first author named in ``text``, title-cased.

    Args:
        text: Raw author value. Expected to be a string, optionally holding
            several comma-separated names; non-string values such as ``None``
            or NaN are accepted.

    Returns:
        The part of ``text`` before the first comma, with surrounding
        whitespace removed and title-cased. ``"No Author"`` is returned when
        ``text`` is not a string or when nothing is left after stripping.
    """
    try:
        first_author = text.split(",")[0].strip()
    except AttributeError:
        # Non-string values (None, float NaN, ...) have no ``split`` method.
        return "No Author"
    # An empty or whitespace-only name carries no information, so report it
    # the same way as a missing author rather than returning "".
    return first_author.title() if first_author else "No Author"
    

### Task 4: Transform News Data

In [8]:
def transform_news_data(articles):
    """Turn raw article dicts into a DataFrame with one row per article.

    Args:
        articles: Iterable of dicts. The keys read from each dict are
            ``source`` (a dict whose ``name`` is used), ``author``, ``title``,
            ``url``, ``publishedAt`` and ``content``.

    Returns:
        A DataFrame with the columns ``Source``, ``Author Name``,
        ``News Title``, ``URL``, ``Date Published`` and ``Content``.
        ``Date Published`` is formatted as ``%Y-%m-%d %H:%M:%S`` and
        ``Author Name`` is passed through ``clean_author_column``.
    """
    # Pull every field out by key, in the same order as the column list, so
    # the result does not depend on the key order of the incoming dicts and a
    # missing key yields an empty cell instead of a short, misaligned row.
    rows = [
        [
            # 0 is the placeholder this pipeline already used for a missing
            # source name; `or {}` also covers a source that is None/absent.
            (article.get("source") or {}).get("name", 0),
            article.get("author"),
            article.get("title"),
            article.get("url"),
            article.get("publishedAt"),
            article.get("content"),
        ]
        for article in articles
    ]

    df = pd.DataFrame(
        rows,
        columns=["Source", "Author Name", "News Title", "URL", "Date Published", "Content"],
    )

    df["Date Published"] = pd.to_datetime(df["Date Published"]).dt.strftime('%Y-%m-%d %H:%M:%S')
    df["Author Name"] = df["Author Name"].apply(clean_author_column)

    return df
# Run the transform step on the extracted articles; the resulting frame `t`
# is what gets passed to load_news_data() further down.
t = transform_news_data(articles)

#print(t)

### Task 5: Load the Data into SQLite Database


In [11]:
def load_news_data(data, db_path="/usercode/news_data.sqlite"):
    """Append ``data`` to the ``news_table`` table of a SQLite database.

    The table is created first if it does not exist yet. Rows are always
    appended, so loading the same frame twice stores its rows twice.

    Args:
        data: DataFrame whose columns match the ``news_table`` schema below.
        db_path: Path of the SQLite file. Defaults to the location this
            notebook has always written to.
    """
    connection = sqlite3.connect(db_path)
    try:
        connection.execute('''
            CREATE TABLE IF NOT EXISTS news_table (
                "Source" VARCHAR(30),
                "Author Name" TEXT,
                "News Title" TEXT,
                "URL" TEXT,
                "Date Published" TEXT,
                "Content" TEXT
            )
        ''')
        data.to_sql(name="news_table", con=connection, index=False, if_exists="append")
        connection.commit()
    finally:
        # Using a sqlite3 connection as a context manager only commits or
        # rolls back the transaction; it never closes the connection, so
        # release the handle explicitly whether or not the load succeeded.
        connection.close()
 
# Write the transformed frame to SQLite. The load appends to news_table, so
# re-running this cell inserts the same rows again.
load_news_data(t)

### Task 8: Verify Data Loading

In [13]:
# A sqlite3 connection used as a context manager only manages the transaction
# and leaves the connection open, so close it explicitly once the table has
# been read back.
connection = sqlite3.connect("/usercode/news_data.sqlite")
try:
    df = pd.read_sql("SELECT * FROM news_table;", connection)
finally:
    connection.close()
df.head()

Unnamed: 0,Source,Author Name,News Title,URL,Date Published,Content
0,Yahoo Entertainment,Simply Wall St,Velesto Energy Berhad Second Quarter 2025 Earn...,https://finance.yahoo.com/news/velesto-energy-...,2025-08-24 01:36:42,<ul><li>Revenue: RM199.9m (down 49% from 2Q 20...
1,Pypi.org,Hello@Kumo.Ai,kumoai 2.8.0.dev202508231831,https://pypi.org/project/kumoai/2.8.0.dev20250...,2025-08-24 01:34:43,A required part of this site couldnt load. Thi...
2,Biztoc.com,Apnews.Com,Shilo Sanders ejected from Buccaneers' preseas...,https://biztoc.com/x/eb95f31ef7d71603,2025-08-24 01:33:35,"{ window.open(this.href, '_blank'); }, 200); r..."
3,Buddytv.com,Buddy Tv,‘South Park’ Mocks Donald Trump in Bold New Ep...,https://www.buddytv.com/south-park-mocks-donal...,2025-08-24 01:25:03,South Park has never been shy about tackling c...
4,Biztoc.com,Investing.Com,Bank of England's Bailey says UK has 'acute ch...,https://biztoc.com/x/179841c00b9f9893,2025-08-24 01:22:17,"{ window.open(this.href, '_blank'); }, 200); r..."
