In [5]:
!pip install sqlalchemy psycopg2 feedparser




In [6]:
from sqlalchemy import create_engine, Column, String, Integer, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Define the base class
Base = declarative_base()

# Define the news_articles table
class NewsArticle(Base):
    __tablename__ = 'news_articles'
    id = Column(Integer, primary_key=True)
    title = Column(String, nullable=False)
    content = Column(String)
    publication_date = Column(DateTime)
    source_url = Column(String, unique=True, nullable=False)
    category = Column(String, nullable=False)

# Correct password encoding for special characters in the password
DATABASE_URL = 'postgresql://postgres:Saty%402677sa@localhost:5432/news_db'

# Create an engine that connects to the PostgreSQL database
engine = create_engine(DATABASE_URL)

# Create the table in the database (if it doesn't already exist)
Base.metadata.create_all(engine)

# Set up a session to interact with the database
Session = sessionmaker(bind=engine)
session = Session()

print("Database connected and table created successfully!")


  Base = declarative_base()


Database connected and table created successfully!


In [7]:
from sqlalchemy import create_engine, Column, String, Integer, DateTime
from sqlalchemy.orm import declarative_base, sessionmaker  # Updated import

# Define the base class
Base = declarative_base()  # No longer using sqlalchemy.ext.declarative

# Define the news_articles table
class NewsArticle(Base):
    __tablename__ = 'news_articles'
    id = Column(Integer, primary_key=True)
    title = Column(String, nullable=False)
    content = Column(String)
    publication_date = Column(DateTime)
    source_url = Column(String, unique=True, nullable=False)
    category = Column(String, nullable=False)

# Define your database connection URL (make sure to use your password)
DATABASE_URL = 'postgresql://postgres:Saty%402677sa@localhost:5432/news_db'

# Create an engine that connects to the PostgreSQL database
engine = create_engine(DATABASE_URL)

# Create the table in the database (if it doesn't already exist)
Base.metadata.create_all(engine)

# Set up a session to interact with the database
Session = sessionmaker(bind=engine)
session = Session()

print("Database connected and table created successfully!")


Database connected and table created successfully!


In [14]:
from datetime import datetime

# Rollback any previous transactions to ensure the session is clean
session.rollback()

# List of sample news articles
sample_articles = [
    {
        "title": "Sample News Article 1",
        "content": "This is a summary of the first sample news article.",
        "publication_date": datetime.now(),
        "source_url": "https://example.com/sample-article-1",
        "category": "Others"
    },
    {
        "title": "Sample News Article 2",
        "content": "This is a summary of the second sample news article.",
        "publication_date": datetime.now(),
        "source_url": "https://example.com/sample-article-2",
        "category": "Positive/Uplifting"
    },
    {
        "title": "Sample News Article 3",
        "content": "This is a summary of the third sample news article.",
        "publication_date": datetime.now(),
        "source_url": "https://example.com/sample-article-3",
        "category": "Natural Disasters"
    }
]

# Insert each sample article into the database after checking for duplicates
for article in sample_articles:
    try:
        # Check if the article with the same source_url already exists
        existing_article = session.query(NewsArticle).filter_by(source_url=article['source_url']).first()
        
        if existing_article is None:  # If no article with the same URL exists, insert it
            new_article = NewsArticle(
                title=article['title'],
                content=article['content'],
                publication_date=article['publication_date'],
                source_url=article['source_url'],
                category=article['category']
            )
            session.add(new_article)
            print(f"Article '{article['title']}' added successfully.")
        else:
            print(f"Article with URL '{article['source_url']}' already exists in the database.")
    
    except Exception as e:
        print(f"Error while inserting article '{article['title']}': {str(e)}")
        session.rollback()  # Rollback in case of any failure

# Commit the transaction to insert the new articles
try:
    session.commit()
    print("All articles committed successfully!")
except Exception as e:
    print(f"Error during commit: {str(e)}")
    session.rollback()


Article with URL 'https://example.com/sample-article-1' already exists in the database.
Article with URL 'https://example.com/sample-article-2' already exists in the database.
Article with URL 'https://example.com/sample-article-3' already exists in the database.
All articles committed successfully!


  session.rollback()


In [12]:
import feedparser
from datetime import datetime

# List of RSS feed URLs
feed_urls = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews'
]

# Function to parse RSS feeds
def parse_feeds():
    articles = []
    for url in feed_urls:
        feed = feedparser.parse(url)
        for entry in feed.entries:
            # Extract relevant information from each entry
            articles.append({
                'title': entry.title,
                'link': entry.link,
                'published': entry.published if 'published' in entry else datetime.now(),
                'summary': entry.summary if 'summary' in entry else "No summary available."
            })
    return articles

# Get parsed articles from RSS feeds
parsed_articles = parse_feeds()
print(f"Parsed {len(parsed_articles)} articles from RSS feeds.")


Parsed 143 articles from RSS feeds.


In [15]:
# Insert parsed articles into the database
for article in parsed_articles:
    # Check for duplicates based on source URL
    existing_article = session.query(NewsArticle).filter_by(source_url=article['link']).first()
    if not existing_article:
        new_article = NewsArticle(
            title=article['title'],
            content=article['summary'],
            publication_date=datetime.now(),  # If the published date is not available
            source_url=article['link'],
            category="Uncategorized"  # Default category, you can classify later
        )
        session.add(new_article)

# Commit the transaction to insert the articles
session.commit()

print(f"Inserted {len(parsed_articles)} new articles into the database.")


Inserted 143 new articles into the database.


In [16]:
# Keywords for different categories
category_keywords = {
    "Terrorism / protest / political unrest / riot": ["terrorism", "protest", "riot", "political unrest"],
    "Positive/Uplifting": ["hope", "positive", "uplift", "inspiring", "good news"],
    "Natural Disasters": ["earthquake", "flood", "hurricane", "disaster", "tornado"]
}

# Function to categorize an article based on keywords
def categorize_article(content):
    for category, keywords in category_keywords.items():
        if any(keyword in content.lower() for keyword in keywords):
            return category
    return "Others"  # Default category if no keywords match

# Fetch all uncategorized articles from the database
uncategorized_articles = session.query(NewsArticle).filter_by(category="Uncategorized").all()

# Categorize each article
for article in uncategorized_articles:
    article.category = categorize_article(article.content)
    session.add(article)  # Update the article with the new category

# Commit the changes to the database
session.commit()

print(f"Categorized {len(uncategorized_articles)} articles.")


Categorized 20 articles.


In [17]:
from sqlalchemy import func

# Count the number of articles in each category
category_counts = session.query(NewsArticle.category, func.count(NewsArticle.id)).group_by(NewsArticle.category).all()

for category, count in category_counts:
    print(f"Category: {category}, Count: {count}")


Category: Natural Disasters, Count: 9
Category: Others, Count: 152
Category: Positive/Uplifting, Count: 4


In [18]:
# Fetch articles in the 'Natural Disasters' category
natural_disasters_articles = session.query(NewsArticle).filter_by(category="Natural Disasters").all()

for article in natural_disasters_articles:
    print(f"Title: {article.title}, URL: {article.source_url}")


Title: Sample News Article 3, URL: https://example.com/sample-article-3
Title: Biden undermines Harris claim that Ron DeSantis is politicizing hurricane response: 'Doing a great job', URL: https://www.foxnews.com/politics/biden-undermines-harris-claim-ron-desantis-politicizing-hurricane-response-doing-great-job
Title: North Carolina residents will see changes to early voting after Hurricane Helene, URL: https://www.foxnews.com/politics/north-carolina-residents-see-changes-early-voting-after-hurricane-helene
Title: Here are all the airports closing down ahead of Hurricane Milton, URL: https://qz.com/hurricane-milton-airport-closures-orlando-tampa-1851667895
Title: Boeing is restarting talks to end a strike some compare to an economic Hurricane Helene, URL: https://qz.com/boeing-strike-negotiations-act-of-god-1851666732
Title: Red Lobster's new CEO admits it: 'Endless Shrimp' was a disaster, URL: https://qz.com/red-lobster-ceo-endless-shrimp-disaster-1851666647
Title: Biden cancels overs

In [19]:
pip install flask





In [20]:
pip install flask-sqlalchemy


Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install psycopg2


Note: you may need to restart the kernel to use updated packages.


# News Article Aggregator

## Project Description

This project is a **News Article Aggregator** that collects news articles from various RSS feeds, stores them in a database, and categorizes them into predefined categories. It is designed to process articles asynchronously using Celery, with text classification handled via Natural Language Processing (NLP).

### Features:
- Collects news from multiple RSS feeds.
- Categorizes news articles into the following categories:
  - Terrorism/Protest/Political Unrest/Riot
  - Positive/Uplifting News
  - Natural Disasters
  - Others
- Stores the articles in a PostgreSQL database.
- Uses Celery for task queue management, ensuring the app can handle large amounts of incoming news articles efficiently.
- Simple Flask interface to view and manage the parsed news.

## Project Structure

```plaintext
├── app.py                  # Main application file to run the Flask server
├── feed_parser.py           # Script to parse RSS feeds
├── tasks.py                 # Celery tasks to manage asynchronous processing
├── models.py                # Database models and schema
├── requirements.txt         # Python dependencies needed for the project
└── README.md                # Project documentation


## Technologies Used

- **Programming Language**: Python
- **Libraries**: pandas, numpy, matplotlib, scikit-learn
- **Framework**: Flask (for web server)
- **Database**: PostgreSQL
- **Task Queue**: Celery
- **NLP Libraries**: NLTK or spaCy for categorizing news articles
- **RSS Parsing**: Feedparser
