In [6]:
!pip install feedparser




RSS Feed Parsing

In [7]:
import feedparser

RSS_FEEDS = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]

def parse_feeds():
    articles = []
    for feed_url in RSS_FEEDS:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            article = {
                "title": entry.get("title", ""),
                "content": entry.get("summary", ""),
                "pub_date": entry.get("published", ""),
                "source_url": entry.get("link", "")
            }
            articles.append(article)
    return articles

# Test parsing
parsed_articles = parse_feeds()
parsed_articles[:2]  # Show first 2 articles for a quick check


[{'title': 'Some on-air claims about Dominion Voting Systems were false, Fox News acknowledges in statement after deal is announced',
  'content': '',
  'pub_date': 'Wed, 19 Apr 2023 12:44:51 GMT',
  'source_url': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/index.html'},
 {'title': 'Dominion still has pending lawsuits against election deniers such as Rudy Giuliani and Sidney Powell',
  'content': '',
  'pub_date': '',
  'source_url': 'https://www.cnn.com/business/live-news/fox-news-dominion-trial-04-18-23/h_8d51e3ae2714edaa0dace837305d03b8'}]

Database Setup Using SQLAlchemy

In [8]:
from sqlalchemy import create_engine, Column, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
import feedparser

# Use SQLite for simplicity if you don't have PostgreSQL
DATABASE_URL = 'sqlite:///news_articles.db'  # For local development
# DATABASE_URL = "postgresql://user:password@localhost/rss_feed"  # Use this if PostgreSQL is available

Base = declarative_base()

class NewsArticle(Base):
    __tablename__ = 'news_articles'
    title = Column(String, primary_key=True)
    content = Column(String)
    pub_date = Column(DateTime)
    source_url = Column(String)
    category = Column(String)

# Connect and create table
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

# Function to convert pub_date string to datetime object
def convert_pub_date(pub_date_str):
    try:
        # Convert pub_date string to datetime object
        return datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        return None  # Return None if the date format is invalid or missing

# Function to store articles
def store_articles(articles):
    session = Session()
    for article in articles:
        # Convert the pub_date string to a datetime object
        article["pub_date"] = convert_pub_date(article["pub_date"])
        
        # Check if the article already exists in the database
        if not session.query(NewsArticle).filter_by(title=article["title"]).first():
            new_article = NewsArticle(**article)
            session.add(new_article)
    session.commit()
    session.close()

# Example parsed articles from your feedparser logic (for testing)
parsed_articles = [
    {
        "title": "Example News Title 1",
        "content": "This is the content of the news article.",
        "pub_date": "Wed, 19 Apr 2023 12:44:51 GMT",
        "source_url": "https://example.com/news1",
        "category": "Positive/Uplifting"
    },
    {
        "title": "Example News Title 2",
        "content": "This is another news article.",
        "pub_date": "Fri, 21 Apr 2023 10:30:00 GMT",
        "source_url": "https://example.com/news2",
        "category": "Natural Disasters"
    }
]

# Store parsed articles into the database
store_articles(parsed_articles)



  Base = declarative_base()


In [10]:
import multiprocessing
from sqlalchemy import create_engine, Column, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
import feedparser

# Database setup
DATABASE_URL = 'sqlite:///news_articles.db'  # For local development
Base = declarative_base()

class NewsArticle(Base):
    __tablename__ = 'news_articles'
    title = Column(String, primary_key=True)
    content = Column(String)
    pub_date = Column(DateTime)
    source_url = Column(String)
    category = Column(String)

# Connect and create table
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

# Function to convert pub_date string to datetime object
def convert_pub_date(pub_date_str):
    try:
        return datetime.strptime(pub_date_str, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        return None

# Function to store articles
def store_articles(articles):
    session = Session()
    for article in articles:
        article["pub_date"] = convert_pub_date(article["pub_date"])
        if not session.query(NewsArticle).filter_by(title=article["title"]).first():
            new_article = NewsArticle(**article)
            session.add(new_article)
    session.commit()
    session.close()

# Placeholder for the classify_article function
def classify_article(content):
    # Replace this logic with your actual classification logic
    if "positive" in content.lower():
        return "Positive/Uplifting"
    elif "disaster" in content.lower():
        return "Natural Disasters"
    else:
        return "Others"

# Function to classify articles in batches
def classify_articles(articles):
    for article in articles:
        article["category"] = classify_article(article["content"])
    return articles

def process_articles(articles, batch_size=10):
    for i in range(0, len(articles), batch_size):
        batch = articles[i:i + batch_size]
        classified_articles = classify_articles(batch)
        store_articles(classified_articles)

# Example parsed articles (you should replace this with actual parsed data)
parsed_articles = [
    {
        "title": "Example News Title 1",
        "content": "This is the content of the news article mentioning a positive event.",
        "pub_date": "Wed, 19 Apr 2023 12:44:51 GMT",
        "source_url": "https://example.com/news1",
        "category": "Positive/Uplifting"
    },
    {
        "title": "Example News Title 2",
        "content": "This article is about a natural disaster.",
        "pub_date": "Fri, 21 Apr 2023 10:30:00 GMT",
        "source_url": "https://example.com/news2",
        "category": "Natural Disasters"
    }
]

# Call this function to process and categorize all articles
process_articles(parsed_articles)


  Base = declarative_base()


In [11]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

def classify_article(content):
    sentiment_score = sid.polarity_scores(content)["compound"]
    if sentiment_score > 0.3:
        return "Positive/Uplifting"
    elif "terrorism" in content.lower() or "protest" in content.lower():
        return "Terrorism / protest / political unrest / riot"
    elif "earthquake" in content.lower() or "flood" in content.lower():
        return "Natural Disasters"
    else:
        return "Others"

# Test classification on the first article
sample_article = parsed_articles[0]
sample_article["category"] = classify_article(sample_article["content"])
sample_article


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SANIYA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'title': 'Example News Title 1',
 'content': 'This is the content of the news article mentioning a positive event.',
 'pub_date': datetime.datetime(2023, 4, 19, 12, 44, 51),
 'source_url': 'https://example.com/news1',
 'category': 'Positive/Uplifting'}

In [12]:
import pandas as pd

def export_to_csv():
    session = Session()
    articles = session.query(NewsArticle).all()
    df = pd.DataFrame([{
        "title": a.title,
        "content": a.content,
        "pub_date": a.pub_date,
        "source_url": a.source_url,
        "category": a.category
    } for a in articles])
    df.to_csv('news_articles.csv', index=False)

# Export to CSV
export_to_csv()


In [13]:
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def log_error(e, article_title=""):
    logging.error(f"Error processing article '{article_title}': {e}")

# Example usage: log_error(Exception("Sample error"), "Sample article")


In [14]:
def process_article_with_error_handling(article):
    try:
        article["category"] = classify_article(article["content"])
        store_articles([article])
    except Exception as e:
        log_error(e, article["title"])

def process_articles_with_error_handling(articles):
    for article in articles:
        process_article_with_error_handling(article)

# Process all articles with error handling
process_articles_with_error_handling(parsed_articles)


2024-10-07 21:37:25,659 - ERROR - Error processing article 'Example News Title 1': strptime() argument 1 must be str, not datetime.datetime
2024-10-07 21:37:25,662 - ERROR - Error processing article 'Example News Title 2': strptime() argument 1 must be str, not datetime.datetime
