In [39]:
from sqlalchemy import create_engine, Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from celery import Celery
import feedparser
from datetime import datetime
from typing import List
import logging
import datetime

In [40]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [41]:
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; used by the storage and processing
# functions in this notebook to report success and failure.
logger = logging.getLogger(__name__)

In [42]:
# Database Configuration
# SQLAlchemy URL for a SQLite database stored in the file `ten-times-file.db`
# (three slashes = path relative to the current working directory).
DATABASE_URL = 'sqlite:///ten-times-file.db'
engine = create_engine(DATABASE_URL)
# Declarative base class; ORM models in this notebook inherit from it so that
# their tables are registered on `Base.metadata`.
Base = declarative_base()

In [43]:
# Define Article Model
class Article(Base):
    """ORM model for one feed entry, stored in the ``articles`` table.

    The column set mirrors the dict keys produced by ``parse_feed``
    (``title``, ``content``, ``published``, ``source_url``) plus the
    ``category`` that ``process_articles`` fills in afterwards.

    Note: ``Base.metadata.create_all`` only creates missing tables; it does
    not add columns to a table that already exists. A database file created
    before ``category`` was part of this model has to be recreated or
    migrated by hand.
    """
    __tablename__ = 'articles'

    # Surrogate integer key assigned by the database.
    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String)
    # Entry summary text; may be NULL when the feed entry has no summary.
    content = Column(String)
    # Publication timestamp; may be NULL when the feed entry has none.
    published = Column(DateTime)
    # Entry link; store_articles uses it to detect already-stored articles.
    source_url = Column(String)
    # Label written by process_articles; NULL means "not classified yet".
    # process_articles queries on this column, so it must exist on the model.
    category = Column(String)

In [44]:
# Celery Configuration
# Celery application named 'tasks'. Task messages are sent through the Redis
# broker at localhost:6379, database 0. No result backend is passed here.
app = Celery('tasks', broker='redis://localhost:6379/0')

In [45]:
# English stop words as a set, so the membership test in process_text is cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) — confirm in a fresh env.
stop_words = set(stopwords.words('english'))
# Single stemmer instance shared by every process_text call.
stemmer = PorterStemmer()

In [46]:
def process_text(text):
    """Tokenize ``text`` and return its stemmed, lower-cased content words.

    Tokens that are not purely alphabetic, or whose lower-cased form is in
    ``stop_words``, are dropped; the remaining tokens are lower-cased and
    stemmed with the shared ``stemmer``.

    Args:
        text: Raw text to process. ``None`` or an empty string is treated as
            "no text" (article content is nullable, so callers may pass
            ``None``).

    Returns:
        list[str]: Stemmed tokens in their original order; ``[]`` when there
        is no text.
    """
    # Guard before tokenizing: word_tokenize raises on None.
    if not text:
        return []

    filtered_tokens = []
    for word in word_tokenize(text):
        if not word.isalpha():
            continue
        lowered = word.lower()  # computed once, reused for filter and stem
        if lowered not in stop_words:
            filtered_tokens.append(stemmer.stem(lowered))
    return filtered_tokens

def classify_category(text):
    """Return the category label for ``text``.

    Placeholder implementation: the argument is not inspected and every
    input receives the label ``'politics'``.
    """
    label = 'politics'
    return label

In [47]:
# Create the tables of all models registered on Base that do not exist yet.
# Existing tables are left as they are (no columns are added or altered).
Base.metadata.create_all(engine)
# Session factory bound to the engine; call Session() to open a new session.
Session = sessionmaker(bind=engine)

def parse_feed(url):
    """Fetch the feed at ``url`` and return its entries as article dicts.

    NOTE: a later cell in this notebook defines ``parse_feed`` again; when
    the cells run top to bottom, that later definition replaces this one.

    Args:
        url: Address of the RSS/Atom feed, passed to ``feedparser.parse``.

    Returns:
        list[dict]: One dict per entry with the keys ``title``, ``content``
        (the entry summary), ``published`` (a naive ``datetime`` or ``None``)
        and ``source_url`` (the entry link). Missing fields are ``None``.
    """
    # Imported locally under an alias: the import cell binds the bare name
    # ``datetime`` to both the class and the module, so which of the two it
    # refers to depends on import order. The alias is always the class.
    from datetime import datetime as _datetime

    feed = feedparser.parse(url)
    articles = []
    for entry in feed.entries:
        published_raw = entry.get('published')
        published = None
        if published_raw is not None:
            try:
                published = _datetime.strptime(published_raw, '%a, %d %b %Y %H:%M:%S %Z')
            except ValueError:
                # %Z only matches zone names such as GMT/UTC; dates written
                # with a numeric offset ("+0000") or in another layout land
                # here. Fall back to feedparser's own parsed value, a UTC
                # time tuple, instead of failing the whole feed.
                parsed = entry.get('published_parsed')
                if parsed is not None:
                    published = _datetime(*parsed[:6])
                else:
                    logger.warning("Could not parse published date %r from %s", published_raw, url)
        articles.append({
            'title': entry.get('title'),
            'content': entry.get('summary'),
            'published': published,
            'source_url': entry.get('link'),
        })
    return articles

In [48]:

def parse_feed(url):
    """Fetch the feed at ``url`` and return its entries as article dicts.

    This definition replaces the ``parse_feed`` defined in the preceding
    cell when the notebook runs top to bottom.

    Args:
        url: Address of the RSS/Atom feed, passed to ``feedparser.parse``.

    Returns:
        list[dict]: One dict per entry with the keys ``title``, ``content``
        (the entry summary), ``published`` (a naive ``datetime`` or ``None``)
        and ``source_url`` (the entry link). Missing fields are ``None``.
    """
    # Imported locally under an alias: the import cell binds the bare name
    # ``datetime`` to both the class and the module, so which of the two it
    # refers to depends on import order. The alias is always the class.
    from datetime import datetime as _datetime

    feed = feedparser.parse(url)
    articles = []
    for entry in feed.entries:
        title = entry.get('title')
        content = entry.get('summary')
        source_url = entry.get('link')

        published_raw = entry.get('published')
        published = None
        if published_raw is not None:
            try:
                published = _datetime.strptime(published_raw, '%a, %d %b %Y %H:%M:%S %Z')
            except ValueError:
                # %Z only matches zone names such as GMT/UTC; dates written
                # with a numeric offset ("+0000") or in another layout land
                # here. Fall back to feedparser's own parsed value, a UTC
                # time tuple, instead of failing the whole feed.
                parsed = entry.get('published_parsed')
                if parsed is not None:
                    published = _datetime(*parsed[:6])
                else:
                    logger.warning("Could not parse published date %r from %s", published_raw, url)

        article = {'title': title, 'content': content, 'published': published, 'source_url': source_url}
        articles.append(article)
    return articles


In [49]:
# Database Storage
def store_articles(articles: List[dict]):
    """Insert the given articles, skipping any whose URL is already stored.

    Each dict is expanded into ``Article(**article)``, so its keys must be
    column names of the model. An article counts as already stored when a
    row with the same ``source_url`` exists. All inserts are committed
    together at the end.

    Args:
        articles: Article dicts, each containing at least ``source_url``.

    Any exception is logged and the transaction rolled back; nothing is
    re-raised. The session is always closed.
    """
    db = Session()
    try:
        for record in articles:
            already_stored = (
                db.query(Article)
                .filter_by(source_url=record['source_url'])
                .first()
            )
            if already_stored:
                continue
            db.add(Article(**record))
        db.commit()
        logger.info("Articles stored successfully")
    except Exception as e:
        logger.error(f"Error storing articles: {e}")
        db.rollback()
    finally:
        db.close()

In [50]:

# Task Queue and News Processing
@app.task
def process_articles():
    """Celery task: classify every stored article that has no category yet.

    Loads all ``Article`` rows whose ``category`` is NULL, runs each
    article's content through ``process_text`` and ``classify_category``,
    writes the resulting label back and commits once at the end.

    Relies on ``Article`` exposing a ``category`` attribute.

    Any exception is logged and the transaction rolled back; nothing is
    re-raised. The session is always closed.
    """
    session = Session()
    try:
        articles = session.query(Article).filter_by(category=None).all()
        for article in articles:
            # content is nullable (a feed entry may have no summary). Passing
            # None on would raise and roll back the whole batch on every
            # run, so treat a missing summary as empty text.
            tokens = process_text(article.content or "")
            category = classify_category(tokens)
            article.category = category
        session.commit()
        logger.info("Articles processed successfully")
    except Exception as e:
        logger.error(f"Error processing articles: {e}")
        session.rollback()
    finally:
        session.close()

In [None]:
if __name__ == '__main__':
    # Feeds to ingest; each one is parsed, stored and then queued for
    # classification independently of the others.
    feed_urls = [
        "http://rss.cnn.com/rss/cnn_topstories.rss",
        "http://qz.com/feed",
        "http://feeds.foxnews.com/foxnews/politics",
        "http://feeds.reuters.com/reuters/businessNews",
        "http://feeds.feedburner.com/NewshourWorld",
        "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
    ]

    for url in feed_urls:
        try:
            articles = parse_feed(url)
        except Exception:
            # A single malformed feed must not stop ingestion of the
            # remaining ones; record the traceback and move on.
            logger.exception("Failed to parse feed %s", url)
            continue
        store_articles(articles)
        # Enqueue classification of whatever is still uncategorized.
        process_articles.delay()