<a href="https://colab.research.google.com/github/SwetaAgarwal30/DataInternAssignment/blob/main/DataInternAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install feedparser sqlalchemy spacy
!python -m spacy download en_core_web_sm


Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m908.8 kB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=4e5258d5fe3b4c6ce516e0cf8a72626560334d762764b34e43ff9ed835d55c66
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/sp

In [None]:
import feedparser
from datetime import datetime
import sqlite3

# Open (or create) the on-disk SQLite database used throughout the notebook.
# `conn` and `c` are module-level and shared by the functions and cells below.
conn = sqlite3.connect('news_articles.db')
c = conn.cursor()

# One row per article. source_url is the primary key, so inserting the same
# URL twice conflicts; CREATE TABLE IF NOT EXISTS makes re-running this cell safe.
c.execute('''CREATE TABLE IF NOT EXISTS news_articles (
                source_url TEXT PRIMARY KEY,
                title TEXT,
                content TEXT,
                published TEXT,
                category TEXT)''')

# RSS endpoints that the fetch functions iterate over.
# NOTE(review): nothing here checks that these endpoints are still live — verify.
rss_feeds = [
    "http://rss.cnn.com/rss/cnn_topstories.rss",
    "http://qz.com/feed",
    "http://feeds.foxnews.com/foxnews/politics",
    "http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.feedburner.com/NewshourWorld",
    "https://feeds.bbci.co.uk/news/world/asia/india/rss.xml"
]

def fetch_articles():
    """Download every feed in ``rss_feeds`` and flatten the entries into dicts.

    Returns:
        list[dict]: one dict per usable entry with the keys ``title``,
        ``content``, ``published`` and ``source_url``.

    Notes:
        * Entries without a link are skipped: the link becomes the table's
          primary key, so such an entry could not be stored anyway.
        * A missing title becomes an empty string instead of raising.
        * A missing or empty summary falls back to the entry's ``content``
          parts, and finally to the placeholder ``'No content available'``.
        * A missing publication date is replaced by the current local time.
    """
    articles = []
    for feed in rss_feeds:
        parsed_feed = feedparser.parse(feed)
        for entry in parsed_feed.entries:
            source_url = entry.get('link')
            if not source_url:
                continue

            content = entry.get('summary') or entry.get('description')
            if not content:
                # feedparser exposes `content` as a list of dicts holding the
                # text under 'value'; join them so a plain string is stored
                # (a list cannot be bound as a SQLite parameter).
                parts = entry.get('content') or []
                content = ' '.join(part.get('value', '') for part in parts).strip()

            articles.append({
                'title': entry.get('title', ''),
                'content': content or 'No content available',
                'published': entry.get('published') or str(datetime.now()),
                'source_url': source_url,
            })
    return articles

def store_articles(articles):
    """Persist article dicts into the ``news_articles`` table.

    Each dict must provide ``source_url``, ``title``, ``content`` and
    ``published``; every new row is tagged with the category
    ``'Uncategorized'``. Rows whose ``source_url`` already exists are ignored
    by the database. A failure on one article is printed and does not stop the
    remaining inserts; a single commit is issued once the loop has finished.
    """
    insert_sql = '''INSERT OR IGNORE INTO news_articles (source_url, title, content, published, category)
                         VALUES (?, ?, ?, ?, ?)'''
    field_order = ('source_url', 'title', 'content', 'published')

    for record in articles:
        try:
            # Building the row inside the try keeps a missing key a reported,
            # non-fatal error, just like a database failure.
            row = tuple(record[field] for field in field_order) + ('Uncategorized',)
            c.execute(insert_sql, row)
        except Exception as e:
            print(f"Error inserting into database: {e}")

    conn.commit()

# Run the ingestion step: pull entries from every feed, then write them to the
# database (URLs that are already stored are ignored by the insert).
articles = fetch_articles()
store_articles(articles)


In [None]:
import spacy
# Load spaCy's small English pipeline (downloaded by the install cell).
nlp = spacy.load('en_core_web_sm')

# Keyword lists per category. Dict order matters: categorize_article() returns
# the first category that has a matching keyword.
categories = {
    'Terrorism': ['attack', 'terrorist', 'riot', 'protest'],
    'Positive': ['success', 'achievement', 'uplifting', 'positive'],
    'Natural Disasters': ['earthquake', 'hurricane', 'flood', 'disaster']
}

def categorize_article(content):
    """Return the first category whose keywords occur in ``content``.

    Matching is a case-insensitive substring search, so capitalised words
    such as "Earthquake" at the start of a sentence are recognised.
    Categories are tried in the iteration order of the module-level
    ``categories`` dict.

    Args:
        content: article text; ``None`` or an empty string is allowed.

    Returns:
        str: the matching category name, or ``'Others'`` when nothing matches
        or there is no text to inspect.
    """
    if not content:
        return 'Others'

    # The text is searched directly: the previous version ran the whole spaCy
    # pipeline only to read the unchanged text back from the resulting Doc.
    text = content.lower()
    for category, keywords in categories.items():
        if any(keyword.lower() in text for keyword in keywords):
            return category
    return 'Others'

def categorize_and_update_articles():
    """Assign a category to every row still marked ``'Uncategorized'``.

    Reads the pending rows through the shared cursor ``c``, classifies each
    article's content with ``categorize_article`` and writes the results back
    in one batch, followed by a single commit on ``conn``.
    """
    # Bind the marker as a parameter: a double-quoted "Uncategorized" is an
    # identifier in SQL and is only treated as a string by SQLite's legacy
    # fallback, which can be disabled.
    c.execute('SELECT source_url, content FROM news_articles WHERE category = ?',
              ('Uncategorized',))
    updates = [
        (categorize_article(content), source_url)
        for source_url, content in c.fetchall()
    ]
    c.executemany('UPDATE news_articles SET category = ? WHERE source_url = ?', updates)
    conn.commit()

# Classify every row that is still marked 'Uncategorized' and save the result.
categorize_and_update_articles()


In [None]:
import pandas as pd

# Load the whole table into a DataFrame and export it in two formats.
df = pd.read_sql_query("SELECT * FROM news_articles", conn)
df.to_csv('news_articles.csv', index=False)

# orient='records' with lines=True writes one JSON object per line (JSON Lines).
df.to_json('news_articles.json', orient='records', lines=True)


In [None]:
import logging

# basicConfig() does nothing when the root logger already has handlers, which
# can be the case inside a notebook kernel. force=True (Python 3.8+) replaces
# any existing handlers so that this level and format actually take effect.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

def fetch_articles_with_logging():
    """Fetch all feeds in ``rss_feeds``, logging progress and problems.

    Unlike a bare attribute access, every entry field is read defensively so
    that one incomplete entry cannot abort the rest of its feed:

    * entries without a link are skipped (the link is the table's primary key);
    * a missing title becomes an empty string;
    * a missing or empty summary falls back to the entry's ``content`` parts
      and finally to ``'No content available'``;
    * a missing publication date is replaced by the current local time.

    Returns:
        list[dict]: dicts with the keys ``title``, ``content``, ``published``
        and ``source_url``. A feed that raises is logged and skipped.
    """
    articles = []
    for feed in rss_feeds:
        try:
            parsed_feed = feedparser.parse(feed)
            # feedparser usually reports malformed or unreachable feeds via
            # the `bozo` flag rather than by raising, so surface it in the log.
            if parsed_feed.get('bozo'):
                logging.warning(
                    f"Feed {feed} was not parsed cleanly: {parsed_feed.get('bozo_exception')}"
                )
            logging.info(f"Parsed {len(parsed_feed.entries)} articles from {feed}")
            for entry in parsed_feed.entries:
                source_url = entry.get('link')
                if not source_url:
                    continue

                content = entry.get('summary') or entry.get('description')
                if not content:
                    # `content` is a list of dicts with the text under 'value'.
                    parts = entry.get('content') or []
                    content = ' '.join(part.get('value', '') for part in parts).strip()

                articles.append({
                    'title': entry.get('title', ''),
                    'content': content or 'No content available',
                    'published': entry.get('published') or str(datetime.now()),
                    'source_url': source_url,
                })
        except Exception as e:
            logging.error(f"Error parsing {feed}: {e}")
    return articles


In [None]:
# Re-read the table so the preview reflects the current database contents,
# then show only the first rows rather than the whole frame.
df = pd.read_sql_query("SELECT * FROM news_articles", conn)
df.head()

Unnamed: 0,source_url,title,content,published,category
0,https://www.cnn.com/business/live-news/fox-new...,Some on-air claims about Dominion Voting Syste...,No content available,"Wed, 19 Apr 2023 12:44:51 GMT",Others
1,https://www.cnn.com/business/live-news/fox-new...,Dominion still has pending lawsuits against el...,No content available,2024-10-09 06:24:36.322985,Others
2,https://www.cnn.com/2023/04/17/media/dominion-...,Here are the 20 specific Fox broadcasts and tw...,"• Fox-Dominion trial delay 'is not unusual,' j...","Mon, 17 Apr 2023 16:01:11 GMT",Others
3,https://www.cnn.com/2023/04/18/media/fox-domin...,Judge in Fox News-Dominion defamation trial: '...,The judge just announced in court that a settl...,"Wed, 19 Apr 2023 08:28:17 GMT",Others
4,https://www.cnn.com/videos/politics/2023/04/18...,'Difficult to say with a straight face': Tappe...,A settlement has been reached in Dominion Voti...,"Tue, 18 Apr 2023 21:17:44 GMT",Others


In [None]:
from google.colab import files
# Write the current DataFrame to CSV and hand the file to the Colab download
# helper. NOTE(review): `files` comes from google.colab, so this cell
# presumably only works when the notebook runs inside Colab — confirm.
df.to_csv('news_articles.csv', index=False)
files.download('news_articles.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>