In [1]:
import nltk
from bs4 import BeautifulSoup
import requests
from collections import Counter
from nltk.corpus import stopwords
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Download the NLTK resources this script relies on, in the same order as before:
# the 'punkt' tokenizer data, the perceptron POS tagger and the stop-word corpus.
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

def get_with_retry(url, timeout=10):
    """Fetch *url* with automatic retries and return the response, or ``None``.

    Transient server errors (HTTP 500, 502, 503, 504) are retried up to five
    times with exponential backoff.  Any remaining failure -- connection
    error, timeout, exhausted retries, or a 4xx/5xx status rejected by
    ``raise_for_status()`` -- is reported on stdout and turned into ``None``
    so callers can simply skip the URL.

    Args:
        url: Absolute http(s) URL to request.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The ``requests.Response`` for a successful request, otherwise ``None``.
    """
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    # The context manager closes the session and its pooled connections even
    # when the request fails.  The body is fully read before the block exits
    # (no streaming), so the returned response stays usable afterwards.
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=retries)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Best-effort fetch: report why it failed instead of hiding it.
            print(f"Request failed for {url}: {err}")
            return None

        return response

def preprocess_text(text):
    """Turn raw article text into a list of lowercase content words.

    Steps, in order:
      1. Strip any HTML markup with BeautifulSoup.
      2. Remove URLs.
      3. Lowercase and tokenize with ``nltk.word_tokenize``.
      4. Keep only purely alphabetic tokens that are not English stop words.

    Args:
        text: Raw text, possibly containing HTML markup and URLs.

    Returns:
        A list of lowercase alphabetic tokens with stop words removed, in
        their original order.
    """
    # Remove HTML code
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs.  Requiring "://" (or "www.") avoids deleting ordinary words
    # that merely start with "http", such as "https" or "httpd".
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

    stop_words = set(stopwords.words('english'))

    # Punctuation and numeric tokens fail isalpha() and are dropped here, so no
    # separate punctuation-handling pass is needed.
    return [
        word
        for word in nltk.word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

def process_text_with_pos(text, threshold=2):
    """Reduce *text* to its recurring nouns, verbs and adjectives.

    The text is cleaned with ``preprocess_text``, tagged with
    ``nltk.pos_tag``, restricted to the selected part-of-speech tags, and then
    limited to words that occur at least ``threshold`` times among the
    retained words.

    Args:
        text: Raw article text.
        threshold: Minimum number of occurrences a word needs to be kept.

    Returns:
        The surviving words joined by single spaces, in original order and
        with repetitions preserved.
    """
    # Tags to keep: common nouns, all verb forms, and adjectives.
    keep_tags = {
        'NN', 'NNS',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'JJ', 'JJR', 'JJS',
    }

    tagged_tokens = nltk.pos_tag(preprocess_text(text))

    kept = []
    for token, tag in tagged_tokens:
        if tag in keep_tags:
            kept.append(token)

    # Frequencies are counted after the POS filter, so the threshold applies
    # to the retained words only.
    frequency = Counter(kept)

    return ' '.join(token for token in kept if frequency[token] >= threshold)

def get_articles_with_pos(topic):
    """Search Google News for *topic* and return processed article texts.

    Each result link found on the search page is fetched.  For every article
    page that contains an ``<h1>`` heading, the text of its ``<p>`` elements
    is merged and passed through ``process_text_with_pos``.

    Args:
        topic: Free-text search term.  It is URL-encoded before being placed
            in the query string, so spaces and characters such as ``&`` or
            ``#`` are safe.

    Returns:
        A list of ``(heading, filtered_text)`` tuples, one per article that
        could be retrieved and had a heading.  Empty if the search page
        itself could not be fetched.
    """
    from urllib.parse import quote_plus, urljoin

    base_url = 'https://news.google.com/'
    url = f'{base_url}search?q={quote_plus(topic)}&hl=en-IN&gl=IN&ceid=IN%3Aen'
    page = get_with_retry(url)

    if page is None:
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    articles_list = []

    for article in soup.find_all('a', class_='WwrzSb'):
        href = article.get('href')
        if not href:
            continue  # Anchor without a target; nothing to fetch

        # urljoin resolves './articles/...', '/articles/...' and absolute
        # links alike against the site root.
        link = urljoin(base_url, href)

        # get_with_retry returns None for network failures and for HTTP error
        # statuses (including 403 and 404), so one check covers them all.
        p1 = get_with_retry(link)
        if p1 is None:
            continue  # Skip this article if unable to retrieve the content

        soup1 = BeautifulSoup(p1.text, 'html.parser')
        heading_tag = soup1.find('h1')
        if heading_tag is None:
            continue  # Pages without a heading are ignored

        heading = heading_tag.get_text(strip=True)
        merged_text = '\n'.join(p.get_text(strip=True) for p in soup1.find_all('p'))

        # Process and filter text with POS tagging
        filtered_text = process_text_with_pos(merged_text)

        articles_list.append((heading, filtered_text))

    return articles_list

# Ask the user which topic to search for
topic = input("Enter a topic: ")

# Fetch the matching articles and reduce each one to its POS-filtered text
articles_list = get_articles_with_pos(topic)

# Print every article as: heading, filtered text, separator rule (one per line)
separator = '--------------------------------------------------------------------------------------------------------------------------------'
for heading, filtered_text in articles_list:
    print(heading, filtered_text, separator, sep='\n')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sidhesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sidhesh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sidhesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter a topic:  domino


Domino’s Pizza stock climbs over 9% after better-than-expected same-store sales
domino pizza cent monday sales chain reported profit reported quarterly share profit quarterly cent share comparable sales cent said domino pizza domino pizza stock monday increase domino pizza chief officer weiner monday said pizza get increase company program program added weiner added domino comparable sales cent domino chief officer reddy said company chain reddy said domino pizza take points cent points cent points cent news website newsletters stock news login get website take bookmark image bookmark image please login newsletters please get
--------------------------------------------------------------------------------------------------------------------------------
Domino's Pizza loyalty program revamp, Uber Eats tie-up boost sales
uber eats domino uber eats partnership insider sales first sales domino india real cheese cheese real cheese uber eats food delivery robots tokyo partnership mitsubishi 