In [2]:
import nltk
from bs4 import BeautifulSoup
import requests
from collections import Counter
from nltk.corpus import stopwords
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def get_with_retry(url):
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as err:
        return None

def preprocess_text(text):
    # Remove HTML code
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Tokenization, lowercasing, and line changes on punctuations
    words = nltk.word_tokenize(text.lower())
    words = [word if word.isalpha() else '\n' if any(char in word for char in ['.', '!', '?']) else word for word in words]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalpha() and word not in stop_words]

    return words

def process_text_with_pos(text, threshold=2):
    words = preprocess_text(text)

    # POS tagging
    pos_tags = nltk.pos_tag(words)

    # Filter words based on specific POS tags
    relevant_pos_tags = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS']
    filtered_words = [word for word, pos in pos_tags if pos in relevant_pos_tags]

    # Count word frequencies
    word_counts = Counter(filtered_words)

    # Remove least frequently occurring words (adjust threshold as needed)
    filtered_words = [word for word in filtered_words if word_counts[word] >= threshold]

    # Join the remaining words
    filtered_text = ' '.join(filtered_words)
    return filtered_text

def get_articles_with_pos(topic):
    url = f'https://news.google.com/search?q={topic}&hl=en-IN&gl=IN&ceid=IN%3Aen'
    page = get_with_retry(url)

    if page is None:
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    articles = soup.find_all('a', class_='WwrzSb')
    articles_list = []

    for article in articles:
        link = 'https://news.google.com' + article['href'][1:]
        p1 = get_with_retry(link)

        if p1 is None:
            continue  # Skip this article if unable to retrieve the content

        if p1.status_code == 403:  # Access Denied
            print(f"Access Denied for article: {link}")
            continue

        if p1.status_code == 404:  # Not Found
            print(f"Article Not Found: {link}")
            continue

        soup1 = BeautifulSoup(p1.text, 'html.parser')
        heading_tag = soup1.find('h1')

        if heading_tag:
            heading = heading_tag.get_text(strip=True)
            merged_text = '\n'.join([p.get_text(strip=True) for p in soup1.find_all('p')])

            # Process and filter text with POS tagging
            filtered_text = process_text_with_pos(merged_text)

            # Append processed text to the list
            articles_list.append((heading, filtered_text))

    return articles_list

# Get user input for the topic
topic = input("Enter a topic: ")

# Get articles related to the topic with POS tagging
articles_list = get_articles_with_pos(topic)

# Print processed articles
for heading, filtered_text in articles_list:
    print(heading)
    print(filtered_text)
    print('--------------------------------------------------------------------------------------------------------------------------------')


[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter a topic:  google


3 new Chrome features to get more helpful suggestions
chrome get helpful search suggestions chrome based others looking see connection poor looking today new chrome give inspiration chrome new see suggestions google search based others looking see suggestions chrome images search suggestions looking android ios chrome helpful images based search poor connection information need chrome android ios give search suggestions connection get helpful suggestions today information inspiration need get google
--------------------------------------------------------------------------------------------------------------------------------
Google Chrome packs more searches into the Search Bar
khalid khalid new search chrome google search box chrome display queries started typing google example user korean see images similar korean search box people search search suggestions images new feature engine google started people search chrome search suggestions feature typing pesto sandwich similar search q

In [3]:
import nltk
from bs4 import BeautifulSoup
import requests
from collections import Counter
from nltk.corpus import stopwords
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def get_with_retry(url):
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    try:
        response = session.get(url)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as err:
        return None

def preprocess_text(text):
    # Remove HTML code
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Tokenization, lowercasing, and line changes on punctuations
    words = nltk.word_tokenize(text.lower())
    words = [word if word.isalpha() else '\n' if any(char in word for char in ['.', '!', '?']) else word for word in words]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalpha() and word not in stop_words]

    return words

def process_text_with_pos(text, threshold=2):
    words = preprocess_text(text)

    # POS tagging
    pos_tags = nltk.pos_tag(words)

    # Filter words based on specific POS tags
    relevant_pos_tags = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS']
    filtered_words = [word for word, pos in pos_tags if pos in relevant_pos_tags]

    # Count word frequencies
    word_counts = Counter(filtered_words)

    # Remove least frequently occurring words (adjust threshold as needed)
    filtered_words = [word for word in filtered_words if word_counts[word] >= threshold]

    # Join the remaining words
    filtered_text = ' '.join(filtered_words)
    return filtered_text

def get_articles_with_pos(topic):
    url = f'https://news.google.com/search?q={topic}&hl=en-IN&gl=IN&ceid=IN%3Aen'
    page = get_with_retry(url)

    if page is None:
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    articles = soup.find_all('a', class_='WwrzSb')
    articles_list = []

    for article in articles:
        link = 'https://news.google.com' + article['href'][1:]
        p1 = get_with_retry(link)

        if p1 is None:
            continue  # Skip this article if unable to retrieve the content

        if p1.status_code == 403:  # Access Denied
            print(f"Access Denied for article: {link}")
            continue

        if p1.status_code == 404:  # Not Found
            print(f"Article Not Found: {link}")
            continue

        soup1 = BeautifulSoup(p1.text, 'html.parser')
        heading_tag = soup1.find('h1')

        if heading_tag:
            heading = heading_tag.get_text(strip=True)
            merged_text = '\n'.join([p.get_text(strip=True) for p in soup1.find_all('p')])

            # Process and filter text with POS tagging
            filtered_text = process_text_with_pos(merged_text)

            # Append processed text to the list
            articles_list.append((heading, filtered_text))

    return articles_list

# Get user input for the topic
topic = input("Enter a topic: ")

# Get articles related to the topic with POS tagging
articles_list = get_articles_with_pos(topic)

# Create a DataFrame from the articles list
df = pd.DataFrame(articles_list, columns=['Heading', 'Filtered_Text'])

# Define the filename based on the input topic
filename = f"{topic}.csv"

# Save the DataFrame to a CSV file
df.to_csv(filename, index=False)

print(f"Processed articles saved to {filename}")


[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter a topic:  apollo


Processed articles saved to apollo.csv


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset and select only 10,000 rows
df = pd.read_csv('articles1.csv', nrows=10000)

# Assuming 'content' is the column you want to preprocess
X = df['content']

# Split the dataset into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)

# Preprocess the training and testing data
X_train_processed = X_train.apply(process_text_with_pos)
X_test_processed = X_test.apply(process_text_with_pos)

# Display the shapes of the split datasets
print("Training data shape:", X_train_processed.shape)
print("Testing data shape:", X_test_processed.shape)


Training data shape: (7000,)
Testing data shape: (3000,)


In [29]:
df = pd.read_csv('./apollo.csv')
df['Filtered_Text']

0     upstox hearing public interest litigation pil ...
1     top water top water top top top top top top to...
2     short buildup price stock short buildup short ...
3     stock analysis buying wealth companies see sha...
4     trending news news slams air march ipad pro ne...
                            ...                        
76    apollo hospitals cent yoy consolidated profit ...
77    shares micro systems wednesday cent session ne...
78    systems brokerage firm choice broking ams serv...
79                                                  NaN
80    economic times news get market news subscribe ...
Name: Filtered_Text, Length: 81, dtype: object

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit on the training data and transform it into a Bag of Words representation
X_train_bow = vectorizer.fit_transform(X_train_processed)

# Print the shape of the Bag of Words representation
print("Shape of Bag of Words (training data):", X_train_bow.shape)


Shape of Bag of Words (training data): (7000, 33468)
