In [1]:
import nltk
from bs4 import BeautifulSoup
import requests
from collections import Counter
from nltk.corpus import stopwords
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Download the NLTK data packages used by the functions below
# (tokenizer models, POS tagger model, stop-word lists).
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

def get_with_retry(url, timeout=10):
    """Fetch ``url`` with an HTTP GET, retrying transient failures.

    The retry policy is ``Retry(total=5, backoff_factor=1)`` and also retries
    on the status codes 500, 502, 503 and 504.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before an attempt is abandoned.
            Without a timeout a stalled connection can block forever.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        failed. Because ``raise_for_status()`` is called, any 4xx/5xx answer
        also results in ``None``; callers never receive an error response.
    """
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    # The context manager closes the session (and its connection pools) on
    # exit. The response body has already been read by then, so the returned
    # object remains usable.
    with requests.Session() as session:
        for scheme in ('http://', 'https://'):
            session.mount(scheme, HTTPAdapter(max_retries=retries))

        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Best-effort fetch: report the failure instead of hiding it.
            print(f"Request failed for {url}: {err}")
            return None

        return response

def preprocess_text(text):
    """Turn raw (possibly HTML) text into a list of lowercase content words.

    Steps: strip HTML markup, remove URL-like tokens, tokenize the lowercased
    text with NLTK, then keep only purely alphabetic tokens that are not
    English stop words.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the ``NaN``
            pandas uses for missing cells) and blank strings yield ``[]``.

    Returns:
        list[str]: The surviving tokens in their original order.
    """
    # Missing values would otherwise be handed to BeautifulSoup, which expects
    # markup text; blank strings cannot produce any token anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    # Remove HTML code
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs (the `https\S+` alternative is already covered by `http\S+`)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Tokenization and lowercasing
    tokens = nltk.word_tokenize(text.lower())

    # Keep alphabetic tokens that are not stop words. The earlier intermediate
    # step that rewrote punctuation tokens to '\n' was dropped: every token it
    # produced was non-alphabetic and therefore removed by this filter anyway.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token.isalpha() and token not in stop_words]

def process_text_with_pos(text, threshold=2):
    """Reduce ``text`` to its recurring words of selected parts of speech.

    The text is cleaned with ``preprocess_text`` and POS-tagged with NLTK.
    Only words whose tag is in the list below are kept, and of those only the
    ones occurring at least ``threshold`` times (counted after the tag filter).

    Args:
        text: Raw text to process.
        threshold: Minimum number of occurrences a word needs to be kept.

    Returns:
        str: The surviving words, in original order with repeats preserved,
        joined by single spaces.
    """
    allowed_tags = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS']

    # Tag the cleaned tokens
    tagged = nltk.pos_tag(preprocess_text(text))

    # Keep only tokens carrying one of the allowed tags
    kept = []
    for token, tag in tagged:
        if tag in allowed_tags:
            kept.append(token)

    # Drop tokens that occur fewer than `threshold` times
    frequency = Counter(kept)
    return ' '.join(token for token in kept if frequency[token] >= threshold)

def get_articles_with_pos(topic):
    """Scrape Google News search results for ``topic`` and condense each article.

    Args:
        topic: Search phrase. It is URL-encoded before being placed in the
            query string.

    Returns:
        list[tuple[str, str]]: One ``(heading, filtered_text)`` pair per result
        whose page could be fetched and contains an ``<h1>``. An empty list is
        returned when the search page itself could not be retrieved.
    """
    # Function-scope import so the block does not depend on the import cell.
    from urllib.parse import quote_plus

    # Encode the topic so spaces, '&', '#', etc. cannot corrupt the query string.
    url = f'https://news.google.com/search?q={quote_plus(topic)}&hl=en-IN&gl=IN&ceid=IN%3Aen'
    page = get_with_retry(url)

    if page is None:
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    articles = soup.find_all('a', class_='WwrzSb')
    articles_list = []

    for article in articles:
        href = article.get('href')
        if not href:
            continue  # Anchor without a target: nothing to fetch

        # The first character of the href is dropped before prefixing the host
        # (presumably the links are relative, e.g. './articles/...'; verify).
        link = 'https://news.google.com' + href[1:]
        p1 = get_with_retry(link)

        # get_with_retry returns None for every failure, including 4xx/5xx
        # responses (it calls raise_for_status), so separate 403/404 checks
        # on the response could never trigger and are not needed here.
        if p1 is None:
            continue  # Skip this article if unable to retrieve the content

        soup1 = BeautifulSoup(p1.text, 'html.parser')
        heading_tag = soup1.find('h1')

        if heading_tag:
            heading = heading_tag.get_text(strip=True)
            merged_text = '\n'.join([p.get_text(strip=True) for p in soup1.find_all('p')])

            # Process and filter text with POS tagging
            filtered_text = process_text_with_pos(merged_text)

            # Append processed text to the list
            articles_list.append((heading, filtered_text))

    return articles_list

# Ask which topic to search for
topic = input("Enter a topic: ")

# Scrape the matching articles and reduce them with POS filtering
articles_list = get_articles_with_pos(topic)

# Show every heading with its filtered text, followed by a divider line
divider = '--------------------------------------------------------------------------------------------------------------------------------'
for heading, filtered_text in articles_list:
    print(heading, filtered_text, divider, sep='\n')


[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter a topic:  redbull


All you need to know about Red Bull’s Christian Horner: From powerful partnerships to marriage with a Spice Girl
high motor racing environment dominated run horner joined high team boss red bull racing motorsport mclaren became environment became public scrutiny became drivers constructors titles horner team racing management style allowed sport successful drivers built team dominant history personal style social given public drivers month team company red bull announced inquiry allegations last month lack transparency allowed take management style company announced inquiry gave horner joined team race year horner inquiry team sport email leaks began social leaks horner process given allegations horner allowed team boss red bull racing process horner personal relationships important ecclestone chief horner racing car motor racing horner gave began team team red bull team ecclestone horner take austrian company mateschitz mateschitz marko austrian horner motorsport horner said latest em

In [3]:
import nltk
from bs4 import BeautifulSoup
import requests
from collections import Counter
from nltk.corpus import stopwords
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd

# Download the NLTK data packages used by the functions below
# (tokenizer models, POS tagger model, stop-word lists).
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

# NOTE(review): this re-defines get_with_retry, which is already defined in the
# first cell of the notebook; consider keeping a single definition.
def get_with_retry(url, timeout=10):
    """Fetch ``url`` with an HTTP GET, retrying transient failures.

    The retry policy is ``Retry(total=5, backoff_factor=1)`` and also retries
    on the status codes 500, 502, 503 and 504.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before an attempt is abandoned.
            Without a timeout a stalled connection can block forever.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        failed. Because ``raise_for_status()`` is called, any 4xx/5xx answer
        also results in ``None``; callers never receive an error response.
    """
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    # The context manager closes the session (and its connection pools) on
    # exit. The response body has already been read by then, so the returned
    # object remains usable.
    with requests.Session() as session:
        for scheme in ('http://', 'https://'):
            session.mount(scheme, HTTPAdapter(max_retries=retries))

        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Best-effort fetch: report the failure instead of hiding it.
            print(f"Request failed for {url}: {err}")
            return None

        return response

# NOTE(review): this re-defines preprocess_text, which is already defined in the
# first cell of the notebook; consider keeping a single definition.
def preprocess_text(text):
    """Turn raw (possibly HTML) text into a list of lowercase content words.

    Steps: strip HTML markup, remove URL-like tokens, tokenize the lowercased
    text with NLTK, then keep only purely alphabetic tokens that are not
    English stop words.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the ``NaN``
            pandas uses for missing cells) and blank strings yield ``[]``.

    Returns:
        list[str]: The surviving tokens in their original order.
    """
    # Missing values would otherwise be handed to BeautifulSoup, which expects
    # markup text; blank strings cannot produce any token anyway.
    if not isinstance(text, str) or not text.strip():
        return []

    # Remove HTML code
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs (the `https\S+` alternative is already covered by `http\S+`)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Tokenization and lowercasing
    tokens = nltk.word_tokenize(text.lower())

    # Keep alphabetic tokens that are not stop words. The earlier intermediate
    # step that rewrote punctuation tokens to '\n' was dropped: every token it
    # produced was non-alphabetic and therefore removed by this filter anyway.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token.isalpha() and token not in stop_words]

# NOTE(review): this re-defines process_text_with_pos, which is already defined
# in the first cell of the notebook; consider keeping a single definition.
def process_text_with_pos(text, threshold=2):
    """Reduce ``text`` to its recurring words of selected parts of speech.

    The text is cleaned with ``preprocess_text`` and POS-tagged with NLTK.
    Only words whose tag is in the list below are kept, and of those only the
    ones occurring at least ``threshold`` times (counted after the tag filter).

    Args:
        text: Raw text to process.
        threshold: Minimum number of occurrences a word needs to be kept.

    Returns:
        str: The surviving words, in original order with repeats preserved,
        joined by single spaces.
    """
    allowed_tags = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS']

    # Tag the cleaned tokens
    tagged = nltk.pos_tag(preprocess_text(text))

    # Keep only tokens carrying one of the allowed tags
    kept = []
    for token, tag in tagged:
        if tag in allowed_tags:
            kept.append(token)

    # Drop tokens that occur fewer than `threshold` times
    frequency = Counter(kept)
    return ' '.join(token for token in kept if frequency[token] >= threshold)

# NOTE(review): this re-defines get_articles_with_pos, which is already defined
# in the first cell of the notebook; consider keeping a single definition.
def get_articles_with_pos(topic):
    """Scrape Google News search results for ``topic`` and condense each article.

    Args:
        topic: Search phrase. It is URL-encoded before being placed in the
            query string.

    Returns:
        list[tuple[str, str]]: One ``(heading, filtered_text)`` pair per result
        whose page could be fetched and contains an ``<h1>``. An empty list is
        returned when the search page itself could not be retrieved.
    """
    # Function-scope import so the block does not depend on the import cell.
    from urllib.parse import quote_plus

    # Encode the topic so spaces, '&', '#', etc. cannot corrupt the query string.
    url = f'https://news.google.com/search?q={quote_plus(topic)}&hl=en-IN&gl=IN&ceid=IN%3Aen'
    page = get_with_retry(url)

    if page is None:
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    articles = soup.find_all('a', class_='WwrzSb')
    articles_list = []

    for article in articles:
        href = article.get('href')
        if not href:
            continue  # Anchor without a target: nothing to fetch

        # The first character of the href is dropped before prefixing the host
        # (presumably the links are relative, e.g. './articles/...'; verify).
        link = 'https://news.google.com' + href[1:]
        p1 = get_with_retry(link)

        # get_with_retry returns None for every failure, including 4xx/5xx
        # responses (it calls raise_for_status), so separate 403/404 checks
        # on the response could never trigger and are not needed here.
        if p1 is None:
            continue  # Skip this article if unable to retrieve the content

        soup1 = BeautifulSoup(p1.text, 'html.parser')
        heading_tag = soup1.find('h1')

        if heading_tag:
            heading = heading_tag.get_text(strip=True)
            merged_text = '\n'.join([p.get_text(strip=True) for p in soup1.find_all('p')])

            # Process and filter text with POS tagging
            filtered_text = process_text_with_pos(merged_text)

            # Append processed text to the list
            articles_list.append((heading, filtered_text))

    return articles_list

# Get user input for the topic
topic = input("Enter a topic: ")

# Get articles related to the topic with POS tagging
articles_list = get_articles_with_pos(topic)

# Create a DataFrame from the articles list
df = pd.DataFrame(articles_list, columns=['Heading', 'Filtered_Text'])

# Define the filename based on the input topic.
# The topic is free-form user input, so path separators, control characters and
# other characters that are invalid in file names are replaced with '_'.
# Otherwise a topic such as "AC/DC" would point the output at another directory
# or make to_csv fail after all the scraping work is done. Plain topics like
# "apollo" still map to "apollo.csv".
safe_topic = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', topic).strip(' .') or 'articles'
filename = f"{safe_topic}.csv"

# Save the DataFrame to a CSV file
df.to_csv(filename, index=False)

print(f"Processed articles saved to {filename}")


[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter a topic:  apollo


Processed articles saved to apollo.csv


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read only the first 10,000 rows of the dataset
df = pd.read_csv('articles1.csv', nrows=10000)

# The 'content' column holds the text to preprocess
X = df['content']

# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)

# Run the same POS-based preprocessing over both splits, training split first
X_train_processed, X_test_processed = (
    part.apply(process_text_with_pos) for part in (X_train, X_test)
)

# Report the size of each processed split
print("Training data shape:", X_train_processed.shape)
print("Testing data shape:", X_test_processed.shape)


Training data shape: (7000,)
Testing data shape: (3000,)


In [2]:
import pandas as pd

# Load both CSV files into DataFrames (df1 first, then df2)
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('./Fin_Cleaned.csv', './Sentiment_dataset.csv')
)

In [3]:
# Bare expression: the notebook renders df1 as this cell's output.
df1

Unnamed: 0,Date_published,Headline,Synopsis,Full_text,Final Status
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,Negative
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,Positive
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,Positive
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,Negative
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,Positive
...,...,...,...,...,...
395,2022-06-10,"Banks take a cue from RBI, hike lending rates",These banks raised their respective external b...,"PTIICICI Bank, Bank of Baroda, Punjab National...",Negative
396,2022-06-29,Sebi issues Rs 27 lakh recovery notice to indi...,"In the event of non-payment, it will recover t...",ReutersThe logo of the Securities and Exchange...,Negative
397,2022-06-06,Apollo Hospital shares drop 0.68% as Sensex ...,"A total of 10,105 shares changed hands on the ...",Getty ImagesShrikant Chouhan of Kotak Securiti...,Negative
398,2022-05-16,SBI at Rs 710? What makes analysts see up to 5...,Calling the stock 'attractively valued' analys...,AgenciesThe PSU bank reported a 41.27 per cent...,Positive


In [4]:
# Bare expression: the notebook renders df2 as this cell's output.
df2

Unnamed: 0,news_title,reddit_title,sentiment,text,url
0,Mark Cuban launches generic drug company,Billionaire Mark Cuban just launched a drug co...,1.0,Billionaire investor and Shark Tank star Mark ...,https://www.beckershospitalreview.com/pharmacy...
1,From Defendant to Defender: One Wrongfully Con...,"Man falsely imprisoned for 10 years, uses pris...",1.0,Attorney Jarrett Adams recently helped overtur...,https://www.nbcnews.com/news/us-news/defendant...
2,"Amazon Tribe Wins Lawsuit Against Big Oil, Sav...",Amazon tribe wins legal battle against oil com...,1.0,The Amazon Rainforest is well known across the...,https://www.disclose.tv/amazon-tribe-wins-laws...
3,Newark police: No officer fired a single shot ...,Newark police: No officer fired a single shot ...,1.0,Newark police: No officer fired a single shot ...,https://newjersey.news12.com/newark-police-no-...
4,Ingen barn døde i trafikken i 2019,No children died in traffic accidents in Norwa...,1.0,I 1970 døde det 560 mennesker i den norske tra...,https://www.nrk.no/trondelag/ingen-barn-dode-i...
...,...,...,...,...,...
843,Dee Why attack: Man allegedly choked and threa...,Dee Why attack: Man allegedly choked and threa...,0.0,Frightening details have emerged about a toile...,https://www.9news.com.au/2018/11/30/17/55/sydn...
844,Africa: Children and HIV/Aids - 'We Need to Ta...,Africa: Children and HIV/Aids - 'We Need to Ta...,0.0,"interview\n\nJohannesburg — 360,000 adolescent...",https://allafrica.com/stories/201811300567.html
845,Terrorism suspected in Eilat attack,Terrorism suspected in Eilat attack,0.0,A violent attack in the southern Israeli port ...,http://www.israelnationalnews.com/News/News.as...
846,Anti-Semitism never disappeared in Europe. It'...,Anti-Semitism never disappeared in Europe. It'...,0.0,"It's a 17-year-old boy, too frightened to wear...",https://edition.cnn.com/2018/11/27/europe/anti...


In [17]:
# Encode the string labels of df1 as 1.0 / 0.0 (this updates df1 in place, as
# before). Any other label value is left untouched here and becomes NaN in the
# numeric coercion below.
df1['Final Status'] = df1['Final Status'].replace({'Positive': 1.0, 'Negative': 0.0})

# Bring df1 to the same two-column layout as df2 ('text', 'sentiment').
new_df = df1[['Full_text', 'Final Status']].rename(
    columns={'Full_text': 'text', 'Final Status': 'sentiment'}
)

# Stack both datasets. ignore_index=True gives the result a fresh 0..n-1 index;
# without it the row labels of df1 and df2 overlap, so label-based lookups on
# new_df would return several rows.
new_df = pd.concat([new_df, df2[['text', 'sentiment']]], axis=0, ignore_index=True)

# Force a numeric sentiment column; values that cannot be parsed become NaN.
new_df['sentiment'] = pd.to_numeric(new_df['sentiment'], errors='coerce')

# new_df already consists of exactly the columns 'text' and 'sentiment'.
new_df.to_csv('big_news_articles.csv', index=False)

In [20]:
import os
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Flatten, Dropout
from keras.preprocessing import sequence
from keras.models import model_from_json, load_model


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/apple/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/vm/sl1j2v913dd5mfx0sqw5pmch0000gn/T/ipykernel_16974/1392041970.py", line 4, in <module>
    from keras.models import Sequential
  File "/Users/apple/anaconda3/lib/python3.11/site-packages/keras/__init__.py", line 3, in <module>
    from keras import __internal__
  File "/Users/apple/anaconda3/lib/python3.11/site-packages/keras/__internal__/__init__.py", line 6, in <module>
    from keras.__internal__ import models
  File "/Users/apple/anaconda3/lib/python3.11/site-packages/keras/__internal__/models/__init__.py", line 3, in <module>
    from keras.src.models.cloning import clone_and_build_model
  File "/Users/apple/anaconda3/lib/python3.11/site-packages/keras/src/__init__.py", line 21, in <module>
    from keras.src import applications
  File "/Users/apple/anacon