In [None]:
#importing all the libraries
from newsapi import NewsApiClient
from bs4 import BeautifulSoup
import json, xml
import requests
import html
import re
import xml.etree.ElementTree
import itertools
import os, tempfile, gcsfs
import newspaper
import pathlib
from transformers import BertTokenizer, BertModel, pipeline
import torch
from summarizer import Summarizer
from collections import Counter
import pandas as pd

In [None]:
# De-contracting english phrases
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Irregular forms ("won't", "can't") are rewritten first, including their
    sentence-initial capitalised spellings, and then the generic suffix rules
    ("n't", "'re", "'s", ...) are applied in a fixed order.

    Note that "'s" is always expanded to " is", so possessives such as
    "Meta's" become "Meta is".

    Args:
        phrase: Text using the plain ASCII apostrophe (').

    Returns:
        The expanded text as a ``str``.
    """
    # Irregular contractions must run before the generic "n't" rule,
    # otherwise "Won't" would be turned into "Wo not".
    irregular_rules = (
        (r"won\'t", "will not"),
        (r"Won\'t", "Will not"),
        (r"can\'t", "can not"),
        (r"Can\'t", "Can not"),
    )
    # Generic suffix rules; the order is significant ("n't" before "'t").
    generic_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in irregular_rules + generic_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return str(phrase)

In [None]:
#text cleaning
def clean_text(text):
    """Normalise scraped article text before further processing.

    Steps, in order: decode HTML entities, drop hyperlinks, strip hash signs
    and double quotes (straight and curly), turn the curly apostrophe into a
    plain one, remove a leading "RT " marker, squeeze runs of three or more
    identical non-digit characters down to two, and finally expand
    contractions via ``decontracted``.

    Args:
        text: Raw text.

    Returns:
        The cleaned text.
    """
    # Decode HTML entities (e.g. "&amp;" -> "&"); this does not strip tags.
    text = html.unescape(text)

    # remove hyperlinks
    text = re.sub(r'https?:\/\/.\S+', "", text)

    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    text = re.sub(r'\"', '', text)
    text = re.sub(r'’','\'',text)
    text = re.sub(r'”','',text)
    text = re.sub(r'“','',text)

    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)

    # A character should not appear more than twice in a row ("soooo" -> "soo").
    # Digits are exempt: collapsing them would silently change figures such as
    # "1000" or "$3,000,000".
    text = re.sub(r'(\D)\1{2,}', r'\1\1', text)

    return decontracted(text)


In [None]:
#removing tags

def remove_tags(text):
    """Return the text content of an XML/HTML fragment with markup removed.

    The fragment is parsed with ``xml.etree.ElementTree``; if parsing fails
    for any reason the input is handed back untouched.
    """
    try:
        root = xml.etree.ElementTree.fromstring(text)
        return ''.join(root.itertext())
    except Exception:
        # Not well-formed XML (or not a string at all): fall back to the input.
        return text

def basicScrapper(url):
    """Download and parse ``url`` with newspaper's ``Article`` and return its text.

    Any exception raised while building, downloading or parsing the article
    is reported on stdout and an empty string is returned instead.
    """
    try:
        page = newspaper.Article(url=url, language='en')
        page.download()
        page.parse()
    except Exception as err:
        print(f'Error in Basic Scrapper using NewsPaper3K: {err}')
        return ''
    else:
        return str(page.text)


In [None]:
#text chunking
def chunk_text(text, max_length=512):
    """Split ``text`` on whitespace into chunks of roughly ``max_length`` characters.

    Words are never split: a single word that is longer than ``max_length``
    becomes a chunk of its own. Empty chunks are never produced.

    Args:
        text: Text to split; ``None`` or an empty string yields ``[]``.
        max_length: Character budget per chunk (characters, not tokens).

    Returns:
        A list of non-empty chunk strings, capped at the first 512 chunks.
    """
    if not text:
        return []

    chunks = []
    current_chunk = ''
    for word in text.split():
        if len(current_chunk) + len(word) < max_length:
            current_chunk += ' ' + word
            continue
        # The word does not fit. Flush what has been gathered so far, but only
        # if there is something to flush: when the very first word is already
        # too long the buffer is still empty and must not become a chunk.
        if current_chunk:
            chunks.append(current_chunk.strip())
        current_chunk = word
    if current_chunk:
        chunks.append(current_chunk.strip())
    # Upper bound on the number of chunks returned (kept from the original).
    return chunks[:512]

In [None]:
#API call for articles
num_art = 30

# Search terms appended to the company name for the top-headlines pass.
_HEADLINE_QUERIES = [
    'Stock Market', 'Stocks', 'Financial Markets', 'Equity Markets', 'Trading',
    'Stock Exchange', 'Share Prices', 'Market Volatility',
    'Market Indices (e.g., S&P 500, Dow Jones Industrial Average, NASDAQ)',
    'Stock Performance', 'Market Trends', 'Economic Indicators', 'Corporate Earnings',
    'Investment Strategies', 'Market Analysis', 'Market Outlook',
    'Initial Public Offering (IPO)', 'Dividends', 'Market Sentiment', 'Market News',
    'MetaTrader', 'NVIDIA', 'Microsoft', 'Amazon', 'Google',
    'Wall Street', 'Bull Market', 'Bear Market', 'Blue Chip Stocks',
    'Penny Stocks', 'Futures Market', 'Options Market', 'ETFs', 'Mutual Funds',
    'Hedge Funds', 'Technical Analysis', 'Fundamental Analysis', 'Day Trading',
    'Swing Trading', 'Long-term Investing', 'Short Selling', 'Market Capitalization',
    'Market Liquidity', 'Market Order', 'Limit Order', 'Stop Order', 'Margin Trading',
    'Risk Management',  # a missing comma here used to fuse this entry with 'META'
    'META',
    'NVDA',
    'MSFT',
    'AMZN',
    'GOOG',
]


def _article_title(article):
    """Return the article title as a string ('' when the field is missing or null)."""
    return article.get('title') or ''


def _is_candidate(article, company, titles):
    """True when the title mentions ``company`` and has not been collected yet."""
    title = _article_title(article)
    return title not in titles and company.lower() in title.lower()


def _accept_article(article, content, titles, output_obj):
    """Record ``article`` with ``content`` if both content and description are non-empty.

    On rejection the article dict is left as it was, so its original
    'content' snippet stays available to a later fallback pass.
    """
    description = article.get('description') or ''
    if not content or not description:
        return False
    article['title'] = _article_title(article)
    article['description'] = description
    article['content'] = content
    output_obj.append(article)
    titles.append(article['title'])
    return True


def _scrape_candidates(articles, company, proxies, titles, output_obj):
    """Fetch full content for matching ``articles`` until ``num_art`` are collected."""
    for article in articles:
        if len(titles) >= num_art:
            break
        if not _is_candidate(article, company, titles):
            continue
        # These fields are not used downstream; tolerate them being absent.
        for unused_key in ('source', 'author', 'urlToImage'):
            article.pop(unused_key, None)
        content = getFullArticleContent(company=company, url=article['url'],
                                        pre_content=article.get('content'), proxies=proxies)
        _accept_article(article, content, titles, output_obj)


def getNews(company, writeCloud=False, proxies='', from_date='2024-03-24', to_date='2024-04-03'):
    """Collect, summarise and sentiment-label up to ``num_art`` articles about ``company``.

    Articles are gathered in up to three passes, stopping as soon as
    ``num_art`` articles have been collected:

    1. top business headlines for "<company> <query>" for each headline query,
       scraped with ``getFullArticleContent``;
    2. a ``get_everything`` search on the company name between ``from_date``
       and ``to_date``, scraped the same way;
    3. the same search results again, scraped with ``basicScrapper`` and
       falling back to the snippet delivered by the API.

    Only articles whose title contains the company name and that end up with
    non-empty content and description are kept. Every kept article receives a
    'summary' and a 'sentiment' key.

    Args:
        company: Company name to search for.
        writeCloud: Accepted for compatibility; not used by this function.
        proxies: Forwarded to ``getFullArticleContent``.
        from_date: Start of the ``get_everything`` date window.
        to_date: End of the ``get_everything`` date window.

    Returns:
        ``(titles, articles)``: the collected titles and the matching article
        dicts, in the same order.

    Raises:
        RuntimeError: If the ``NEWSAPI_KEY`` environment variable is not set.
    """
    # The key is read from the environment so it is neither committed with the
    # notebook nor echoed into cell output.
    api_key = os.environ.get('NEWSAPI_KEY', '').strip()
    if not api_key:
        raise RuntimeError('NewsAPI Error!! Set the NEWSAPI_KEY environment variable '
                           '(generate a key at https://newsapi.org/register)')
    newsapi = NewsApiClient(api_key=api_key)

    company = company.strip()
    output_obj = []
    titles = []

    # Pass 1: top business headlines, one request per query term. Stop issuing
    # requests once enough articles have been collected.
    for query in _HEADLINE_QUERIES:
        if len(titles) >= num_art:
            break
        news = newsapi.get_top_headlines(q=(company+' '+query).strip(), category='business', language='en', country='us')
        _scrape_candidates(news['articles'], company, proxies, titles, output_obj)

    if len(titles) < num_art:
        # Pass 2: general search on the company name.
        news = newsapi.get_everything(q=company, language='en', from_param=from_date, to=to_date)
        _scrape_candidates(news['articles'], company, proxies, titles, output_obj)

        # Pass 3: retry what is still missing with the basic scraper, falling
        # back to the (truncated) snippet that came with the search result.
        for article in news['articles']:
            if len(titles) >= num_art:
                break
            if not _is_candidate(article, company, titles):
                continue
            content = basicScrapper(article['url'])
            if len(content) == 0:
                snippet = article.get('content') or ''
                content = clean_text(remove_tags(snippet.split('… [')[0]))
            _accept_article(article, content, titles, output_obj)

    if not output_obj:
        # Nothing to enrich; skip loading the models.
        return titles, output_obj

    # Both models are created once and reused for every article.
    bert_model = Summarizer()
    sentiment_pipeline = pipeline("sentiment-analysis")

    for article in output_obj:
        # Combine title, description, and content into a single text
        text = article['title'] + ' ' + article['description'] + ' ' + article['content']
        chunks = [chunk for chunk in chunk_text(text) if chunk]

        # Summarise every chunk and join the pieces into the article summary.
        article['summary'] = ' '.join(
            bert_model(chunk, min_length=60, max_length=200) for chunk in chunks
        )

        # Majority vote over the per-chunk sentiment labels.
        sentiment_labels = []
        for chunk in chunks:
            sentiment_labels.extend(res['label'] for res in sentiment_pipeline(chunk))
        if sentiment_labels:
            article['sentiment'] = Counter(sentiment_labels).most_common(1)[0][0]
        else:
            article['sentiment'] = None
    return titles, output_obj

In [None]:
#content to retrieve from API
def _json_ld_article_bodies(soup):
    """Return the 'articleBody' of every JSON-LD NewsArticle found in ``soup``."""
    bodies = []
    for script in soup.find_all('script'):
        if 'json' not in (script.get('type') or ''):
            continue
        try:
            payload = json.loads(script.contents[0])
        except (IndexError, TypeError, ValueError):
            # Empty script tag or invalid JSON: nothing to extract here.
            continue
        # JSON-LD may be a single object or a list of objects.
        candidates = payload if isinstance(payload, list) else [payload]
        for item in candidates:
            if isinstance(item, dict) and item.get('@type') == 'NewsArticle' and 'articleBody' in item:
                bodies.append(str(item['articleBody']))
    return bodies


def getFullArticleContent(company, url, pre_content='', proxies='', timeout=10):
    """Fetch ``url`` and extract the article text that relates to ``company``.

    Extraction order:

    1. 'articleBody' of JSON-LD ``NewsArticle`` script blocks;
    2. every ``<p>`` nested in a ``<div>`` whose text mentions ``company`` and
       is not already contained in the collected content;
    3. only if nothing was found so far: the text of ``div.article-text`` and
       the paragraphs of ``div.article-content`` and
       ``div.entry-content clearfix``.

    Args:
        company: Company name used to filter paragraphs (case-insensitive).
        url: Page to download.
        pre_content: Accepted for compatibility; not used by this function.
        proxies: Passed to ``requests.get``.
        timeout: Timeout passed to ``requests.get`` so that an unresponsive
            host cannot block the run indefinitely.

    Returns:
        The extracted text after ``clean_text``; an empty string if the
        request fails, the status code is not 200, or nothing was extracted.
    """
    try:
        response = requests.get(url, proxies=proxies, timeout=timeout)
    except Exception:
        print(f'URL not reachable: {url}')
        return ''
    if response.status_code != 200:
        return ''

    soup1 = BeautifulSoup(response.content, 'html.parser')

    content = ''
    for body in _json_ld_article_bodies(soup1):
        content += ' ' + body

    # Visit each paragraph once, in document order, keeping only paragraphs
    # that sit inside some <div>. This yields the same paragraphs as walking
    # every div and its descendants, without rescanning nested divs.
    needle = company.lower()
    for para in soup1.find_all('p'):
        if para.find_parent('div') is None:
            continue
        data = str(remove_tags(str(para)))
        if needle in data.lower() and data.lower() not in content.lower():
            content += ' ' + data

    if len(content) == 0:
        # Fallback: well-known article containers, without the company filter.
        for block in soup1.find_all('div', class_='article-text'):
            content += ' ' + str(remove_tags(str(block)))
        for css_class in ('article-content', 'entry-content clearfix'):
            for block in soup1.find_all('div', class_=css_class):
                for para in block.find_all('p'):
                    content += ' ' + str(remove_tags(str(para)))

    return clean_text(content)

In [None]:
#META data retrieval
if __name__=='__main__':
    # Generate an API key from https://newsapi.org/register
    companies = ['Meta']
    data = []
    k = 1
    for company in companies:
        # getNews returns (titles, articles); the link itself is stored on
        # each article, so the URL column must be read from there.
        titles, output_objs = getNews(company=company,  writeCloud=False)
        for article in output_objs:
            data.append({
                'Serial Number': k,
                'Company': company,
                'URL': article['url'],
                'Title': article['title'],
                'Description': article['description'],
                'Content': article['content'],
                'published_at' : article['publishedAt'],
                'Sentiment': article['sentiment']
            })
            k = k + 1

    # Rows are appended without a header line, so repeated runs accumulate.
    df = pd.DataFrame(data)
    df.to_csv('/Users/Desktop/ie 517 final project/untitled folder/finalMETA.csv', mode='a', index=False, header=False)

In [None]:
#Nvidia data retrieval
if __name__=='__main__':
    # Generate an API key from https://newsapi.org/register
    companies = ['Nvidia']
    data = []
    k = 1
    for company in companies:
        # getNews returns (titles, articles); the link itself is stored on
        # each article, so the URL column must be read from there.
        titles, output_objs = getNews(company=company,  writeCloud=False)
        for article in output_objs:
            data.append({
                'Serial Number': k,
                'Company': company,
                'URL': article['url'],
                'Title': article['title'],
                'Description': article['description'],
                'Content': article['content'],
                'published_at' : article['publishedAt'],
                'Sentiment': article['sentiment']
            })
            k = k + 1

    # Rows are appended without a header line, so repeated runs accumulate.
    df = pd.DataFrame(data)
    df.to_csv('/Users/Desktop/ie 517 final project/untitled folder/finalNVIDIA.csv', mode='a', index=False, header=False)

In [None]:
#Microsoft data retrieval
if __name__=='__main__':
    # Generate an API key from https://newsapi.org/register
    companies = ['Microsoft']
    data = []
    k = 1
    for company in companies:
        # getNews returns (titles, articles); the link itself is stored on
        # each article, so the URL column must be read from there.
        titles, output_objs = getNews(company=company,  writeCloud=False)
        for article in output_objs:
            data.append({
                'Serial Number': k,
                'Company': company,
                'URL': article['url'],
                'Title': article['title'],
                'Description': article['description'],
                'Content': article['content'],
                'published_at' : article['publishedAt'],
                'Sentiment': article['sentiment']
            })
            k = k + 1

    # Rows are appended without a header line, so repeated runs accumulate.
    df = pd.DataFrame(data)
    df.to_csv('/Users/Desktop/ie 517 final project/untitled folder/finalMSFT.csv', mode='a', index=False, header=False)

In [None]:
#Amazon data retrieval
if __name__=='__main__':
    # Generate an API key from https://newsapi.org/register
    companies = ['Amazon']
    data = []
    k = 1
    for company in companies:
        # getNews returns (titles, articles); the link itself is stored on
        # each article, so the URL column must be read from there.
        titles, output_objs = getNews(company=company,  writeCloud=False)
        for article in output_objs:
            data.append({
                'Serial Number': k,
                'Company': company,
                'URL': article['url'],
                'Title': article['title'],
                'Description': article['description'],
                'Content': article['content'],
                'published_at' : article['publishedAt'],
                'Sentiment': article['sentiment']
            })
            k = k + 1

    # Rows are appended without a header line, so repeated runs accumulate.
    df = pd.DataFrame(data)
    df.to_csv('/Users/Desktop/ie 517 final project/untitled folder/finalAMZN.csv', mode='a', index=False, header=False)

In [None]:
#Google data retrieval
if __name__=='__main__':
    # Generate an API key from https://newsapi.org/register
    companies = ['Google']
    data = []
    k = 1
    for company in companies:
        # getNews returns (titles, articles); the link itself is stored on
        # each article, so the URL column must be read from there.
        titles, output_objs = getNews(company=company,  writeCloud=False)
        for article in output_objs:
            data.append({
                'Serial Number': k,
                'Company': company,
                'URL': article['url'],
                'Title': article['title'],
                'Description': article['description'],
                'Content': article['content'],
                'published_at' : article['publishedAt'],
                'Sentiment': article['sentiment']
            })
            k = k + 1

    # Rows are appended without a header line, so repeated runs accumulate.
    df = pd.DataFrame(data)
    df.to_csv('/Users/Desktop/ie 517 final project/untitled folder/finalGOOGLE.csv', mode='a', index=False, header=False)