## Import and Organize Article Data

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta

import os
from dotenv import load_dotenv, find_dotenv

import requests
import re
import openai
from google.cloud import bigquery

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SimpleSequentialChain

from apikey import apikey_news_source
from apikey import apikey_openai


# DELETE FOR PRODUCTION
import random

In [2]:
# Use find_dotenv to locate the file
dotenv_path = find_dotenv()

# Load the .env file
load_dotenv(dotenv_path)

# Fetch the specific keys
apikey_openai = os.getenv('apikey_openai')
GOOGLE_CSE_ID = os.getenv('apikey_GOOGLE_CSE_ID')
GOOGLE_API_KEY = os.getenv('apikey_GOOGLE')

In [2]:
# Create an empty DataFrame to store the news articles
articles_df = pd.DataFrame(columns=['Source', 'Author', 'Title', 'Description', 'URL', 'Published At', 'Content'])

# Iterate over unique micro-sectors
# Format the API URL for the specific" micro-sector   
api_url = f"https://newsapi.org/v2/top-headlines?country=us&from=2023-05-29&apiKey={apikey_news_source}"
    
    
# Make the API request
response = requests.get(api_url)  
    
# Process the response and extract relevant information
articles = response.json().get('articles')

if articles:
    # Extract article information and add micro-sector column
    articles_data = [(article.get('source').get('name'), article.get('author'), 
                    article.get('title'), article.get('description'), 
                    article.get('url'), article.get('publishedAt'), article.get('content'))  for article in articles]
        
    # Convert the list of articles to a DataFrame
    articles_sector_df = pd.DataFrame(articles_data, columns=['Source', 'Author', 'Title',
                                                            'Description', 'URL', 'Published At', 'Content'])
        
        # Append the articles for the current micro-sector to the main DataFrame
    headlines_articles_df = pd.concat([articles_df, articles_sector_df], ignore_index=True)

In [3]:
# All Macro keywords in a single list
keywords = [
    'finance', 'economy', 'banking', 'inflation', 'GDP', 'Unemployment', 
    'Interest Rate', 'Federal Reserve', 'Fiscal Policy', 'Monetary Policy', 
    'Trade Deficit', 'Balance of Payments', 'Foreign Exchange', 'Derivatives', 
    'Asset Management', 'Hedge Funds', 'Private Equity', 'Venture Capital', 
    'IPO', 'Mergers and Acquisitions', 'Credit Ratings', 'credit swaps','Stock Market', 
    'Recession', 'Economic Growth', 'Inflation Rate', 'Consumer Spending', 
    'Consumer Confidence', 'Retail Sales', 'Industrial Production', 'Consumer Debt',
    'Housing Market', 'Business Cycle', 'Central Bank', 'NFP', 'Unemployment',
    'employment', 'crypto','cryptocurrency', 'Bitcoin', 'BTC','blockchain','ethereum',
    'S&P 500', 'Nasdaq','Dow Jones','crypto exchange','supply','demand','cash-flow','revenue'
    'cap-ex', 'costs'
]


# Get today's date
today = datetime.now()

# Subtract one day
one_day_ago = today - timedelta(days=1)

# Convert it to a string in 'YYYY-MM-DD' format
date_str = one_day_ago.strftime('%Y-%m-%d')
#date_str = today

In [4]:
# Maximum number of keywords per request
max_keywords_per_request = 5  # Adjust this to the maximum allowed by your API

# Number of requests
num_requests = int(np.ceil(len(keywords) / max_keywords_per_request))

# Distribute keywords across requests
requests_keywords = np.array_split(keywords, num_requests)

# Generate URLs
base_url = 'https://newsapi.org/v2/everything?q={}&from={}&sortBy=popularity&apikey={}'
urls = [base_url.format('%20OR%20'.join(kw_list), date_str, apikey_news_source) for kw_list in requests_keywords]

macro_df = pd.DataFrame(columns=['Source', 'Author', 'Title', 'Description', 'URL', 'Published At', 'Content'])

for url in urls:
    response = requests.get(url).json()
    df_temp = pd.json_normalize(response['articles'])
    macro_df = pd.concat([macro_df, df_temp], ignore_index=True)

#macro_df = macro_df.drop_duplicates().reset_index(drop=True)

In [50]:
macro_df = macro_df[['source.name', 'author', 'title', 'description', 'url', 'publishedAt', 'content']].rename(columns={'source.name': 'Source', 'author': 'Author', 'title': 'Title', 'description':'Description', 'url':'URL', 'publishedAt':'Published At', 'content':'Content'})


In [52]:
# Path to your service account key
service_path = r"C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\service path\newspulse-1b847-92ee0b8c89f0.json"

# Create a "Client" object
client = bigquery.Client.from_service_account_json(service_path)

# Create a "Client" object
#client = bigquery.Client()

# Construct a reference to the "dataset_id" dataset
dataset_ref = client.dataset("stocklist")  # replace "dataset_id" with your dataset ID

# Construct a reference to the "table_id" table
table_ref = dataset_ref.table("us-stocklist")  # replace "table_id" with your table ID

# API request - fetch the table
table = client.get_table(table_ref)

# Load table data to a DataFrame
microsectors_df = client.list_rows(table).to_dataframe()

In [53]:
# Extract micro-sectors from merged dataframe
micro_sectors = microsectors_df['Micro-sectors'].tolist()

# Remove duplicate micro-sectors
unique_micro_sectors = list(set([sector.strip() for sectors in micro_sectors for sector in sectors.split(',')]))

In [54]:
# DELETE IN PRODUCTION
# To pick a random item
unique_micro_sectors = random.choice(unique_micro_sectors)

In [55]:
# Create an empty DataFrame to store the news articles
sector_articles_df = pd.DataFrame(columns=['Source', 'Author', 'Title', 'Description', 'URL', 'Published At', 'Content'])

# Iterate over unique micro-sectors
for sector in unique_micro_sectors:
    # Format the API URL for the specific micro-sector
    #api_url = f"https://newsapi.org/v2/everything?q={sector}&apiKey={apikey_news_source}"
    api_url = f"https://newsapi.org/v2/everything?q={sector}&from=2023-05-28&to=2023-06-28&apiKey={apikey_news_source}"
    
    # Make the API request
    response = requests.get(api_url)
    
    # Process the response and extract relevant information
    articles = response.json().get('articles')
    if articles:
        # Extract article information and add micro-sector column
        articles_data = [(article.get('source').get('name'), article.get('author'), 
                          article.get('title'), article.get('description'), 
                          article.get('url'), article.get('publishedAt'), article.get('content'))  for article in articles]
        
        # Convert the list of articles to a DataFrame
        articles_sector_df = pd.DataFrame(articles_data, columns=['Source', 'Author', 'Title',
                                                                   'Description', 'URL', 'Published At', 'Content'])
        
        # Append the articles for the current micro-sector to the main DataFrame
        sector_articles_df = pd.concat([sector_articles_df, articles_sector_df], ignore_index=True)

None
None
None
None


In [56]:
all_articles_df = pd.concat([headlines_articles_df, macro_df, sector_articles_df], axis=0, ignore_index=True)

all_articles_df = all_articles_df.drop_duplicates().reset_index(drop=True)

In [57]:
all_articles_df

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Content
0,The Washington Post,Dan Stillman,D.C.-area forecast: Sunshine and some smoke to...,Shower and storm chances return late Friday an...,https://www.washingtonpost.com/weather/2023/06...,2023-06-28T12:00:00Z,Comment on this story\r\nComment\r\n* Code Ora...
1,Investor's Business Daily,Investor's Business Daily,Dow Jones Futures Rise But Techs Fall As AI Ch...,,https://www.investors.com/market-trend/stock-m...,2023-06-28T11:54:00Z,
2,CBS News,Emmet Lyons,Kevin Spacey's U.K. trial on sexual assault ch...,Oscar-winning actor Kevin Spacey faces a dozen...,https://www.cbsnews.com/news/kevin-spacey-uk-t...,2023-06-28T11:34:00Z,London – Kevin Spacey's trial began Wednesday ...
3,CNN,Julia Buckley,Air passenger gets plane all to himself after ...,Everyone else made other plans when the flight...,https://www.cnn.com/travel/solo-air-passenger-...,2023-06-28T11:19:00Z,If youve ever thought that having an empty sea...
4,WABC-TV,,Travel woes worsen for passengers stranded at ...,Hundreds of flights are already canceled as of...,https://abc7ny.com/flight-cancellations-lga-ne...,2023-06-28T09:11:15Z,"EAST ELMHURST, Queens (WABC) -- Hundreds of fl..."
...,...,...,...,...,...,...,...
788,Eurogamer.net,John Linneman,Mortal Kombat 1's network test shows promise o...,With only a few months between its announcemen...,https://www.eurogamer.net/digitalfoundry-2023-...,2023-06-28T08:00:00Z,With only a few months between its announcemen...
789,Eurogamer.net,Ed Nightingale,"Alan Wake, Call of Duty arrive on PlayStation ...",July's PlayStation Plus Essential games have l...,https://www.eurogamer.net/alan-wake-call-of-du...,2023-06-28T11:40:42Z,July's PlayStation Plus Essential games have l...
790,Eurogamer.net,Ed Nightingale,Final Fantasy 16 sells 3m copies worldwide,Final Fantasy 16 has sold 3m copies worldwide ...,https://www.eurogamer.net/final-fantasy-16-sel...,2023-06-28T13:46:17Z,Final Fantasy 16 has sold 3m copies worldwide ...
791,Eurogamer.net,Ed Nightingale,Diablo 4 update will make grinding easier for ...,Blizzard has released its latest patch update ...,https://www.eurogamer.net/diablo-4-update-will...,2023-06-28T10:40:25Z,Blizzard has released its latest patch update ...


In [59]:
all_articles_df.to_csv('all_articles.csv', index=False)