## Find the Relations of the Articles to Micro-Sectors

In [1]:
import pandas as pd
import numpy as np
import json
import random
from datetime import datetime, timedelta

import os
from dotenv import load_dotenv, find_dotenv

import requests
import re
import openai
from google.cloud import bigquery

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SimpleSequentialChain


# Agent imports
from langchain.agents import load_tools
from langchain.agents import initialize_agent

# Tool imports
from langchain.agents import Tool
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.utilities import TextRequestsWrapper

In [2]:
# Locate the .env file and load its contents into the process environment
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Read the three API credentials used by later cells (None when a key is absent)
apikey_openai, GOOGLE_CSE_ID, GOOGLE_API_KEY = (
    os.environ.get(env_name)
    for env_name in ('apikey_openai', 'apikey_GOOGLE_CSE_ID', 'apikey_GOOGLE')
)

In [None]:
# Upper bound on prompt size; it is applied as a string slice (so it counts
# characters) when the raw Completion prompt is built further down.
max_prompt_length = 4000

In [None]:
# Location of the BigQuery service-account key. The BIGQUERY_SERVICE_ACCOUNT_JSON
# environment variable, when set, overrides the original hard-coded path so the
# notebook is not tied to a single machine.
service_account_path = os.getenv(
    'BIGQUERY_SERVICE_ACCOUNT_JSON',
    r'C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\service path\newspulse-1b847-92ee0b8c89f0.json',
)
client = bigquery.Client.from_service_account_json(service_account_path)

# Reference to the "stocklist" dataset
dataset_ref = client.dataset("stocklist")

# Reference to the "us-stocklist" table inside that dataset
table_ref = dataset_ref.table("us-stocklist")

# API request - fetch the table
table = client.get_table(table_ref)

# Load table data to a DataFrame
df = client.list_rows(table).to_dataframe()

# Distinct raw values of the 'MicroSectors' column (a single value may still
# hold several comma-separated sectors; those are split in the next cell)
microsectors = df['MicroSectors'].unique()

In [None]:
# Raw 'MicroSectors' entries, one per table row
micro_sectors = df['MicroSectors'].tolist()

# Split every comma-separated entry into individual sector names and de-duplicate.
# Non-string cells (e.g. missing values) and empty fragments are skipped so the
# split cannot crash or emit '' as a sector. Sorting gives the list a stable
# order; plain set iteration order for strings can differ between kernel
# sessions, which would change the prompt text from run to run.
unique_micro_sectors = sorted({
    sector.strip()
    for sectors in micro_sectors
    if isinstance(sectors, str)
    for sector in sectors.split(',')
    if sector.strip()
})

# DELETE IN PRODUCTION
#micro_sectors = random.choice(unique_micro_sectors)

In [None]:
# Import articles. The ARTICLES_CSV environment variable, when set, overrides
# the original hard-coded location so the notebook can run on other machines.
articles_csv_path = os.getenv(
    'ARTICLES_CSV',
    r"C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\articles.csv",
)
articles_df = pd.read_csv(articles_csv_path)

In [None]:
# LLM wrapper created with temperature 0. Note that the name `llm` is
# re-assigned by several later cells, so this instance is only used until then.
llm = OpenAI(model_name='gpt-4-32k',temperature=0, openai_api_key=apikey_openai)

In [None]:
# Google Custom Search wrapper, configured with the keys read from the environment
search = GoogleSearchAPIWrapper(google_api_key=GOOGLE_API_KEY, google_cse_id=GOOGLE_CSE_ID)

# NOTE(review): this rebinds the name `requests`, hiding the `requests` module
# imported in the first cell. A distinct name would avoid the clash - confirm
# that no later code still needs the module before renaming.
requests = TextRequestsWrapper()

In [None]:
# Sample article text that the prompts in the following cells are filled with.
# NOTE(review): the lines about the NYC hiring-software law and the
# "JULY 6, 2023" timestamp look like page residue unrelated to the story -
# confirm whether they should remain in the text sent to the model.
article = """
Comedian Sarah Silverman and two authors have filed copyright infringement lawsuits against Meta Platforms and OpenAI for allegedly using their content without permission to train artificial intelligence language models.

The proposed class action lawsuits filed by Silverman, Richard Kadrey and Christopher Golden in San Francisco federal court Friday allege Facebook parent company Meta and ChatGPT maker OpenAI used copyrighted material to train chat bots.

Meta and OpenAI, a private company backed by Microsoft, did not immediately respond to requests for comment on Sunday.

The lawsuits underscore the legal risks developers of chat bots face when using troves of copyrighted material to create apps that deliver realistic responses to user prompts.

Silverman, Kadrey and Golden allege Meta and OpenAI used their books without authorization to develop their so-called large language models, which their makers pitch as powerful tools for automating tasks by replicating human conversation.


NYC law requires companies to prove A.I. hiring software is not racist or sexist
JULY 6, 202302:08
In their lawsuit against Meta, the plaintiffs allege that leaked information about the company’s artificial intelligence business shows their work was used without permission.
"""

In [None]:
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser


# First turn: ask which of the known micro-sectors the article directly impacts.
# {unique_micro_sectors} and {article} are filled in with str.format below.
input_template = """
Act as a news analyst/financial analyst/economist, and please analyze ONLY WHICH OF THESE business sector(s): {unique_micro_sectors}

would be directly impacted from the information in this news article: {article}.


PLEASE THOROUGHLY EXAMINE THE ENTIRE LIST ABOVE OF BUSINESS SECTORS AGAINST THE CONTENTS OF THE ARTICLE.
There may be more than one business sector impacted so please include ALL DIRECTLY impacted business sector(s) from the list.

PLEASE Just return a PYTHON LIST of IMPACTED BUSINESS SECTORS, do not add ANYTHING ELSE, NO INTERPRETATION AND IF NO 
BUSINESS SECTOR(S) ARE IMPACTED JUST RETURN THE WORD 'NONE'.

Your response should be a list of comma separated values, eg: `foo, bar, baz` or "NONE"
"""


# Second turn: ask for a low/medium/high score per impacted sector.
# The example is a valid Python dict literal (the earlier version used square
# brackets and an unmatched backtick) because a later cell evaluates the reply
# as a dictionary; its casing also matches the requested low/medium/high scale.
magnitude_template = """
Now please assign a magnitude score for each impacted business sector above as low/medium/high. 

PLEASE RETURN THE IMPACTED BUSINESS SECTOR(S) ALONG WITH THEIR MAGNITUDE SCORE IN A DICTIONARY.

Your response should be a python dictionary, eg: `{"AI": "low", "Banking": "high"}`
"""


# Third turn: ask whether the article names specific companies (Yes/No).
company_specific_template = """
Please answer 'Yes' if this article's content names any specific companies or 'No' if not.
"""


# NOTE(review): 'gpt-3.5-turbo' is a chat model - confirm that the
# completion-style `OpenAI` wrapper is the intended class here.
llm = OpenAI(temperature=0, model_name='gpt-3.5-turbo',openai_api_key=apikey_openai)

# One conversation with buffer memory, so the second and third prompts can
# refer back to the article and to the sectors returned by the first prompt.
sector_analysis = ConversationChain(
    llm=llm,
    verbose=True,
    memory=ConversationBufferMemory(),
)

# Raw model replies (strings); they are parsed in a later cell.
ListofImpactedSectors = sector_analysis.run({"input": input_template.format(article=article, unique_micro_sectors=unique_micro_sectors)})

magnitude_dict = sector_analysis.run(magnitude_template)

specific_company = sector_analysis.run(company_specific_template)


In [None]:
# Follow-up question sent through the same conversation chain, whose buffer
# memory holds the earlier exchange about the article.
sector_analysis.run("Why is social networking not directly impacted by this news?")

In [None]:
# Display the model's raw reply to the company-specific (Yes/No) prompt
specific_company

In [None]:
import ast


def _parse_sector_list(reply):
    """Convert the comma-separated sector reply into a list of sector names.

    Returns an empty list when the reply is the 'NONE' sentinel that the
    prompt requests for "no sector impacted". Surrounding whitespace is
    removed from each name and empty fragments are dropped.
    """
    cleaned = reply.strip()
    if cleaned.strip('`"\'').upper() == "NONE":
        return []
    return [name.strip() for name in cleaned.split(",") if name.strip()]


def _parse_magnitudes(reply):
    """Convert the dictionary-literal reply into a dict.

    Uses ast.literal_eval, which only accepts Python literals, so text
    produced by the model cannot execute arbitrary code the way eval() could.
    Raises ValueError or SyntaxError when the reply is not a valid literal.
    """
    return ast.literal_eval(reply.strip())


# The isinstance guards keep this cell re-runnable: once the names hold parsed
# values, running the cell again leaves them untouched instead of raising.
if isinstance(ListofImpactedSectors, str):
    ListofImpactedSectors = _parse_sector_list(ListofImpactedSectors)
if isinstance(magnitude_dict, str):
    magnitude_dict = _parse_magnitudes(magnitude_dict)

In [None]:
# Show the impacted-sector list and the magnitude mapping, one per line
print(ListofImpactedSectors, magnitude_dict, sep="\n")

In [None]:
# Fixed: the package is `langchain` (was `language`), the class is
# ConversationBufferMemory (was `ConversationBuggerMemory`), and the keyword
# is `output_key` (was `output_keys`).
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI

# Buffer memory that exposes the history under 'chat_history' as message
# objects and stores the chain output found under the 'answer' key.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')



In [None]:
# Fixed: the package is `langchain` (was `language`), the class is
# ConversationBufferMemory (was `ConversationBuggerMemory`), and the keyword
# is `output_key` (was `output_keys`).
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI

# NOTE(review): this memory object is created but the chain below is given a
# fresh ConversationBufferMemory(), as in the original cell - confirm which
# one is intended.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')

llm = OpenAI(temperature=0, openai_api_key=apikey_openai)

# `article` and `unique_micro_sectors` are prompt inputs, not chain options,
# so they are no longer passed to the ConversationChain constructor.
conversation = ConversationChain(
    llm=llm,
    verbose=True,
    memory=ConversationBufferMemory(),
)

# The undefined `prompt_template` is replaced by `input_template` from the
# sector-analysis cell, assumed to be the intended template because it takes
# the same two placeholders - confirm. run() is called with a single
# positional prompt; it does not accept positional and keyword inputs together.
out1 = conversation.run(
    input_template.format(article=article, unique_micro_sectors=unique_micro_sectors)
)


In [None]:
import langchain
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

llm = OpenAI(temperature=0, openai_api_key=apikey_openai)

# Split the article on blank lines / newlines into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=5000, chunk_overlap=350)
docs = text_splitter.create_documents([article])

# Get your chain ready to use
chain = load_summarize_chain(llm=llm, chain_type='map_reduce', verbose=True)

output = chain.run(docs)

# Pass the summarized text to another prompt.
# NOTE: `article` is rebound to the summary here (as in the original cell), so
# later cells see the summary rather than the full text.
article = output
# The undefined `prompt_template` is replaced by `input_template` from the
# sector-analysis cell (assumed to be the intended template - confirm), and the
# non-existent `langchain.ask` helper by a direct call to the LLM.
prompt = input_template.format(article=article, unique_micro_sectors=unique_micro_sectors)
response = llm(prompt)

In [None]:
# Display the text returned by the summarization chain
output

In [None]:
import langchain
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Summarize the article with a 'refine' summarization chain. The original cell
# called `langchain.refine`, `langchain.extract_summary` and `langchain.ask`,
# none of which exist in the package; `chain.run` already returns the summary
# text, so no separate extraction step is needed.
chain = load_summarize_chain(llm=llm, chain_type='refine')
refine_docs = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"], chunk_size=5000, chunk_overlap=350
).create_documents([article])
summary = chain.run(refine_docs)

# Pass the summarized text to the sector prompt. `input_template` (from the
# sector-analysis cell) stands in for the undefined `prompt_template` - confirm
# that this is the intended template.
prompt = input_template.format(article=summary, unique_micro_sectors=unique_micro_sectors)
response = llm(prompt)

# Print the response
print(response)

In [None]:
# Set up OpenAI API credentials
openai.api_key = apikey_openai

# Substitute the placeholders before truncating. The original sent the raw
# template, so the model received the literal text "{unique_micro_sectors}"
# and "{article}" instead of the data.
# NOTE(review): the slice cuts from the end, so a very long sector list can
# push the article and the closing instructions out of the prompt - confirm
# the truncation strategy.
completion_prompt = """
        Act as a financial analyst/news analyst/economist, now please tell which of these these business sectors: {unique_micro_sectors} are impacted from the information in this news article: {article}. Please respond only with the impacted business sector(s) that have been listed here and attach a score for magnitutude (low, medium, high) for the level of impact for each impacted business sector.
    """.format(unique_micro_sectors=unique_micro_sectors, article=article)[:max_prompt_length]

response = openai.Completion.create(
    engine='text-davinci-003',
    prompt=completion_prompt,
    # Prompt and completion share the model's context window; the previous
    # value of 4097 asked for the whole window and left no room for the prompt.
    max_tokens=512,  # Adjust as needed
    temperature=0,  # Adjust as needed
    n=1  # Adjust as needed
    )

In [None]:
# Build a zero-shot ReAct agent over `toolkit` and have it return its
# intermediate steps alongside the final answer.
# NOTE(review): `toolkit` is not assigned in any cell visible in this notebook;
# it presumably wrapped the `search` / `requests` helpers as Tool objects -
# confirm and define it before this cell.
agent = initialize_agent(toolkit, llm, agent="zero-shot-react-description", verbose=True, return_intermediate_steps=True)


# NOTE(review): `prompt_template` is likewise not defined in the visible cells;
# `input_template` takes the same two placeholders - confirm which is intended.
response = agent({"input": prompt_template.format(article=article, unique_micro_sectors=unique_micro_sectors)})
response['output']

In [None]:
def search_list(list, value):
  """
  Report whether any element of a collection equals the given value.

  Args:
    list: The items to search. Any iterable is accepted, not only sequences
      that support len() and indexing. The parameter keeps its original name
      for keyword-argument compatibility even though it hides the built-in
      `list` inside this function.
    value: The value to search for (compared with ==).

  Returns:
    True if at least one element equals `value`, otherwise False (including
    for an empty collection).
  """
  # any() stops at the first match, like the original loop's early break
  return any(item == value for item in list)


# Check whether "Discount Airlines" is among the extracted micro-sectors
search_list(unique_micro_sectors, "Discount Airlines")

In [None]:
# Display the full list of unique micro-sectors
unique_micro_sectors