# LLM Prompts and Chains

### Imports

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import os
import time

import openai
from dotenv import load_dotenv, find_dotenv


# Populate os.environ from the local .env file located by find_dotenv()
_ = load_dotenv(dotenv_path=find_dotenv())
# Indexing os.environ (rather than .get) raises KeyError right here if the key is missing
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain.callbacks import get_openai_callback

from langchain.output_parsers import ResponseSchema
from langchain.chains import SequentialChain, LLMChain
from langchain.output_parsers import StructuredOutputParser

In [3]:
# Model identifier handed to ChatOpenAI when the shared LLM client is created
llm_model = "gpt-3.5-turbo-0125"

### Output Response Schema

In [4]:
# NOTE: these descriptions are rendered into `format_instructions`, which is
# passed to the prompt as a plain *value* and is never run through template
# substitution. A "{company}" placeholder written here would therefore reach
# the model as literal text, so the descriptions refer to the company in words.

# Sentiment Extraction
sentiment_schema = ResponseSchema(name="sentiment",
                                description='''Determine the degree of sentiment of the news article with respect to the company under analysis.
Output in the range of (-1, 1) where -1 is Negative, and 1 is Positive.''')

# Evidence Extraction
evidence_schema = ResponseSchema(name="evidence",
                                description='''Extract any sentences that provide evidence for the extracted sentiment, and output as a Python List.''')

# Stock Movement Extraction
stock_movement_schema = ResponseSchema(name="stock_movement",
                                description='''Given your expertise in the field, determine stock movement of the company under analysis.
Output as Up or Down.''')

# Explanation Extraction
# (the "explaination" spelling is kept on purpose: it is the key under which
# the parsed value is stored, and therefore the column name in the saved output)
explaination_schema = ResponseSchema(name="explaination",
                                description='''Explain your thoughts and thinking process. Output as string.''')

# Order here is the order in which the fields are listed in the format instructions
response_schemas = [
    sentiment_schema,
    evidence_schema,
    stock_movement_schema,
    explaination_schema
]

# Output Parsers and Format Instructions for LLM
# `output_parser` is reused later to turn the model's reply back into a dict;
# `format_instructions` is the text block injected into the second prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

### LLM Chains

In [5]:
# 1st Chain: ask the model which single company the article is mainly about.
# `llm` is the chat model client shared by both chains; it is created with
# temperature=0.0 and the model name stored in `llm_model`.
llm = ChatOpenAI(temperature=0.0, model=llm_model)

# Company Extraction Prompt
# The only template variable is {text}, the article supplied at invoke time.
first_prompt = ChatPromptTemplate.from_template(
    '''
    You are an expert in the field of finance news, stock market and trading. Which company does the news article talk about majorly? Output only one company name.
    article: {text}
    '''
)

# Chain 1: the reply is stored under the 'company' output key, which is the
# same name the second prompt uses for its {company} template variable.
chain_one = LLMChain(llm=llm, prompt=first_prompt, output_key='company')

In [6]:
# 2nd Chain: sentiment / evidence / stock-movement / explanation for one company.
# Template variables: {company}, {text} and {format_instructions}.
# The four task labels mirror the ResponseSchema fields defined earlier in the
# notebook. NOTE(review): the label here is capitalised "Explaination" while the
# schema key is lowercase "explaination" -- confirm the mismatch is harmless.
# The backslash right after the opening quotes keeps the template from
# starting with a blank line.
second_prompt = ChatPromptTemplate.from_template(
    """\
    You are an expert in analyzing financial news, stock market and trading for {company} given below. For the following financial news article, do the following:

    sentiment: Determine the degree of sentiment of the news article with respect to {company}.
    Output in the range of (-1, 1) where -1 is Negative, and 1 is Positive.

    evidence: Extract any sentences that provide evidence for the extracted sentiment, and output as a Python List.

    stock_movement: Given your expertise in the field, determine stock movement of the {company}.
    Output as Up or Down.

    Explaination: Explain your thoughts and thinking process. Output as string.

    article: {text}
    {format_instructions}
    """
)

# chain 2: reuses the same `llm` client as chain one; the raw reply is stored
# under 'json_output' and is parsed later with `output_parser`.
chain_two = LLMChain(llm=llm, prompt=second_prompt, output_key='json_output')

In [7]:
# overall_chain combines chain_one and chain_two, listed in that order.
# Inputs the caller must supply: "text" and "format_instructions".
# Outputs exposed to the caller: "company" (chain_one's output key) and
# "json_output" (chain_two's output key).
overall_chain = SequentialChain(
    chains=[chain_one, chain_two],
    input_variables=["text", "format_instructions"],
    output_variables=["company", "json_output"],
    verbose=False
)

In [8]:
# with get_openai_callback() as cb:
#     outputs = overall_chain.invoke({
#         'text': df.iloc[20, -1],
#         'format_instructions': format_instructions
#     })
#     print(cb.successful_requests)
#     print(cb.total_cost)
#     print(cb.total_tokens)
#     print(cb.prompt_tokens)
#     print(cb.completion_tokens)

### Batch Requests per minute

In [9]:
# Budgets used to size and pace the batches of API calls.
# NOTE(review): presumably chosen just below the account's actual OpenAI rate
# limits -- confirm against the current account tier.

# Upper bound on the summed token estimate of one batch
TOKENS_PER_MINUTE = 58000
# Upper bound on API requests issued within one batch
REQUESTS_PER_MINUTE = 250
# Daily request budget (not enforced anywhere in the visible notebook code)
REQUESTS_PER_DAY = 9000
# Original, misspelled name kept as an alias so existing references keep working
REQUESTA_PER_DAY = REQUESTS_PER_DAY

# Minimum number of seconds between the start of one batch and the next
REQUEST_INTERVAL = 61

In [10]:
# Articles to analyse; a 'Content' column holding the article text is required
df = pd.read_csv(filepath_or_buffer='../data/final/artilces_with_content.csv')

# Multiplier applied to the space-separated word count to estimate token usage
word2token = 3.6
df['est_tokens'] = (
    df['Content']
    .map(lambda text: len(text.split(' ')))
    .mul(word2token)
)

In [11]:
# Split the articles into consecutive batches that each respect both the
# per-minute token budget and the per-minute request budget.

# Every article triggers two API calls: one for chain_one, one for chain_two
REQUESTS_PER_ARTICLE = 2
# Largest number of articles whose calls still fit in the request budget
max_articles_per_batch = max(REQUESTS_PER_MINUTE // REQUESTS_PER_ARTICLE, 1)

batches = []
# Walk a separate reference so `df` stays intact and this cell can be re-run
remaining = df

while len(remaining) > 0:
    cum_sum = remaining['est_tokens'].cumsum()
    # Number of leading articles whose running token estimate stays under budget
    num_requests = cum_sum[cum_sum < TOKENS_PER_MINUTE].size

    # Always take at least one article: a single article whose estimate alone
    # exceeds the token budget would otherwise yield empty batches forever.
    # (Such a batch is still reported by the sanity check that follows.)
    # Never take more than the request budget allows.
    batch_size = min(max(num_requests, 1), max_articles_per_batch)

    batches.append(remaining.iloc[:batch_size, :])
    remaining = remaining.iloc[batch_size:, :]

In [12]:
# Sanity Check: report every batch that breaks the row-count or token budget
for chunk in batches:
    over_budget = len(chunk) > REQUESTS_PER_MINUTE
    if not over_budget:
        # only sum the token estimates when the row count alone was acceptable
        over_budget = chunk['est_tokens'].sum() > TOKENS_PER_MINUTE
    if over_budget:
        print('Something is Wrong')

### Run All Batch Requests

In [13]:
# Run the two-step chain over every article, batch by batch, pausing between
# batches so that consecutive batches start at least REQUEST_INTERVAL seconds
# apart. Results and failures are checkpointed to disk while running.

df_outputs = []   # one dict per successfully processed article
not_worked = []   # GOIDs of articles whose chain call or output parsing failed


def _save_checkpoint():
    """Write all results and failed GOIDs collected so far to disk."""
    pd.DataFrame.from_dict(df_outputs).to_csv('../data/final/llm_outputs.csv', index=False)
    np.array(not_worked).tofile('../data/final/did_not_work.txt', sep=',')


for batch in batches:
    # Restart the clock for every batch. Resetting it only after a sleep means
    # one slow batch disables the throttle for all batches that follow.
    start_time = time.time()

    # `i` is the position inside the batch, not the DataFrame index label,
    # so the checkpoint cadence does not depend on how the frame is indexed.
    for i, (_, row) in enumerate(tqdm(batch.iterrows(), total=len(batch))):
        try:
            # Double quotes outside, single inside: valid on every Python 3 version
            article_text = f"Title: {row['Title']}, Content: {row['Content']}"
            outputs = overall_chain.invoke({
                'text': article_text,
                'format_instructions': format_instructions
            })
            output_dict = {'GOID': row['GOID'], 'company': outputs['company']}
            output_dict.update(output_parser.parse(outputs['json_output']))
            df_outputs.append(output_dict)
        except Exception as exc:
            # Best-effort: remember the article and keep going, but say why it
            # failed. `Exception` lets Ctrl-C / kernel interrupt still stop the run.
            not_worked.append(row['GOID'])
            tqdm.write(f"GOID {row['GOID']} failed: {exc!r}")

        if i % 2 == 0:
            _save_checkpoint()

    # Flush once per batch so the final rows are never left unsaved
    _save_checkpoint()

    time_elapsed = time.time() - start_time
    if time_elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - time_elapsed)

100%|██████████| 23/23 [01:23<00:00,  3.65s/it]
100%|██████████| 23/23 [01:17<00:00,  3.37s/it]
100%|██████████| 19/19 [00:58<00:00,  3.06s/it]
100%|██████████| 20/20 [01:07<00:00,  3.39s/it]
100%|██████████| 27/27 [01:27<00:00,  3.23s/it]
100%|██████████| 23/23 [01:13<00:00,  3.20s/it]
100%|██████████| 25/25 [01:19<00:00,  3.18s/it]
100%|██████████| 22/22 [01:11<00:00,  3.23s/it]
100%|██████████| 21/21 [01:10<00:00,  3.35s/it]
100%|██████████| 9/9 [00:29<00:00,  3.23s/it]
100%|██████████| 18/18 [00:57<00:00,  3.18s/it]
100%|██████████| 6/6 [00:20<00:00,  3.37s/it]
100%|██████████| 17/17 [00:58<00:00,  3.45s/it]
100%|██████████| 14/14 [00:43<00:00,  3.13s/it]
100%|██████████| 20/20 [01:01<00:00,  3.07s/it]
100%|██████████| 14/14 [00:46<00:00,  3.34s/it]
100%|██████████| 21/21 [01:03<00:00,  3.03s/it]
100%|██████████| 16/16 [00:52<00:00,  3.25s/it]
100%|██████████| 25/25 [01:22<00:00,  3.29s/it]
100%|██████████| 29/29 [01:28<00:00,  3.04s/it]
100%|██████████| 29/29 [01:36<00:00,  3.31s/