<a href="https://colab.research.google.com/github/dibyanshupatnaik/US-Bank-Capstone/blob/main/Llama_2_Financial_alpaca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# presentation layer code

import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

def mm(graph):
  """Render a Mermaid diagram inline via the mermaid.ink image service.

  The diagram source is UTF-8 encoded, base64 encoded with the URL-safe
  alphabet, appended to ``https://mermaid.ink/img/`` and displayed as an
  image referenced by that URL.

  Args:
    graph: Mermaid diagram source text.
  """
  # UTF-8 instead of ASCII: labels with non-ASCII characters would otherwise
  # raise UnicodeEncodeError before anything is rendered.
  graph_bytes = graph.encode("utf8")
  # URL-safe alphabet: the standard alphabet can emit "/" and "+", and the
  # encoded text is used as a URL path segment.
  encoded = base64.urlsafe_b64encode(graph_bytes).decode("ascii")
  display(Image(url="https://mermaid.ink/img/" + encoded))

def genai_app_arch():
  """Display the application-architecture flowchart.

  The diagram links Users -> Applications -> Platforms <-> Llama 2, with an
  optional Frameworks node between Applications and Platforms.
  """
  diagram = """
  flowchart TD
    A[Users] --> B(Applications e.g. mobile, web)
    B --> |Hosted API|C(Platforms e.g. Custom, HuggingFace, Replicate)
    B -- optional --> E(Frameworks e.g. LangChain)
    C-->|User Input|D[Llama 2]
    D-->|Model Output|C
    E --> C
    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

def rag_arch():
  """Display the flowchart linking user prompts, a framework, external data and Llama 2."""
  diagram = """
  flowchart TD
    A[User Prompts] --> B(Frameworks e.g. LangChain)
    B <--> |Database, Docs, XLS|C[fa:fa-database External Data]
    B -->|API|D[Llama 2]
    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

def llama2_family():
  """Display the Llama 2 model family graph (7b / 13b / 70b and their chat variants)."""
  diagram = """
  graph LR;
      llama-2 --> llama-2-7b
      llama-2 --> llama-2-13b
      llama-2 --> llama-2-70b
      llama-2-7b --> llama-2-7b-chat
      llama-2-13b --> llama-2-13b-chat
      llama-2-70b --> llama-2-70b-chat
      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

def apps_and_llms():
  """Display the left-to-right chain users -> apps -> frameworks -> platforms -> Llama 2."""
  diagram = """
  graph LR;
    users --> apps
    apps --> frameworks
    frameworks --> platforms
    platforms --> Llama 2
    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

import ipywidgets as widgets
from IPython.display import display, Markdown

# Password-style input widget labelled "API_KEY:", created empty and enabled.
API_KEY = widgets.Password(description='API_KEY:', value='', placeholder='', disabled=False)

def md(t):
  """Display the text `t` as rendered Markdown in the notebook output."""
  rendered = Markdown(t)
  display(rendered)

def bot_arch():
  """Display the chatbot graph: prompt, input/output safety, context, memory and Llama 2."""
  diagram = """
  graph LR;
  user --> prompt
  prompt --> i_safety
  i_safety --> context
  context --> Llama_2
  Llama_2 --> output
  output --> o_safety
  i_safety --> memory
  o_safety --> memory
  memory --> context
  o_safety --> user
  classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

def fine_tuned_arch():
  """Display the fine-tuning graph: custom dataset, pre-trained and fine-tuned Llama, RLHF loop."""
  diagram = """
  graph LR;
      Custom_Dataset --> Pre-trained_Llama
      Pre-trained_Llama --> Fine-tuned_Llama
      Fine-tuned_Llama --> RLHF
      RLHF --> |Loss:Cross-Entropy|Fine-tuned_Llama
      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

def load_data_faiss_arch():
  """Display the data-loading graph: documents -> textsplitter -> embeddings -> vectorstore."""
  diagram = """
  graph LR;
      documents --> textsplitter
      textsplitter --> embeddings
      embeddings --> vectorstore
      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

def mem_context():
  """Display the graph of inputs feeding the context, then tokenizer -> embeddings -> LLM."""
  diagram = """
      graph LR
      context(text)
      user_prompt --> context
      instruction --> context
      examples --> context
      memory --> context
      context --> tokenizer
      tokenizer --> embeddings
      embeddings --> LLM
      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;
  """
  mm(diagram)

In [2]:
# Install dependencies and initialize
%pip install -qU \
    replicate \
    langchain \
    sentence_transformers \
    pdf2image \
    pdfminer \
    pdfminer.six \
    unstructured



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.0/817.0 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.4/246.4 kB[0m [31m10.3 MB/s[

In [3]:
# Model reference on the Replicate platform used for inferencing:
# the Llama 2 13B chat model, written as "<owner>/<model>:<version hash>".
llama2_13b = (
    "meta/llama-2-13b-chat"
    ":f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
)

In [4]:
# We will use the Replicate hosted cloud environment.
# Obtain a Replicate API token -> https://replicate.com/account/api-tokens

from getpass import getpass
import os

# SECURITY: never store an API token in the notebook source. Any token that was
# ever committed with this notebook must be treated as leaked and revoked at
# https://replicate.com/account/api-tokens.
#
# The token is read from the REPLICATE_API_TOKEN environment variable when it is
# already set; otherwise it is requested interactively without echoing it.
if not os.environ.get("REPLICATE_API_TOKEN"):
  os.environ["REPLICATE_API_TOKEN"] = getpass("Enter your Replicate API token: ")


In [5]:
# we will use replicate's hosted api
import replicate

# text completion with input prompt
def Completion(prompt):
  """Run the model referenced by `llama2_13b` on a single prompt.

  Args:
    prompt: Text passed to the model as its "prompt" input.

  Returns:
    The pieces returned by `replicate.run`, joined into one string.
  """
  model_input = {"prompt": prompt, "max_new_tokens":1000}
  pieces = replicate.run(llama2_13b, input=model_input)
  return "".join(pieces)

# chat completion with input prompt and system prompt
def ChatCompletion(prompt, system_prompt=None):
  """Run the model referenced by `llama2_13b` with a prompt and a system prompt.

  Args:
    prompt: Text passed to the model as its "prompt" input.
    system_prompt: Value passed as the "system_prompt" input; it is forwarded
      as-is, including the default `None`.

  Returns:
    The pieces returned by `replicate.run`, joined into one string.
  """
  model_input = {
      "system_prompt": system_prompt,
      "prompt": prompt,
      "max_new_tokens":1500,
  }
  pieces = replicate.run(llama2_13b, input=model_input)
  return "".join(pieces)

In [6]:
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
import csv
from tqdm import tqdm

# Run settings, gathered in one place so they can be tuned without editing the loop.
INPUT_CSV = '/content/drive/MyDrive/Datasets_US Bank/cleaned_financial_alpaca.csv'
OUTPUT_CSV = '/content/drive/MyDrive/Datasets_US Bank/llama2_financial_alpaca_output.csv'
ROWS_TO_SKIP = range(1, 4928)  # file rows 1..4927 are skipped; row 0 (the header) is kept
BATCH_SIZE = 16                # rows per progress-bar step
SYSTEM_PROMPT = "Assume you are a Wealth Management advisor and answer the question concisely in a maximum of 100 words."

# Read the input file
df = pd.read_csv(INPUT_CSV, skiprows=ROWS_TO_SKIP)
print("Length of DataFrame:", len(df))
print(df.head())

# Open the output file. The encoding is explicit because the prompts contain
# non-ASCII characters (e.g. curly quotes), so the result must not depend on
# the platform's default encoding.
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    # Write the header row
    writer.writerow(['Prompt', 'Output'])

    # Walk the DataFrame in slices of BATCH_SIZE rows; each prompt is still sent
    # to the model one at a time. For a quick smoke test, shorten the range
    # (e.g. stop at 33 instead of len(df)).
    for batch_start in tqdm(range(0, len(df), BATCH_SIZE)):
        batch_df = df.iloc[batch_start:batch_start + BATCH_SIZE]

        # Process each prompt in the batch
        for prompt in batch_df['Input']:
            output = ChatCompletion(prompt, system_prompt=SYSTEM_PROMPT)

            # Write the prompt and output to the file
            writer.writerow([prompt, output])
            # Print the prompt and output
            print("Prompt:", prompt)
            print("Output:", output)
            print("---")

        # Each batch costs tens of seconds of API time; flush so an interrupted
        # run keeps every completed batch on disk instead of losing buffered rows.
        csvfile.flush()


Length of DataFrame: 63984
                                               Input  \
0  Doctor's office won't submit claim to insuranc...   
1  Why are stocks having less institutional inves...   
2  Is it necessary to pay tax if someone lends me...   
3        How To Record Income As An Affiliate ( UK )   
4     What is a good price to “Roll” a Covered Call?   

                                              Output  
0  I'm a business law student, so medical stuff i...  
1  Institutional investors are the "elephant" in ...  
2  I can't vouch for Australian law, but in the U...  
3  Every bill you write counts as income (if the ...  
4  There is no reason to roll an option if the cu...  


  0%|          | 0/3 [00:00<?, ?it/s]

Prompt: Doctor's office won't submit claim to insurance after 5 months
Output:  As a Wealth Management advisor, I recommend you pursue an appeal with the doctor's office and request they resubmit the claim to your insurance provider. If unsuccessful, consider filing a complaint with your state's department of insurance or seeking legal advice. Don't let this delay derail your financial well-being – take proactive steps to resolve the issue.
---
Prompt: Why are stocks having less institutional investors a “good thing”?
Output:  Fewer institutional investors in stocks can be beneficial as it reduces the influence of passive, index-based strategies and increases the potential for active management and long-term value creation. This can lead to more stable and sustainable growth, rather than simply following an index. Additionally, it may encourage more individual investor participation and greater diversity in ownership structures.
---
Prompt: Is it necessary to pay tax if someone lends m

 33%|███▎      | 1/3 [00:35<01:10, 35.38s/it]

Prompt: Choosing the limit when making a limit order?
Output:  As a Wealth Management advisor, I recommend choosing a limit price that reflects your desired entry point for the security, based on your risk tolerance and investment objectives. Consider factors such as current market conditions, recent price trends, and the stock's volatility to determine an appropriate limit price.
---
Prompt: What were the main causes of the spike and drop of DRYS's stock price?
Output:  As a Wealth Management advisor, the main causes of the spike and drop of DRYS's stock price can be attributed to the company's aggressive expansion into new markets, which led to increased costs and dilution of its brand. Additionally, the company's over-reliance on a few key products and lack of diversification in its product portfolio contributed to the volatility in its stock price.
---
Prompt: Can I Accept Gold?
Output:  As a Wealth Management advisor, it is generally not advisable to accept gold as an investment v

 67%|██████▋   | 2/3 [01:15<00:38, 38.30s/it]

Prompt: How to use proceeds of old house sale shortly after buying new house?
Output:  As a Wealth Management advisor, I recommend using the proceeds from the sale of your old house to pay down any existing mortgage on your new home, if possible. This will help reduce your debt and free up more money for other expenses or investments. Alternatively, consider putting the funds into a short-term investment vehicle, such as a high-yield savings account or a money market fund, to earn interest until you need the funds for living expenses or other purposes.
---
Prompt: Setting a trailing stop loss at $39.70 bid price, stock sold at $41
Output:  As a Wealth Management advisor, I would recommend setting a trailing stop loss at $39.70 based on the current bid price to protect your gains, while also allowing for potential future upside. This approach will automatically adjust the stop loss price as the stock price moves higher, ensuring that your profits are protected should the market turn.
--

100%|██████████| 3/3 [01:50<00:00, 36.67s/it]

Prompt: Could someone explain this scenario about Google's involvement in the wireless spectrum auction?
Output:  Sure! In short, Google participated in the recent wireless spectrum auction in the US to acquire airwaves for its new mobile carrier, Google Fi. The company spent around $4.6 billion on spectrum licenses, which will allow it to offer wireless services to customers across the country. This move is part of Google's broader push into the telecom industry, as it looks to compete with traditional carriers like Verizon and AT&T.
---





In [None]:
import pandas as pd
import csv
from tqdm import tqdm

# Run settings, gathered in one place so they can be tuned without editing the loop.
INPUT_CSV = '/content/drive/MyDrive/Datasets_US Bank/cleaned_financial_alpaca.csv'
OUTPUT_CSV_NEW = '/content/drive/MyDrive/Datasets_US Bank/llama2_financial_alpaca_output_new.csv'
ROWS_TO_SKIP = range(1, 4928)  # file rows 1..4927 are skipped; row 0 (the header) is kept
BATCH_SIZE = 16                # rows per progress-bar step
START_ROW = 240                # first DataFrame row sent to the model by this cell
INSTRUCTION = "Assume you are a Wealth Management advisor and answer the question concisely in a maximum of 100 words."

# Read the input file
df = pd.read_csv(INPUT_CSV, skiprows=ROWS_TO_SKIP)
print("Length of DataFrame:", len(df))
print(df.head())

# Open the output file. The encoding is explicit because the prompts contain
# non-ASCII characters (e.g. curly quotes), so the result must not depend on
# the platform's default encoding.
with open(OUTPUT_CSV_NEW, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    # Write the header row
    writer.writerow(['Prompt', 'Output'])

    # Loop through the input rows in batches of BATCH_SIZE, starting at START_ROW
    for batch_start in tqdm(range(START_ROW, len(df), BATCH_SIZE)):
        batch_df = df.iloc[batch_start:batch_start + BATCH_SIZE]

        # Create prompts for the batch. The instruction is prepended to the
        # question itself (the system prompt is left empty); the space keeps
        # "...100 words." from running straight into the question text.
        prompts = [INSTRUCTION + " " + question for question in batch_df['Input']]

        # Generate text for the batch (one model call per prompt)
        outputs = [ChatCompletion(prompt, system_prompt="") for prompt in prompts]

        # Write one (prompt, output) row per question in the batch
        writer.writerows(zip(prompts, outputs))

        # Each batch costs tens of seconds of API time; flush so an interrupted
        # run keeps every completed batch on disk instead of losing buffered rows.
        csvfile.flush()



Length of DataFrame: 63984
                                               Input  \
0  Doctor's office won't submit claim to insuranc...   
1  Why are stocks having less institutional inves...   
2  Is it necessary to pay tax if someone lends me...   
3        How To Record Income As An Affiliate ( UK )   
4     What is a good price to “Roll” a Covered Call?   

                                              Output  
0  I'm a business law student, so medical stuff i...  
1  Institutional investors are the "elephant" in ...  
2  I can't vouch for Australian law, but in the U...  
3  Every bill you write counts as income (if the ...  
4  There is no reason to roll an option if the cu...  


  1%|          | 29/3984 [22:59<48:23:37, 44.05s/it]