In [21]:
import os
from dotenv import load_dotenv

# use below when running in outside jupyter notebook
#load_dotenv()

#use below when running inside jupyter notebook
%load_ext dotenv
%dotenv

# Load the values from .env
API_KEY = os.environ['API_KEY']
RESOURCE_ENDPOINT = os.environ['RESOURCE_ENDPOINT']

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [22]:
import openai
import requests

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

In [23]:
url = openai.api_base + "/openai/deployments?api-version=2022-12-01" 

r = requests.get(url, headers={"api-key": API_KEY})
print(r.text)

{
  "data": [
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-davinci-003",
      "owner": "organization-owner",
      "id": "openaidavinci",
      "status": "succeeded",
      "created_at": 1682328861,
      "updated_at": 1682328861,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "gpt-35-turbo",
      "owner": "organization-owner",
      "id": "gpt35turbo",
      "status": "succeeded",
      "created_at": 1682370638,
      "updated_at": 1688708663,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-embedding-ada-002",
      "owner": "organization-owner",
      "id": "text-embedding-ada-002",
      "status": "succeeded",
      "created_at": 1684818936,
      "updated_at": 1684818936,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
 

In [24]:
file = open("Help.txt", "r")
lines = file.readlines()
lines[:20]

['My name: Hi My name is Nilay Shah. \n',
 "Education: I'm graduated from DA-IICT and I'm 2020 passout.\n",
 "Profession: I'm software engineer by profession.\n",
 'Likes: My favourite cricketer is Virat Kohli.\n',
 "Likes: I'm fond of watching foolball games as well.\n",
 'Filler: This file can contain millions of lines\n',
 '...\n',
 '...\n',
 '...\n']

In [25]:
import re
import pandas as pd

#cleanup and remove whitespaces and new lines
def normalize_text(s):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

#create a dictionary of title and paragraph
allTexts = []

previousLine = ''
title = 'Know about Nilay'
paragraph = []

for line in lines:
    line = normalize_text(line)
    line
    if line == '':
        continue
    elif line.startswith("My custom logic"):
        # implement your custom logic here.
        pass
    else: 
        allTexts.append([line.split(":")[0], ' '.join(line)])

#Load the dictionary into a Data Frame
df = pd.DataFrame(allTexts, columns = ['title', 'text'])
df

Unnamed: 0,title,text
0,My name,M y n a m e : H i M y n a m e i s ...
1,Education,E d u c a t i o n : I ' m g r a d u a t e ...
2,Profession,P r o f e s s i o n : I ' m s o f t w a r ...
3,Likes,L i k e s : M y f a v o u r i t e c r i ...
4,Likes,L i k e s : I ' m f o n d o f w a t c ...
5,Filler,F i l l e r : T h i s f i l e c a n c ...
6,..,. .
7,..,. .
8,..,. .


In [26]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")

#calculate number of tokens for each text columns
df['n_tokens'] = df["text"].apply(lambda x: len(tokenizer.encode(x)))
#remove any section that is greater than 8K tokens, better way would be to split the section into smaller pieces
df = df[df.n_tokens<8192]
df

Unnamed: 0,title,text,n_tokens
0,My name,M y n a m e : H i M y n a m e i s ...,34
1,Education,E d u c a t i o n : I ' m g r a d u a t e ...,63
2,Profession,P r o f e s s i o n : I ' m s o f t w a r ...,48
3,Likes,L i k e s : M y f a v o u r i t e c r i ...,45
4,Likes,L i k e s : I ' m f o n d o f w a t c ...,51
5,Filler,F i l l e r : T h i s f i l e c a n c ...,47
6,..,. .,2
7,..,. .,2
8,..,. .,2


In [27]:
from openai.embeddings_utils import get_embedding, cosine_similarity
#Caution run this only once
df['ada_v2'] = df["text"].apply(lambda x : get_embedding(x, engine = 'text-embedding-ada-002')) # engine should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
df.to_csv('embeddings.csv')
df.head()
df

Unnamed: 0,title,text,n_tokens,ada_v2
0,My name,M y n a m e : H i M y n a m e i s ...,34,"[-0.021562892943620682, -0.007291931193321943,..."
1,Education,E d u c a t i o n : I ' m g r a d u a t e ...,63,"[-0.008981960825622082, -0.002542386297136545,..."
2,Profession,P r o f e s s i o n : I ' m s o f t w a r ...,48,"[-0.00974990427494049, -0.01973761059343815, -..."
3,Likes,L i k e s : M y f a v o u r i t e c r i ...,45,"[-0.015658726915717125, 0.006818894762545824, ..."
4,Likes,L i k e s : I ' m f o n d o f w a t c ...,51,"[-0.017090141773223877, 0.0007425870862789452,..."
5,Filler,F i l l e r : T h i s f i l e c a n c ...,47,"[0.0022929783444851637, -0.0011751066194847226..."
6,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,..."
7,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,..."
8,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,..."


In [28]:
df = pd.read_csv('embeddings.csv', index_col=0)
df.columns = ['title', 'text', 'n_tokens', 'ada_v2']
df

Unnamed: 0,title,text,n_tokens,ada_v2
0,My name,M y n a m e : H i M y n a m e i s ...,34,"[-0.021562892943620682, -0.007291931193321943,..."
1,Education,E d u c a t i o n : I ' m g r a d u a t e ...,63,"[-0.008981960825622082, -0.002542386297136545,..."
2,Profession,P r o f e s s i o n : I ' m s o f t w a r ...,48,"[-0.00974990427494049, -0.01973761059343815, -..."
3,Likes,L i k e s : M y f a v o u r i t e c r i ...,45,"[-0.015658726915717125, 0.006818894762545824, ..."
4,Likes,L i k e s : I ' m f o n d o f w a t c ...,51,"[-0.017090141773223877, 0.0007425870862789452,..."
5,Filler,F i l l e r : T h i s f i l e c a n c ...,47,"[0.0022929783444851637, -0.0011751066194847226..."
6,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,..."
7,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,..."
8,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,..."


In [34]:
question = "What is my favourite game?"
def search_docs(df, user_query, t_tokens=3000):
    #get embedding for user query
    embedding = get_embedding(
        user_query,
        engine="text-embedding-ada-002" 
    )
    # Identify cosine_similarity for between each help section and the user query
    df["similarities"] = df.ada_v2.apply(lambda x: cosine_similarity(list(map(float, x[1:-1].split(','))), embedding))

    #sort based on similarity
    dfsorted = df.sort_values("similarities", ascending=False)
    #pick only those top sections which fit within our total token count
    return dfsorted[dfsorted["n_tokens"].cumsum() < t_tokens]

res = search_docs(df, question, t_tokens=3000)
res

Unnamed: 0,title,text,n_tokens,ada_v2,similarities
4,Likes,L i k e s : I ' m f o n d o f w a t c ...,51,"[-0.017090141773223877, 0.0007425870862789452,...",0.757569
7,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,...",0.745735
8,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,...",0.745735
6,..,. .,2,"[-0.005569935776293278, -0.012087945826351643,...",0.745735
3,Likes,L i k e s : M y f a v o u r i t e c r i ...,45,"[-0.015658726915717125, 0.006818894762545824, ...",0.733956
0,My name,M y n a m e : H i M y n a m e i s ...,34,"[-0.021562892943620682, -0.007291931193321943,...",0.715529
1,Education,E d u c a t i o n : I ' m g r a d u a t e ...,63,"[-0.008981960825622082, -0.002542386297136545,...",0.71118
2,Profession,P r o f e s s i o n : I ' m s o f t w a r ...,48,"[-0.00974990427494049, -0.01973761059343815, -...",0.708968
5,Filler,F i l l e r : T h i s f i l e c a n c ...,47,"[0.0022929783444851637, -0.0011751066194847226...",0.681469


In [35]:
allText = '\n '.join("###\n" + res["title"] + " \n " + res["text"])
allText

"###\nLikes \n L i k e s :   I ' m   f o n d   o f   w a t c h i n g   f o o l b a l l   g a m e s   a s   w e l l .\n ###\n.. \n . .\n ###\n.. \n . .\n ###\n.. \n . .\n ###\nLikes \n L i k e s :   M y   f a v o u r i t e   c r i c k e t e r   i s   V i r a t   K o h l i .\n ###\nMy name \n M y   n a m e :   H i   M y   n a m e   i s   N i l a y   S h a h .\n ###\nEducation \n E d u c a t i o n :   I ' m   g r a d u a t e d   f r o m   D A - I I C T   a n d   I ' m   2 0 2 0   p a s s o u t .\n ###\nProfession \n P r o f e s s i o n :   I ' m   s o f t w a r e   e n g i n e e r   b y   p r o f e s s i o n .\n ###\nFiller \n F i l l e r :   T h i s   f i l e   c a n   c o n t a i n   m i l l i o n s   o f   l i n e s"

In [36]:
query = f"""Use the below article to answer the subsequent question. If the answer cannot be found, write "I don't know."

Titles are provided in the article in the format ###\n<title>.

Article:
\"\"\"
{allText}
\"\"\"

Respond in the following format:
\n\nQuestion: <question>
\n\nAnswer: <answer>
\n\nTitle: <title>

Question: {question}"""

response = openai.ChatCompletion.create(
    engine="gpt35turbo",
    api_version="2023-03-15-preview",
    messages=[
        {"role": "system", "content": "You answer questions about care management" },
        {"role": "user", "content": query}
    ],
    temperature=0,
)

response['choices'][0]['message']['content']

'Answer: My favourite game is cricket.\n\nTitle: Likes'