In [1]:
import pandas as pd
import tiktoken
import openai
import langchain

import os, sys
from uuid import uuid4

from dotenv import load_dotenv
load_dotenv()

In [2]:
#load openAI api key

openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
tiktoken_encoding = tiktoken.get_encoding("gpt2")
tiktoken_encoding

<Encoding 'gpt2'>

In [6]:
encoded_text = tiktoken_encoding.encode("This is the first text")
encoded_text

[1212, 318, 262, 717, 2420]

In [21]:
import transformers

gpt2_model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

encoder_details = {
    "vocabulary_size": gpt2_model.config.vocab_size,
    "embedding_dim": gpt2_model.config.n_embd,
}

print(encoder_details)


{'vocabulary_size': 50257, 'embedding_dim': 768}


In [27]:
gpt2_tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [32]:
encoded_text = gpt2_tokenizer.encode("This is the first text")
encoded_text

[1212, 318, 262, 717, 2420]

In [34]:
contents = []

tiktoken_encoding = tiktoken.get_encoding("gpt2")

path = r"C:\Users\Ricky\Sample_Datasets\Text_files"

for file in os.listdir(path):
    with open(os.path.join(path,file), "r") as f:
        file_content = f.read()
        tokens = tiktoken_encoding.encode(file_content)
        total_tokens = len(tokens)
        contents.append((file, file_content, total_tokens))
        
df = pd.DataFrame(contents, columns=['filename', 'file_content', 'tokens'])

df['embeddings'] = df.file_content.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
df['id'] = [str(uuid4()) for i in range (len(df))]
df.to_excel('Text_files_Embeddings.xlsx')
df.head()

Unnamed: 0,filename,file_content,tokens,embeddings,id
0,Text_file_1.txt,RNN (Recurrent Neural Network):\nImagine you'r...,395,"[-0.0245195385068655, 0.007557663135230541, 0....",5bafa779-3329-47e1-a525-21d9d9343161
1,Text_file_2.txt,"In a confusion matrix, TP, TN, FP, and FN repr...",441,"[-0.010267744772136211, 0.0009929778752848506,...",eba69c68-8ded-44cc-bab3-797f28b9c17d


In [37]:
df['len'] = df['embeddings'].apply(len)
df.head()

Unnamed: 0,filename,file_content,tokens,embeddings,id,len
0,Text_file_1.txt,RNN (Recurrent Neural Network):\nImagine you'r...,395,"[-0.0245195385068655, 0.007557663135230541, 0....",5bafa779-3329-47e1-a525-21d9d9343161,1536
1,Text_file_2.txt,"In a confusion matrix, TP, TN, FP, and FN repr...",441,"[-0.010267744772136211, 0.0009929778752848506,...",eba69c68-8ded-44cc-bab3-797f28b9c17d,1536


In [60]:
import uuid
df['id'] = [str(uuid4()) for i in range (len(df))]
df.to_excel('Text_files_Embeddings.xlsx')
df.head()

Unnamed: 0,filename,file_content,tokens,embeddings,id
0,Text_file_1.txt,RNN (Recurrent Neural Network):\nImagine you'r...,395,"[-0.0245195422321558, 0.007557664066553116, 0....",86fc9c4e-4168-4cda-a3de-6d21f9c05a31
1,Text_file_2.txt,"In a confusion matrix, TP, TN, FP, and FN repr...",441,"[-0.010267744772136211, 0.0009929778752848506,...",eff0a9e7-c103-413c-94f9-335e2ff19ed3


In [61]:
from langchain.vectorstores import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

EMBEDDING_MODEL = "Text-embedding-ada-002"
embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)

  from tqdm.autonotebook import tqdm


In [70]:
import pinecone

pinecone.init(
    api_key = os.getenv("PINECONE_API_KEY"),
    environment = "gcp-starter"
)

index_name = "test-1"

if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension = 1536, metric='cosine')
    print("Pinecone index is created")
    
index = pinecone.Index(index_name)

Pinecone index is created


In [74]:
from tqdm.auto import tqdm

batch_size = 100 # how many embeddings we create and insert at once

# Convert the DataFrame to a list of dictionaries
chunks = df.to_dict(orient='records')

# Upsert embeddings into Pinecone in batches of 100
for i in tqdm (range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    meta_batch = chunks[i:i_end]
    ids_batch = [x['id'] for x in meta_batch]
    embeddings = [x['embeddings'] for x in meta_batch]
    data = [ {
    'filename': x['filename'],
    'file_content': x['file_content']
    } for x in meta_batch]
    
    to_upsert = list(zip(ids_batch, embeddings, data))
    index.upsert(vectors=to_upsert)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]


In [76]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 2e-05,
 'namespaces': {'': {'vector_count': 2}},
 'total_vector_count': 2}

In [83]:
limit = 50

def construct_prompt(query):
    embedding_model = "text-embedding-ada-002"
    embed_query = openai.Embedding.create(input=query, 
                                          engine=embedding_model)
    
    # retrieve from Pinecone
    query_embeds = embed_query['data'][0]['embedding']
    
    # get relevant contexts (including the questions)
    response = index.query(query_embeds, top_k=3, include_metadata=True)
    contexts = [x['metadata']['file_content'] for x in response ['matches']]
    
    # build our prompt with the retrieved contexts included
    prompt_start = ("Answer the question based on the context below.\n\n Context: \n")
    prompt_end = (f"\n\nQuestion: {query}\nAnswer: ")
    
    # append contexts until hitting limit
    for i in range (1, len(contexts)):
        if len("-".join(contexts[:i])) >= limit:
            prompt = (prompt_start + "-".join(contexts[:i-1]) + prompt_end)
            break
        elif i == len(contexts) - 1:
            prompt = (prompt_start + "-".join(contexts) + prompt_end)
    return prompt

In [84]:
query = "what is RNN in 15 words?"
prompt_with_context = construct_prompt(query)
prompt_with_context

'Answer the question based on the context below.\n\n Context: \n\n\nQuestion: what is RNN in 15 words?\nAnswer: '

In [89]:
result = openai.Completion.create(
    engine = 'text-davinci-003',
    prompt = prompt_with_context,
    temperature = 0,
    max_tokens = 350,
    top_p = 1
)
result

<OpenAIObject text_completion id=cmpl-7zoVG5O3WwCZrSig3uEKB2V1tQjDJ at 0x2439ad75d60> JSON: {
  "id": "cmpl-7zoVG5O3WwCZrSig3uEKB2V1tQjDJ",
  "object": "text_completion",
  "created": 1694966498,
  "model": "text-davinci-003",
  "choices": [
    {
      "text": " Recurrent Neural Network is a type of artificial neural network that processes data sequentially.",
      "index": 0,
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 29,
    "completion_tokens": 17,
    "total_tokens": 46
  }
}

In [90]:
result['choices'][0]['text']

' Recurrent Neural Network is a type of artificial neural network that processes data sequentially.'