In [1]:
import inspect

from getpass import getpass
from langchain import OpenAI
from langchain.chains import LLMChain, ConversationChain
from langchain.chains.conversation.memory import (ConversationBufferMemory, 
                                                  ConversationSummaryMemory, 
                                                  ConversationBufferWindowMemory,
                                                  ConversationKGMemory)
from langchain.callbacks import get_openai_callback
import tiktoken
import pickle
import os
import pandas as pd

In [5]:
# Directory (relative path) holding the notebook's input files.
data_path = '../data'
filepath = os.path.join(data_path,'chat_embeddings.pkl')
with open(filepath,'rb') as file:
    # The file is opened read-only: an existing pickle is loaded, nothing is created.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    chat_embeddings = pickle.load(file)

In [7]:
# Length of the first stored embedding vector (the same expression is used
# as the index dimension when the Pinecone index is created).
len(chat_embeddings[0]['data'][0]['embedding'])

1536

In [12]:
import pinecone

# Do not paste the key into the notebook: take it from the PINECONE_API_KEY
# environment variable, or prompt for it without echoing when it is unset.
# (`os` and `getpass` are imported in the first cell.)
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# initialize connection to pinecone
pinecone.init(
    api_key=pinecone_api_key,  # app.pinecone.io (console)
    environment="eu-west1-gcp"  # next to API key in console
)
# check if index already exists (it shouldn't if this is first time)

  from tqdm.autonotebook import tqdm


In [14]:
# Name of the Pinecone index used by the rest of the notebook.
index_name = 'personalized-bot'
# Only create the index when it is not already listed, so re-running the cell
# does not attempt a second creation.
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        # dimension is taken from the length of the first stored embedding
        dimension=len(chat_embeddings[0]['data'][0]['embedding']),
        metric='dotproduct'
    )

In [33]:
# Handle to the index named above; used later for the upserts.
index = pinecone.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [20]:
# Load the text snippets CSV from the same data directory as the embeddings pickle.
filename = 'text_fields.csv'
df = pd.read_csv(os.path.join(data_path,filename))

In [22]:
# Preview the loaded frame; the rendered table elides the middle rows.
df

Unnamed: 0,text_field
0,prady: aj kuch krna hai kya. aarish: aaja ghar
1,prady: deepanshu?. aarish: chlega ayga meko l...
2,prady: pubg?. aarish: kal khelengr
3,prady: 30 min bs. aarish: thik hai aaja. ayga...
4,prady: mai ghum hi ra tha us area me. aarish: ...
...,...
3669,prady: bas. aarish: functional ke andar ivy v...
3670,prady: functional api hai. aarish: m toh sirf...
3671,prady: bol bhai. aarish: aaja meet mein
3672,prady: ???. aarish: aana hackathon ka thoda k...


In [23]:
# Pull the embedding out of every stored entry, keeping the original order.
vectors = [entry['data'][0]['embedding'] for entry in chat_embeddings]

In [24]:
# Raw text snippets, one per row of the frame.
text = df['text_field'].values
# One metadata dict per snippet, keyed by 'text'.
meta_batch = [dict(text=snippet) for snippet in text]
# Use the stringified row index as the vector id column.
df['vector_id'] = df.index.map(str)
ids = df['vector_id'].values

In [31]:
from typing import Iterator
import numpy as np
class BatchGenerator:
    """Splits a DataFrame into consecutive chunks of at most ``batch_size`` rows."""

    def __init__(self, batch_size: int = 10) -> None:
        """Store the maximum number of rows allowed in one chunk.

        Args:
            batch_size: upper bound on the number of rows per chunk.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")
        self.batch_size = batch_size

    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Yield consecutive row slices of ``df``, each at most ``batch_size`` rows.

        The rows are spread as evenly as possible over the chunks (the first
        chunks get one extra row when the division is not exact). A frame
        that fits in a single batch, including an empty one, is yielded as is.
        """
        total_rows = df.shape[0]
        splits = self.splits_num(total_rows)
        if splits <= 1:
            yield df
            return
        # Positional slicing keeps every chunk a DataFrame and avoids
        # np.array_split, which relies on the deprecated DataFrame.swapaxes.
        base, extra = divmod(total_rows, splits)
        start = 0
        for position in range(splits):
            stop = start + base + (1 if position < extra else 0)
            yield df.iloc[start:stop]
            start = stop

    def splits_num(self, elements: int) -> int:
        """Return how many chunks are needed so none exceeds ``batch_size`` rows."""
        # Ceiling division: round() could round down (and rounds halves to
        # even), which produced chunks larger than batch_size.
        return int(-(-elements // self.batch_size))

    __call__ = to_batches


df_batcher = BatchGenerator(300)

In [27]:
# Attach one embedding per row.
# NOTE(review): assumes `vectors` is in the same order as the CSV rows -- TODO confirm.
df['vector_embeddings'] = vectors

In [28]:
# Preview the frame with the id and embedding columns added; the rendered table elides the middle rows.
df

Unnamed: 0,text_field,vector_id,vector_embeddings
0,prady: aj kuch krna hai kya. aarish: aaja ghar,0,"[0.021390901878476143, -0.02353258803486824, -..."
1,prady: deepanshu?. aarish: chlega ayga meko l...,1,"[0.01264501828700304, -0.024169592186808586, -..."
2,prady: pubg?. aarish: kal khelengr,2,"[0.014415320008993149, -0.018524296581745148, ..."
3,prady: 30 min bs. aarish: thik hai aaja. ayga...,3,"[0.005870819091796875, -0.021560825407505035, ..."
4,prady: mai ghum hi ra tha us area me. aarish: ...,4,"[-0.004257076885551214, -0.02479679509997368, ..."
...,...,...,...
3669,prady: bas. aarish: functional ke andar ivy v...,3669,"[0.012840365059673786, -0.005695781204849482, ..."
3670,prady: functional api hai. aarish: m toh sirf...,3670,"[0.02033286541700363, -0.013291946612298489, -..."
3671,prady: bol bhai. aarish: aaja meet mein,3671,"[-0.0020940264221280813, -0.026402940973639488..."
3672,prady: ???. aarish: aana hackathon ka thoda k...,3672,"[0.00557997589930892, -0.02382233366370201, -0..."


In [34]:
# Upsert the data in batches for better speed (df_batcher was built with a batch size of 300).
k = []
for batch_df in df_batcher(df):
    # One metadata dict per row of the current batch.
    meta_batch = [{
        "text": x} for x in batch_df['text_field'].values]
    # Build the (id, embedding, metadata) tuples once and reuse the same list
    # for both the local record and the upsert call.
    batch_vectors = list(zip(batch_df.vector_id, batch_df.vector_embeddings, meta_batch))
    k.append(batch_vectors)  # keeps a local copy of every payload that was sent
    index.upsert(vectors=batch_vectors, namespace='index_1')