<a href="https://colab.research.google.com/github/PratiteeMalakar/hello-world/blob/main/tech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Load the text file
def load_text_file(file_path, encoding="utf-8"):
    """Read an entire text file into a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full contents of the file as one string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Step 2: Create chunks
def create_chunks(text_data, chunk_size=500, chunk_overlap=20):
    """Split text into chunks with a RecursiveCharacterTextSplitter.

    Args:
        text_data: The text to split.
        chunk_size: Forwarded to the splitter as its ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` produces for ``text_data``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text_data)

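# # Step 3 (disabled draft): process each chunk and store embeddings.
# # NOTE: add_data_to_index, as defined further down, takes (text_data, doc_id) only,
# # so drop the extra `index` argument before re-enabling this function.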
# def process_and_store_chunks(file_path, index):
#     text_data = load_text_file(file_path)
#     chunks = create_chunks(text_data)

#     for i, chunk in enumerate(chunks):
#         doc_id = f"{file_path}_chunk_{i}"
#         embeddings = add_data_to_index(chunk, doc_id, index)

#     return f"Processed {len(chunks)} chunks."

# # Example usage:
# file_path = "your_file.txt"  # Replace with your file path
# process_and_store_chunks(file_path, index)


In [2]:
pip install langchain

Collecting langchain
  Downloading langchain-0.2.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.30 (from langchain)
  Downloading langchain_core-0.2.30-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.99-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.30->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4

In [6]:
import pinecone

In [5]:
pip install pinecone openai groq

Collecting pinecone
  Downloading pinecone-5.0.1-py3-none-any.whl.metadata (18 kB)
Collecting openai
  Downloading openai-1.40.6-py3-none-any.whl.metadata (22 kB)
Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting pinecone-client==5.0.1 (from pinecone)
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client==5.0.1->pinecone)
  Downloading pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client==5.0.1->pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->o

In [7]:
from pinecone import Pinecone
import os
from google.colab import userdata

# Fetch the Pinecone key from Colab's userdata store, expose it through the
# environment, and build the client from it.
api_key = userdata.get("PINECONE")
os.environ["PINECONE_API_KEY"] = api_key
pc = Pinecone(api_key=api_key)

In [8]:
from pinecone import ServerlessSpec

# Deployment target for the index; PINECONE_CLOUD / PINECONE_REGION override
# the defaults when they are set to a non-empty value.
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(
    cloud=cloud,
    region=region,
)

In [9]:
import time

index_name = "indexci3"
ready_timeout_seconds = 300  # give up waiting for the index after this long

# Start from a clean slate: drop any existing index with this name.
# WARNING: this removes the index and everything stored in it.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a new, empty index.
pc.create_index(
    index_name,
    # Must equal the length of the vectors that get upserted. The notebook
    # embeds with text-embedding-3-small, which returns 1536-dim vectors by default.
    dimension=1536,
    metric='dotproduct',
    spec=spec,
)

# Wait for the index to be initialised, but fail loudly instead of spinning
# forever if it never reports ready.
deadline = time.monotonic() + ready_timeout_seconds
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready after {ready_timeout_seconds} seconds"
        )
    time.sleep(1)

In [10]:
# Open a handle to the index created above; later cells use `index` for
# upserts and queries.
index = pc.Index(index_name)
# Last expression of the cell: displays the index statistics.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [11]:
import openai
import numpy as np


# Read the OpenAI key from Colab's userdata store and mirror it into the
# environment; `openai_api_key` is what the client is built from.
openai_api_key = userdata.get("OPENAI")
os.environ["OPENAI_API_KEY"] = openai_api_key

In [24]:
from openai import OpenAI
# Shared OpenAI client; get_embedding below sends its requests through it.
client = OpenAI(api_key=openai_api_key)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Args:
        text: The text to embed; it is sent as a one-element input list.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` of the first (and only) item in the response.
    """
    result = client.embeddings.create(input=[text], model=model)
    first_item = result.data[0]
    return first_item.embedding

In [13]:
def add_data_to_index(text_data, doc_id):
    """Embed a piece of text and upsert it into the module-level ``index``.

    The record is stored under the id ``doc_<doc_id>`` and keeps the raw text
    in its ``text`` metadata field.

    Args:
        text_data: The text to embed and store.
        doc_id: Suffix used to build the record id.

    Returns:
        The embedding returned by ``get_embedding`` for ``text_data``.
    """
    vector = get_embedding(text_data)

    record = {
        'id': f'doc_{doc_id}',
        'values': vector,
        'metadata': {'text': text_data},
    }
    index.upsert([record])

    return vector

In [15]:
# Read the whole source document into memory as one string.
text = load_text_file("four_company_merged_text.txt")

In [16]:
# Split the document with create_chunks' defaults (chunk_size=500, chunk_overlap=20).
chunks = create_chunks(text)

In [17]:
# Show how many chunks were produced.
len(chunks)

20090

In [19]:
# Embed and upsert the chunks one at a time. add_data_to_index prepends "doc_",
# so the ids stored in the index look like "doc_doc 0", "doc_doc 1", ...
# NOTE(review): this makes one embedding request and one upsert per chunk;
# consider batching if ingestion time becomes a problem.
doc_counter = 0
for chunk in chunks:
    add_data_to_index(chunk, f"doc {doc_counter}")
    doc_counter += 1
print(f"Processed {len(chunks)} chunks.")

Processed 20090 chunks.


In [20]:
# Function to retrieve similar documents
def retrieve_similar_docs(query, k=5, include_metadata=False):
    """Return the index matches closest to a natural-language query.

    Args:
        query: Text to embed and search with.
        k: Number of matches to request (passed as ``top_k``).
        include_metadata: Forwarded to ``index.query``. Set it to True to get
            each match's stored metadata (including its ``text``) back.

    Returns:
        The ``'matches'`` entry of the query response.
    """
    # Embed the query with get_embedding, the same function used when the
    # documents were indexed, so query and document vectors are comparable.
    query_embedding = get_embedding(query)
    response = index.query(
        vector=query_embedding,
        top_k=k,
        include_metadata=include_metadata,
    )
    return response['matches']

In [26]:
def generate_response(query, top_k=5, model="llama3-70b-8192"):
    """Answer a question using retrieved chunks as context for a Groq chat model.

    The query is embedded, the closest ``top_k`` records are fetched from the
    index together with their metadata, their ``text`` fields are joined into a
    context string, and the combined prompt is sent to the chat model.

    Args:
        query: The user's question.
        top_k: Number of matches to retrieve from the index.
        model: Groq model identifier used for the chat completion.

    Returns:
        The model's answer text, or a fixed fallback message when the index
        returns no matches.
    """
    # Generate embedding for the query
    query_embedding = get_embedding(query)

    # Query the index for similar documents; metadata is needed for the text
    response = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    similar_docs = response['matches']

    # Nothing retrieved: return the fallback instead of prompting without context
    if not similar_docs:
        return "No similar documents found. Please try a different query."

    # Build the context; a match without metadata or without a 'text' field
    # contributes an empty string.
    context = ' '.join(
        doc['metadata'].get('text', '') if 'metadata' in doc else ''
        for doc in similar_docs
    )
    input_text = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

    # Generate the answer with the Groq chat client
    chat_completion = client1.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": input_text,
            }
        ],
        model=model,
    )

    return chat_completion.choices[0].message.content

In [25]:
import os

from groq import Groq

# Read the Groq key from Colab's userdata store, export it, and build the chat
# client that generate_response uses.
os.environ["GROQ_API_KEY"] = userdata.get("GROQ")

client1 = Groq(api_key=os.environ["GROQ_API_KEY"])

In [28]:
# End-to-end check: ask a question about the indexed documents and print the answer.
query = "Tell me about EXL Service"
response = generate_response(query)
print(response)

Based on the provided context, EXL Service appears to be a company that provides a range of services, including:

1. **ITSM (IT Service Management)**: EXL Service offers predictive analytics for ITSM data, enabling quick and correct decisions through granular level filtering and breakdown of data.

2. **Cloud Financial Management**: The company's FinOps module provides intelligent insights for effective visualization, giving customers more control over their purchases and after-sale experiences.

3. **XR (Extended Reality) Solutions**: EXL Service offers XTERN suite, a cloud-integrated rapid XR application development platform, as well as XR in a Box, a validated XR solution for pharmaceutical companies.

4. **IT-enabled Services**: The company provides a range of IT-enabled services, including digital advisory, customer-centric design, consulting, custom application design, development, re-engineering, and maintenance, systems integration, package implementation, global infrastructure