In [1]:
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_groq import ChatGroq
from langchain_ollama import ChatOllama

In [2]:
from dotenv import load_dotenv
import os

# Pull the variables declared in the local .env file into the environment,
# then read the three API keys used by the rest of the notebook.
# os.getenv returns None for any key that is not set.
load_dotenv()

groq_api_key, pinecone_api_key, cohere_api_key = (
    os.getenv(var_name)
    for var_name in ('GROQ_API_KEY', 'PINECONE_API_KEY', 'COHERE_API_KEY')
)

In [3]:
# Two chat models are created side by side: one through ChatOllama and one
# through ChatGroq (authenticated with the key read from the environment).
llm_local = ChatOllama(model="mistral:instruct")

groq_llm_settings = {
    "groq_api_key": groq_api_key,
    "model_name": 'mixtral-8x7b-32768',
}
llm_groq = ChatGroq(**groq_llm_settings)

In [4]:
# Read the PDF file and collect the text of every page into one string.
# NOTE(review): this is an absolute, machine-specific path -- point PDF_PATH
# at your own copy of the document before running.
PDF_PATH = r"C:\Users\swarn\Desktop\Swarnim_Shekhar_Resume.pdf"

pdf = PyPDF2.PdfReader(PDF_PATH)

# `or ""` guards against a page that yields no text; joining with a newline
# keeps the last word of one page from fusing with the first word of the
# next. str.join also avoids repeated string reallocation from `+=`.
pdf_text = "\n".join((page.extract_text() or "") for page in pdf.pages)

In [5]:
# Break the extracted text into overlapping chunks
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_text(pdf_text)

In [6]:
# Embedding model; the same object is reused to embed the query further down.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Embed every chunk in `texts` in a single call.
r1 = embeddings.embed_documents(texts)

  embeddings = OllamaEmbeddings(model="nomic-embed-text")


In [8]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

INDEX_NAME = "rag-qa"

# Creating an index whose name is already taken fails, so only create it when
# it is missing. This keeps the cell safe to re-run (Restart & Run All).
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        # Must equal the length of the vectors upserted into this index.
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

index = pc.Index(INDEX_NAME)

In [9]:
# Upload the chunks in batches instead of issuing one request per vector.
# Each record is (id, embedding, metadata): the id is the chunk's position as
# a string and the metadata carries the chunk text for retrieval later.
UPSERT_BATCH_SIZE = 100

vectors = [
    (str(i), vector, {"text": text})
    for i, (vector, text) in enumerate(zip(r1, texts))
]
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

print("done upserting...")

done upserting...


In [10]:
def get_query_embdedding(text):
    """Return the result of `embed_query(text)` on the notebook-level `embeddings` model."""
    return embeddings.embed_query(text)

In [11]:
import cohere

# Cohere client, authenticated with the key read from the environment.
co = cohere.Client(api_key=cohere_api_key)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\swarn\AppData\Local\sagemaker\sagemaker\config.yaml


In [21]:
query = "Can Swarnim be hired as a Gen AI engineer?"

question_embedding = get_query_embdedding(query)

# Ask the index for the five closest stored vectors, metadata included.
query_result = index.query(
    vector=question_embedding,
    top_k=5,
    include_metadata=True,
)
similar_texts = []
# Map each matched chunk's text to its position in the result list; keying by
# text means identical chunks collapse into a single entry.
docs = dict(
    (match["metadata"]['text'], position)
    for position, match in enumerate(query_result["matches"])
)

In [24]:
# Rerank the retrieved chunks against the query; the chunk texts are the keys
# of `docs`.
rerank_request = dict(
    model="rerank-english-v3.0",
    query=query,
    documents=list(docs.keys()),
    top_n=5,
    return_documents=True,
)
rerank_docs = co.rerank(**rerank_request)

In [25]:
# Collect the text of each reranked result, in the order the reranker
# returned them, and display the list.
reranked_texts = [
    ranked_hit.document.text
    for ranked_hit in rerank_docs.results
]
reranked_texts

['SWARNIM SHEKHAR\nData Science/ML\n+91 7542898888 ⋄Pune, India\nswarnim2302@gmail.com ⋄LinkedIn ⋄Github\nOBJECTIVE\nDedicated and result-driven aspiring Data Scientist in the final year of B.Tech in Computer Science Engineering.\nEquipped with extensive hands-on experience from three internships, two of which focused on AI and Data Sci-\nence. Proficient in Python, Machine Learning, Deep Learning, NLP, and Generative AI. Eager to contribute to the\nadvancement of AI through innovative solutions and cutting-edge research.\nEDUCATION\nBachelor of Technology in Computer Science Engineering , MIT ADT University Expected 2025\nSKILLS\nTechnical Skills Machine Learning, Deep Learning, NLP, Python, Bert, GPT, Pandas, NumPy, Matplotlib,\nSeaborn, Generative AI, Scikit-Learn, CNN, ANN, XGBoost, TensorFlow, Keras, Py-\nTorch, NLTK, spaCy, Gensim, Transformers, SQL, C++, Analytics, GitHub\nSoft Skills Analytical Thinking, Problem Solving, Team Collaboration, Communication, Team Lead-\nership\nEX

In [26]:
# Flatten the reranked chunks into one space-separated context string.
context = " ".join(reranked_texts)

In [27]:
# Prompt skeleton with named placeholders. It is deliberately NOT an f-string:
# the values are substituted exactly once by .format() below, so any "{" or
# "}" inside the retrieved context is inserted verbatim instead of being
# parsed as a format field (which would raise or corrupt the prompt).
# "\\n" keeps the two literal characters backslash + n in the prompt text,
# which is what the final sentence refers to.
Template = (
    "Given the following context: {context}, generate a comprehensive and "
    "accurate response to the question: {question}. The response should "
    "include both paragraphs and bullet points where appropriate, ensuring "
    "that no important details from the context are omitted. Preserve all "
    "critical information and treat \\n as a newline character."
)
# Filling the template with the actual context and question.
filled_template = Template.format(context=context, question=query)

In [28]:
import os
from groq import Groq

# Groq API client, authenticated with the key read from the environment.
client = Groq(api_key=groq_api_key)

In [29]:
# Single-turn request: the filled prompt is sent as the only user message.
user_message = {"role": "user", "content": filled_template}

chat_completion = client.chat.completions.create(
    model="mixtral-8x7b-32768",
    messages=[user_message],
)

In [30]:
# Print the message text of the first returned choice.
answer_text = chat_completion.choices[0].message.content
print(answer_text)

Based on the provided context, Swarnim can indeed be hired as a Gen AI (Generative AI) engineer. Here are the reasons supporting this conclusion:

- Swarnim has hands-on experience in various AI and Data Science domains, including NLP (Natural Language Processing) and Generative AI, as stated in the context.
- Swarnim's skill set encompasses several tools and libraries relevant to Gen AI, such as Python, NLP libraries (NLTK, spaCy, Gensim, Transformers), and SQL, demonstrating their proficiency in managing and processing data required for Gen AI model development.
- Swarnim has worked on projects relevant to Gen AI, such as the "Real-Time Language Translator" project, which integrates speech recognition, text translation, and text-to-speech conversion. This project showcases Swarnim's ability to apply Gen AI techniques effectively.
- Swarnim has experience as a Data Science Intern at Code Nucleus Solutions, where they led data extraction and preprocessing efforts, optimizing datasets f