<a href="https://colab.research.google.com/github/Venu005/cnn-learning/blob/main/text_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install PyPDF2==3.0.1 pinecone-client



In [72]:
!pip install pinecone-client



In [185]:
!pip install python-dotenv==1.0.0

Collecting python-dotenv==1.0.0
  Downloading python_dotenv-1.0.0-py3-none-any.whl.metadata (21 kB)
Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


In [209]:
from google import genai
from google.genai import types
import PyPDF2
from pinecone import Pinecone
import hashlib
import datetime
import time
import pprint

In [186]:
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
PINECONE_API = userdata.get('PINECONE_API')

In [122]:
INDEX_NAME = "resume-parser"
EMBED_MODEL = "text-embedding-004"
CHUNK_SIZE = 2000

In [187]:
client =  genai.Client(api_key= GOOGLE_API_KEY )
pc = Pinecone(api_key=  PINECONE_API )

# Don't run the below two cells everytime

In [188]:
if pc.has_index(INDEX_NAME):
    pc.delete_index(INDEX_NAME)
    time.sleep(5)

In [189]:
pc.create_index(
       name=INDEX_NAME,
        dimension=768,
        spec={
        "dimension": 768,
        "metric": "cosine",
        "serverless" : {
            "cloud": "aws",
            "region": "us-east-1",

        }

       }

)
time.sleep(10)

In [190]:
index =  pc.Index(INDEX_NAME)

In [191]:
def extract_from_pdf(pdf_path):
  with open(pdf_path,'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    text = ""
    for page_num in range(num_pages):
      page =  pdf_reader.pages[page_num]
      text += page.extract_text()
    return text

In [216]:
def store_in_pinecone(user_email, fullText):
    # Split text into chunks
    chunks = [fullText[i:i+CHUNK_SIZE] for i in range(0, len(fullText), CHUNK_SIZE)]

    vectors = []
    for i, chunk in enumerate(chunks):
        # Get embedding - PROPERLY EXTRACT VALUES  # returns dict
        embedding_response = client.models.embed_content(
            model=EMBED_MODEL,
            contents=chunk
        )

        embedding = embedding_response.embeddings[0].values
        print("eR:")
        print(embedding_response)
        print ("e")
        print(embedding)
        vectors.append({
            "id": f"{user_email}-{i}",
            "values": embedding,  # Should be List[float]
            "metadata": {
                "text": chunk,
                "user_email": user_email,
            }
        })

    index.upsert(vectors=vectors)

In [193]:
def process_resume(pdf_path,user_email):
  fullText = extract_from_pdf(pdf_path)
  resume_id = hashlib.sha256(fullText.encode()).hexdigest()
  store_in_pinecone(user_email,fullText)

  return fullText

In [194]:
def analyseJobDes(job_desc, user_email):
    # Generate job description embedding
    jd_embedding_response = client.models.embed_content(
        model=EMBED_MODEL,
        contents=job_desc
    )
    jd_embedding = jd_embedding_response.embeddings[0].values
    # Query Pinecone with email filter
    results = index.query(
        vector=jd_embedding,
        filter={"user_email": {"$eq": user_email}},
        top_k=10,
        include_metadata=True
    )

    if not results['matches']:
        return {"error": "No resume found for this user"}

    # Calculate similarity metrics
    avg_score = sum(match['score'] for match in results['matches']) / len(results['matches'])
    max_score = max(match['score'] for match in results['matches'])

    # Extract resume content from matches
    resume_chunks = [match['metadata']['text'] for match in results['matches']]
    resume_context = "\n---\n".join(resume_chunks)

    # Generate AI analysis
    analysis_prompt = f"""Analyze this job description against the user's resume excerpts:

    Job Description:
    {job_desc}

    Resume Excerpts:
    {resume_context}

    Provide specific suggestions focusing on:
    1. Missing technical skills
    2. Keyword optimization
    3. Experience alignment
    4. Certification recommendations
    5. Project improvements"""

    analysis_response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents= analysis_prompt
    ).text

    # Extract missing skills
    skill_prompt = f"""Identify missing technical skills from the resume compared to the job description:

    Job Description: {job_desc}
    Resume Excerpts: {resume_context}

    Output format: Missing: skill1, skill2, ..."""

    missing_skills = client.models.generate_content(
        model="gemini-2.0-flash",
        contents= skill_prompt
    ).text
    # Parse missing skills
    missing_list = []
    if "Missing:" in missing_skills:
        missing_list = [s.strip() for s in missing_skills.split("Missing:")[1].split(",")]

    return {
        "user_email": user_email,
        "similarity": {
            "average": round(avg_score, 2),
            "highest": round(max_score, 2)
        },
        "missing_skills": missing_list,
        "analysis": analysis_response,
        "top_matches": [
            {
                "text": match['metadata']['text'][:200] + "...",
                "score": round(match['score'], 2)
            } for match in results['matches']
        ]
    }


In [205]:
user_email =  "nagakushal10@gmail.com"

In [206]:
fullText = process_resume('Resume.pdf',user_email)

eR:
embeddings=[ContentEmbedding(values=[0.004246525, -0.0113861, -0.051043175, 0.03900643, 0.07863846, 0.014029296, 0.028906835, -0.010667327, -0.010667593, -0.008460534, -0.0054368535, 0.016598338, 0.057275373, -0.010576915, 0.009261845, -0.024698775, 0.03758644, 0.05286926, -0.04396475, -0.04815396, -0.01858587, -0.051515087, 0.0067488747, 0.013240105, 0.016809376, -0.041073114, 0.002764179, -0.03943473, -0.026730448, 0.037410058, 0.032409053, 0.025968557, -0.035296816, 0.0033219312, -0.015106144, 0.04672577, 0.0128762955, 0.01653516, 0.030106183, -0.05476039, -0.005050315, -0.01970379, -0.041568775, 0.035199165, 0.019434143, -0.04663379, 0.00046043342, -0.028304178, -0.01993119, 0.04846088, 0.08646663, 0.012761189, -0.028440325, 0.010394405, -0.007615983, -0.03386494, -0.02443229, -0.04576662, -0.009775791, -0.00085134635, -0.010382255, -0.07057147, -0.02370606, -0.080261864, 0.0035022348, -0.049839865, -0.019895293, 0.036014073, -0.031895407, 0.020134863, -0.030631341, 0.033996664

In [207]:
job_desc = "A full-stack developer, fresher with experience and projects in reactjs and nodejs"

In [212]:
response = analyseJobDes(job_desc,user_email)

In [215]:
pprint.pprint(response)

{'analysis': "Okay, let's break down Naga Kushal Vankadara's resume and how it "
             'aligns with the full-stack developer job description, along with '
             'specific recommendations for improvement.\n'
             '\n'
             '**Overall Assessment:**\n'
             '\n'
             "Naga Kushal's resume shows good potential for a fresher "
             'full-stack developer role focusing on ReactJS and NodeJS. He has '
             'relevant project experience and internships, but there are areas '
             'where he can significantly improve his presentation and skillset '
             'to make a stronger impression.\n'
             '\n'
             '**1. Missing Technical Skills:**\n'
             '\n'
             '*   **Testing:** The resume mentions "thorough testing of LLMs," '
             'which is great but lacks details about testing methodologies for '
             'web applications. He should explicitly mention experience with '
            