In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# def process_pdf_or_text_files(resume_pdf_path,job_dis_text):
#     all_docs=[]
#     resume_pdf_path=Path(resume_pdf_path)
#     job_dis_path="D:\Projects\Rag-based-cover-letter---cold-email-builder\data\job_discription.txt"

#     resume_pdf_files=list(resume_pdf_path.glob("**/*.pdf"))

#     print(f"Found {len(resume_pdf_files)} PDF files to process")

#     for pdf in resume_pdf_files:
#         try:
#             loader=PyPDFLoader(str(pdf))
#             docs=loader.load()

#             with open(job_dis_path ,"w") as file:
#                 file.write(job_dis_text)

#             text_loader=TextLoader(Path(job_dis_path))
#             texts=text_loader.load()

#             all_docs.extend(docs)
#             all_docs.extend(texts)

#         except Exception as e:
#             print(f"got an error--> {e}")

#     return all_docs


In [3]:
# docs=process_pdf_or_text_files(resume_pdf_path="D:\Projects\Rag-based-cover-letter---cold-email-builder\data",job_dis_text="""Job Description
# You’re not the person who will settle for just any role. Neither are we. Because we’re out to create Better Care for a Better World, and that takes a certain kind of person and teams who care about making a difference. Here, you’ll bring your professional expertise, talent, and drive to building and managing our portfolio of iconic, ground-breaking brands. In this role, you’ll help us deliver better care for billions of people around the world. It starts with YOU.

# In This Role, You Will

# Play a pivotal role in designing and building analytics solutions to facilitate informed decision-making. You will work closely with various R&D and DTS technology teams to design and implement scalable data pipelines, design analytics within R&D solutions, and ensure the accuracy and availability of data for analytics and reporting. Primary focus of the position is to design, develop, and maintain analytics solutions. Kimberly-Clark is seeking a motivated and skilled data scientist to join our dynamic team. Customers: Research & Development, Global Growth, and Quality Assurance.

# Collaborate with engineering and architecture teams to identify, collect, and harmonize data from various sources.
# Design and develop ETL (Extract, Transform, Load) and ELT (Extract, Load, Transform) pipelines to process and curate data sets using technologies such as SQL Server, Azure Data Factory and Databricks.
# Develop and maintain data models and data warehouses using platforms like SQL Server, Azure Data Factory, Snowflake, and Databricks.
# Apply metadata-driven frameworks to ensure scalable data ingestion and processing.
# Implement data quality checks and validation frameworks to maintain high data standards.
# Build and maintain data development standards and principles, providing guidance and project-specific recommendations.
# Build models that are interpretable, scalable, and meet business needs.
# Develop visualizations to demonstrate the results of data models to stakeholders and leadership, leveraging Microsoft Azure technologies.
# Test and validate analytics solutions to ensure data integrity and actual results meet expected results.
# Work with principal architect, product owners, solution engineers, business customers, and other key stakeholders to translate requirements into technical designs.
# Mentor junior engineers and team members on data engineering techniques and best practices.
# Train and build the talent of business users to maximize the return on investment of the analytics solutions.
# Use Agile methodologies and tools to deliver products in a fast-paced environment.
# Collaborate with platform teams to design and build automated processes for pipeline construction, testing, and code migration.""")

In [4]:
# docs

In [5]:
# def generate_chunks(docs, chunk_size=1000, chunk_overlap=200):
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap,
#         length_function=len,
#         is_separator_regex=False,
#         separators=["\n\n", "\n", " ", ""],
#     )

#     # Accept either a list of LangChain Document objects or list-of-strings.
#     if not docs:
#         return []

#     try:
#         if hasattr(docs[0], "page_content"):
#             # preserve metadata when splitting Document objects
#             doc_chunks = text_splitter.split_documents(docs)
#         else:
#             # ensure we pass a list of strings to create_documents
#             texts = [str(d) for d in docs]
#             doc_chunks = text_splitter.create_documents(texts)
#     except Exception:
#         # As a final fallback, convert everything to strings and split.
#         texts = [d.page_content if hasattr(d, "page_content") else str(d) for d in docs]
#         doc_chunks = text_splitter.create_documents(texts)

#     return doc_chunks


In [6]:
# doc_chunks=generate_chunks(docs=docs)
# doc_chunks

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
class embeddingsGenerator:

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, doc_chunks: List[str]) -> np.ndarray:
        texts = [doc.page_content if hasattr(doc, "page_content") else str(doc) for doc in doc_chunks]
        embeddings=self.model.encode(texts, show_progress_bar=True)
        return embeddings

In [9]:
# def generate_embeddings(doc_chunks):
#     embedder=SentenceTransformer("all-MiniLM-L6-v2")
#     # Extract page_content from Document objects
#     texts = [doc.page_content if hasattr(doc, "page_content") else str(doc) for doc in doc_chunks]
#     embeddings=embedder.encode(texts, show_progress_bar=True)
#     return embeddings


In [10]:
# embeddings=embeddingsGenerator().generate_embeddings(doc_chunks=doc_chunks)
# embeddings

In [11]:
class vectorStore:
    def __init__(self, collection_name: str = "resume_job_descriptions", dir:str=None):
        self.client = chromadb.Client(Settings(
            persist_directory=dir,
        ))
        self.collection = self.client.get_or_create_collection(name=collection_name)

    def add_embeddings(self, doc_chunks: List[Any], embeddings: np.ndarray):
        # Validate lengths
        if doc_chunks is None or embeddings is None:
            raise ValueError("doc_chunks and embeddings must be provided")

        if len(doc_chunks) != len(embeddings):
            print(f"Length of docs not equal to length of embeddings,lengths are {len(doc_chunks)} and {len(embeddings)} respectively")
            return

        documents_text = []
        embeddings_list = []

        for doc, emb in zip(doc_chunks, embeddings):
            # Support both Document objects and plain strings
            text = doc.page_content if hasattr(doc, "page_content") else str(doc)
            documents_text.append(text)

            # Ensure embedding is a plain list of floats
            if hasattr(emb, "tolist"):
                embeddings_list.append(emb.tolist())
            else:
                embeddings_list.append(list(emb))

        ids = [str(uuid.uuid4()) for _ in range(len(documents_text))]
        self.collection.add(
            documents=documents_text,
            embeddings=embeddings_list,
            ids=ids
        )


In [12]:
# vector_store=vectorStore(dir="D:/Projects/Rag-based-cover-letter---cold-email-builder/vectorstore")
# vector_store.add_embeddings(doc_chunks=doc_chunks, embeddings=embeddings)

In [13]:
class Ragpipeline:
    def __init__(self, vector_store: vectorStore, embeddings_generator: embeddingsGenerator):
        self.vector_store = vector_store
        self.embeddings_generator = embeddings_generator

    def query(self, query_text: str, top_k: int = 5) -> List[Tuple[str, float]]:
        query_embedding = self.embeddings_generator.model.encode([query_text])[0]
        results = self.vector_store.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        
        retrieved_docs = results['documents'][0]
        scores = results['distances'][0]
        
        return list(zip(retrieved_docs, scores))

In [14]:
# response_pipeline = Ragpipeline(vector_store, embeddingsGenerator())
# response=response_pipeline.query(query_text="Generate a short and crisp cover letter position", top_k=3)
# response


In [15]:
from google import genai
from dotenv import load_dotenv

In [16]:
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

In [19]:
class LLMResponseGenerator:
    def __init__(self, resume_pdf_path):
        self.client = genai.Client(api_key=api_key)
        self.resume_pdf_path = resume_pdf_path

    def process_pdf_or_text_files(self):
        all_docs=[]
        resume_pdf_path=Path(self.resume_pdf_path)

        resume_pdf_files=list(resume_pdf_path.glob("**/*.pdf"))

        print(f"Found {len(resume_pdf_files)} PDF files to process")

        for pdf in resume_pdf_files:
            try:
                loader=PyPDFLoader(str(pdf))
                docs=loader.load()
                all_docs.extend(docs)

            except Exception as e:
                print(f"got an error--> {e}")

        return all_docs
    
    def generate_chunks(self, docs, chunk_size=1000, chunk_overlap=200):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
            separators=["\n\n", "\n", " ", ""],
        )

        if not docs:
            return []

        try:
            if hasattr(docs[0], "page_content"):
                doc_chunks = text_splitter.split_documents(docs)
            else:
                texts = [str(d) for d in docs]
                doc_chunks = text_splitter.create_documents(texts)
        except Exception:
            texts = [d.page_content if hasattr(d, "page_content") else str(d) for d in docs]
            doc_chunks = text_splitter.create_documents(texts)

        return doc_chunks

    def generate_response(self, query: str) -> str:
        # Gather docs and create chunks
        all_docs = self.process_pdf_or_text_files()
        doc_chunks = self.generate_chunks(all_docs)

        # Generate embeddings for chunks
        embedder = embeddingsGenerator()
        embeddings = embedder.generate_embeddings(doc_chunks)

        # Persist embeddings to vector store
        vector_store = vectorStore(dir="D:/Projects/Rag-based-cover-letter---cold-email-builder/vectorstore")
        vector_store.add_embeddings(doc_chunks=doc_chunks, embeddings=embeddings)

        # Build a retrieval pipeline and query it
        response_pipeline = Ragpipeline(vector_store, embedder)
        retrieved = response_pipeline.query(query_text=query, top_k=3)

        # retrieved is list of (doc_text, score); join doc_texts for context
        context= "\n\n".join([doc for doc, _ in retrieved])
        print("context--->",context)

        prompt = f"Context: {context}\n\nQuestion: {query}\n\nResponce_body:Start with introducing yourself,mention recent experience, mention highest qualification and finally mention your works and skils to prove why you are the best fit for this role." 

        response = self.client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
        )
        return response


In [None]:
get_cover_letter=LLMResponseGenerator(resume_pdf_path="D:\Projects\Rag-based-cover-letter---cold-email-builder\data")
get_cover_letter.generate_response(query="""Generate a short and crisp cover letter for this following job description-->Job Description

You’re not the person who will settle for just any role. Neither are we. Because we’re out to create Better Care for a Better World, and that takes a certain kind of person and teams who care about making a difference. Here, you’ll bring your professional expertise, talent, and drive to building and managing our portfolio of iconic, ground-breaking brands. In this role, you’ll help us deliver better care for billions of people around the world. It starts with YOU.

In This Role, You Will

Play a pivotal role in designing and building analytics solutions to facilitate informed decision-making. You will work closely with various R&D and DTS technology teams to design and implement scalable data pipelines, design analytics within R&D solutions, and ensure the accuracy and availability of data for analytics and reporting. Primary focus of the position is to design, develop, and maintain analytics solutions. Kimberly-Clark is seeking a motivated and skilled data scientist to join our dynamic team. Customers: Research & Development, Global Growth, and Quality Assurance.

Collaborate with engineering and architecture teams to identify, collect, and harmonize data from various sources.
Design and develop ETL (Extract, Transform, Load) and ELT (Extract, Load, Transform) pipelines to process and curate data sets using technologies such as SQL Server, Azure Data Factory and Databricks.
Develop and maintain data models and data warehouses using platforms like SQL Server, Azure Data Factory, Snowflake, and Databricks.
Apply metadata-driven frameworks to ensure scalable data ingestion and processing.
Implement data quality checks and validation frameworks to maintain high data standards.
Build and maintain data development standards and principles, providing guidance and project-specific recommendations.
Build models that are interpretable, scalable, and meet business needs.
Develop visualizations to demonstrate the results of data models to stakeholders and leadership, leveraging Microsoft Azure technologies.
Test and validate analytics solutions to ensure data integrity and actual results meet expected results.
Work with principal architect, product owners, solution engineers, business customers, and other key stakeholders to translate requirements into technical designs.
Mentor junior engineers and team members on data engineering techniques and best practices.
Train and build the talent of business users to maximize the return on investment of the analytics solutions.
Use Agile methodologies and tools to deliver products in a fast-paced environment.
Collaborate with platform teams to design and build automated processes for pipeline construction, testing, and code migration.""").

Found 1 PDF files to process


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]


context---> Pratik Kujur
6264902455 |
 pratikkujur121@gmail.com|
 PratikKujur |
 PratikKujur
Experience
PORTER | Machine Learning Intern Jan 2025 – Present
Skills: Python, Sklearn, Snowflake, Mlflow, Chalk AI, GCP, BigQuery, Streamlit Bangalore, Karnataka
• Developed and deployed production-grade ML models (Logistic Regression, XGBoost) on GCP , achieving
Precision: 0.79 and Recall: 0.76 with 100K+ daily predictions and ensure 99.9% uptime.
• Orchestrated a cross-platform ETL pipeline using Airflow DAG integrating BigQuery and Snowflake for
feature retrieval, processing 5M+ records daily for model training .
• Engineered 15+ features using Chalk AI (Feature Store) to strengthen training pipelines for Driver Ranking
models and monitor city-wise model performance.
• Automated real-time feature skewness monitoring to ensure data stability and feature correctness.
• Contributed to MLOPS dashboard to observe Concept drift and Feature drift in Driver-Ranking models.
Education

Pratik Kujur
6

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            text="""Dear Hiring Manager,

I am writing to express my enthusiastic interest in the Data Scientist position at Kimberly-Clark, as advertised. With a strong commitment to leveraging data for impactful solutions, I am particularly drawn to your mission of "Better Care for a Better World" and believe my skills align perfectly with your team's objectives.

Currently serving as a Machine Learning Intern at PORTER, I have been instrumental in designing and deploying production-grade ML models and robust data pipelines. My experience includes orchestrating cross-platform ETL pipelines using Airflow, integrating BigQuery and Snowflake to process millions of records daily for model training. This directly correlates with your need for developing scalable data pipelines and managing data models/warehouses.

While my full educational 