<a href="https://colab.research.google.com/github/Sayandeep27/Gen-AI-with-Sayandeep/blob/main/9_Cold_Email_Generator_with_LLM_and_Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain chromadb langchain-google-genai google-generativeai

Collecting chromadb
  Downloading chromadb-1.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.2-py3-none-any.whl.metadata (4.7 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.m

In [2]:
api="AIzaSyBpijVSmuqyt5qelKWaPKpd-Ys30wTd37w"

In [4]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain_community)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain_community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [5]:
import os
import csv
import uuid
import re
import chromadb
from google.colab import files
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException



In [6]:
def clean_text(text):
    text = re.sub(r'<[^>]*?>', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return ' '.join(text.strip().split())

In [7]:
class Portfolio:
    def __init__(self, file_path):
        self.file_path = file_path
        self.data = self.read_csv_file(self.file_path)
        self.chroma_client = chromadb.PersistentClient('vectorstore2')
        self.collection = self.chroma_client.get_or_create_collection(name="portfolio")

    def read_csv_file(self, file_path):
        data = []
        with open(file_path, 'r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for row in csv_reader:
                skills = tuple(row[:-1])
                project_link = row[-1]
                data.append((skills, project_link))
        return data

    def load_portfolio(self):
        if not self.collection.count():
            for skills, portfolio_url in self.data:
                self.collection.add(
                    documents=[str(skills)],
                    metadatas=[{"portfolio_url": portfolio_url}],
                    ids=[str(uuid.uuid4())]
                )

    def query_links(self, skills):
        return self.collection.query(query_texts=skills, n_results=2).get("metadatas", [])


In [11]:
class Chain:
    def __init__(self):

        api_key = api
        self.llm = GoogleGenerativeAI(model="gemini-1.5-pro-latest", google_api_key=api_key, temperature=0.1)

    def extract_jobs(self, cleaned_text):
        prompt_extract = PromptTemplate.from_template("""
        I will give you scraped text from the job posting.
        Your job is to extract the job details & requirements in a JSON format containing the following keys:
        'role', 'experience', 'skills', and 'description'.
        Only return valid JSON. No preamble, please.
        Here is the scraped text: {page_data}
        """)

        chain_extract = prompt_extract | self.llm
        response = chain_extract.invoke(input={"page_data": cleaned_text})

        try:
            json_parser = JsonOutputParser()
            response = json_parser.parse(response)
        except OutputParserException:
            raise OutputParserException("Content too big, unable to parse jobs.")

        return response if isinstance(response, list) else [response]

    def write_email(self, job_description, portfolio_urls):
        prompt_email = PromptTemplate.from_template("""
        I will give you a role and a task that you have to perform in that specific role.
        Your Role: Your name is Hassan, You are an incredible business development officer who knows how to get clients.
        You work for Mentee Consulting firm, your firm works with all sorts of IT clients and provide solutions in the domain of Data Science and AI.
        Mentee AI focuses on efficient tailored solutions for all clients keeping costs down.

        Your Job: Your Job is to write cold emails to clients regarding the Job openings that they have advertised.
        Try to pitch your clients with an email hook that opens a conversation about a possibility of working with them.
        Add the most relevant portfolio URLs from the following (shared below) to showcase that we have the right expertise to get the job done.

        I will now provide you with the Job description and the portfolio URLs:
        JOB DESCRIPTION: {job_description}
        ------
        PORTFOLIO URLS: {portfolio_urls}
        """)

        chain_email = prompt_email | self.llm
        response = chain_email.invoke({
            "job_description": str(job_description),
            "portfolio_urls": portfolio_urls
        })

        return response.content


In [12]:
uploaded = files.upload()
file_path = list(uploaded.keys())[0]


Saving sample_portfolio.csv to sample_portfolio (1).csv


In [13]:
chain = Chain()
portfolio = Portfolio(file_path)
portfolio.load_portfolio()

url = input("Paste the job description URL: ")

loader = WebBaseLoader([url])
page = loader.load().pop()
cleaned = clean_text(page.page_content)

jobs = chain.extract_jobs(cleaned)

for i, job in enumerate(jobs):
    skills = job.get("skills", [])
    matched = portfolio.query_links(skills)
    email = chain.write_email(job, matched)
    print(f"\n--- Cold Email {i+1} ---\n")
    print(email)


Paste the job description URL: https://jobs.micro1.ai/post/79a04cae-856d-47c1-a25d-2c31f67cb040


ValueError: Non-empty lists are required for ['documents'] in query.