In [2]:
!pip install pandas


Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.2 tzdata-2025.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Step 1: Import and load dataset
import pandas as pd
import numpy as np

# Load the CSV file (adjust the path if needed)
df = pd.read_csv("test.csv.xls")

# Step 2: Clean and drop unnecessary columns
columns_to_drop = ["EmployeeNumber", "StandardHours", "Over18", "EmployeeCount"]
df.drop(columns=columns_to_drop, inplace=True)

# Step 3: Generate a synthetic Attrition column
df["Attrition"] = np.where(
    (df["WorkLifeBalance"] <= 2) & (df["MonthlyIncome"] < 4000),
    "Yes",
    "No"
)

# Step 4: Convert 'Attrition' to descriptive text
df["Attrition"] = df["Attrition"].map({
    "Yes": "at risk of leaving",
    "No": "not at risk of leaving"
})

# Step 5: Define function to convert a row to a paragraph of text
def row_to_paragraph(row):
    return (
        f"{row['JobRole']} in the {row['Department']} department, aged {row['Age']}, "
        f"earns ${row['MonthlyIncome']} per month. They travel {row['BusinessTravel']} and "
        f"have a job satisfaction score of {row['JobSatisfaction']}. "
        f"Their work-life balance rating is {row['WorkLifeBalance']}. "
        f"This employee is {row['Attrition']}."
    )

# Step 6: Convert all rows to paragraphs
employee_paragraphs = df.apply(row_to_paragraph, axis=1).tolist()

# Step 7: Save as text file for RAG ingestion (optional)
with open("employee_profiles.txt", "w", encoding="utf-8") as f:
    for para in employee_paragraphs:
        f.write(para + "\n\n")

# Step 8: Preview the first few
for p in employee_paragraphs[:3]:
    print(p, "\n")


Sales Executive in the Sales department, aged 34, earns $4599 per month. They travel Travel_Rarely and have a job satisfaction score of 2. Their work-life balance rating is 4. This employee is not at risk of leaving. 

Sales Representative in the Sales department, aged 35, earns $2404 per month. They travel Travel_Rarely and have a job satisfaction score of 3. Their work-life balance rating is 3. This employee is not at risk of leaving. 

Laboratory Technician in the Research & Development department, aged 24, earns $3172 per month. They travel Travel_Frequently and have a job satisfaction score of 1. Their work-life balance rating is 2. This employee is at risk of leaving. 



In [5]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize embedding model
model_name = "all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Prepare the documents
documents = employee_paragraphs  # from previous step

# Create the Chroma vector store
vectorstore = Chroma.from_texts(texts=documents, embedding=embedding_model, persist_directory="chroma_store")
vectorstore.persist()

print(" Employee summaries embedded and stored in ChromaDB.")


  from .autonotebook import tqdm as notebook_tqdm
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)


 Employee summaries embedded and stored in ChromaDB.


  vectorstore.persist()


In [6]:
pip install groq


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Step 1: Set your Groq API key
import os
os.environ["GROQ_API_KEY"] = "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"  

# Step 2: Import Groq's LangChain wrapper
from langchain_groq import ChatGroq

# Step 3: Load the LLM from Groq using llama-4-scout
llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0
)

# Step 4: Reload the Chroma vector DB from Phase 1
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

# Load embedding model again (used during initial chunking)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Reload ChromaDB
vectorstore = Chroma(
    persist_directory="chroma_store",
    embedding_function=embedding_model
)

# Step 5: Build RAG pipeline using Groq + Chroma
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

# Step 6: Ask your HR Assistant a question
query = "Summarize employees with poor work-life balance and low job satisfaction"


response = rag_chain.invoke(query)

# Step 7: Print the main answer
print(" HR Assistant Answer:\n", response['result'])

# Optional: Show the source documents used
print("\n Retrieved Sources:")
for i, doc in enumerate(response['source_documents']):
    print(f"\n Source {i+1}:\n{doc.page_content}")




  vectorstore = Chroma(


 HR Assistant Answer:
 Based on the provided context, here is a summary of employees with poor work-life balance (rating 2 or below) and low job satisfaction (rating 3 or below):

* 1 employee in Research & Development, aged 42, with a work-life balance rating of 2 and a job satisfaction score of 3.
* 1 employee in Research & Development, aged 39, with a work-life balance rating of 2 and a job satisfaction score of 4 (job satisfaction is not low, so this employee doesn't fully meet the criteria).
* 1 employee in Research & Development, aged 43, with a work-life balance rating of 3 and a job satisfaction score of 3 (work-life balance is not poor, so this employee doesn't fully meet the criteria).
* 1 employee in Research & Development, aged 42, is the best match.

So, at least 1 employee (Research & Development, aged 42) has a poor work-life balance and low job satisfaction.

 Retrieved Sources:

 Source 1:
Manager in the Research & Development department, aged 42, earns $18880 per mont

In [10]:
from langchain.docstore.document import Document

with open("/Users/nithishkaranam/Documents/smart-hr-assistant/notebooks/employee_profiles.txt", "r") as f:
    lines = f.readlines()

docs = [Document(page_content=line.strip()) for line in lines]
print(f"✅ Loaded {len(docs)} employee summaries.")


✅ Loaded 824 employee summaries.


In [12]:
pip install tiktoken


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp312-cp312-macosx_11_0_arm64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Step 1: Split docs into chunks
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=0
)
chunks = splitter.split_documents(docs)
print(f"✅ Split into {len(chunks)} chunks.")

# Step 2: Load embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 3: Embed + save to ChromaDB
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="../chroma_store"
)
vectorstore.persist()

print("✅ Vector DB successfully saved to chroma_store/")


✅ Split into 412 chunks.
✅ Vector DB successfully saved to chroma_store/
