## This file helps to Build and test the LLama model and Langchain functions

In [54]:
# Importing packages

from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

In [55]:
#setting up env
load_dotenv()

True

In [56]:
#Building LLM
llm = ChatGroq(
    model='llama3-8b-8192',
    groq_api_key=os.getenv("GROQ_API_KEY"),
    temperature=0.5
    )

In [7]:
#Getting response 
response = llm.invoke("Ask me a question to determine my career path along with 4 options to chose from.")
print(response)

content="Here's a question to help determine your career path:\n\nWhen working on a project, what motivates you most?\n\nA) The opportunity to be creative and come up with innovative solutions\nB) The chance to work with people and build strong relationships\nC) The challenge of solving complex problems and overcoming obstacles\nD) The sense of accomplishment and achieving specific goals\n\nWhich option resonates with you the most?" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 83, 'prompt_tokens': 28, 'total_tokens': 111, 'completion_time': 0.069166667, 'prompt_time': 0.003874421, 'queue_time': 0.020889679, 'total_time': 0.073041088}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_a97cfe35ae', 'finish_reason': 'stop', 'logprobs': None} id='run-185f54cd-cdbc-4fc8-8996-b8c60b8ba80a-0' usage_metadata={'input_tokens': 28, 'output_tokens': 83, 'total_tokens': 111}


In [8]:
from langchain.document_loaders import WebBaseLoader

url = "https://www.google.com/"
loader = WebBaseLoader(url)
docs = loader.load()

print(docs[0].page_content, end = ' ')  # Extracted text from the webpage

GoogleSearch Images Maps Play YouTube News Gmail Drive More »Web History | Settings | Sign in Advanced searchAdvertisingBusiness SolutionsAbout Google© 2025 - Privacy - Terms  

In [25]:
#Loading data for using RAG
import pandas as pd
data = pd.read_excel('./processed_data/Final_Occupation_Data_Cleaned.xlsx')

# Convert all data to string format (ensures compatibility for embeddings)
data = data.astype(str)

# Remove any rows with missing values
data = data.dropna()

# Create unique IDs for each row
data["id"] = data.index.astype(str)

# Combine all relevant text columns into one text field for embedding
data["combined_text"] = data.apply(lambda row: " | ".join(row.values), axis=1)

data['combined_text']


0      11-1011.00 | Chief Executives | Determine and ...
1      11-1011.03 | Chief Sustainability Officers | C...
2      11-1021.00 | General and Operations Managers |...
3      11-2011.00 | Advertising and Promotions Manage...
4      11-2021.00 | Marketing Managers | Plan, direct...
                             ...                        
873    53-7071.00 | Gas Compressor and Gas Pumping St...
874    53-7072.00 | Pump Operators, Except Wellhead P...
875    53-7073.00 | Wellhead Pumpers | Operate power ...
876    53-7081.00 | Refuse and Recyclable Material Co...
877    53-7121.00 | Tank Car, Truck, and Ship Loaders...
Name: combined_text, Length: 878, dtype: object

In [36]:
data.shape

(878, 10)

In [37]:
# Define column names
column_names = [
    'O*NET-SOC Code', 'Title', 'Description', 'Interests', 'Knowledge','Skills', 'Technology Skills', 'Tools Used', 'id', 'combined_text'
]
data.columns = column_names

In [40]:
data.head()

Unnamed: 0,O*NET-SOC Code,Title,Description,Interests,Knowledge,Skills,Technology Skills,Tools Used,id,combined_text
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...,"Realistic, Investigative, Artistic, Social, En...","Administration and Management, Administration ...","Reading Comprehension, Active Listening, Writi...","Adobe Acrobat, AdSense Tracker, Atlassian JIRA...","10-key calculators, Desktop computers, Laptop ...",0,11-1011.00 | Chief Executives | Determine and ...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh...","Realistic, Investigative, Artistic, Social, En...","Administration and Management, Administration ...","Reading Comprehension, Active Listening, Writi...","Adobe Acrobat, Adobe Photoshop, Email software...","10-key calculators, Computer data input scanne...",1,11-1011.03 | Chief Sustainability Officers | C...
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ...","Realistic, Investigative, Artistic, Social, En...","Administration and Management, Administration ...","Reading Comprehension, Active Listening, Writi...","Act!, ActionWare, Adobe Acrobat, Adobe Creativ...","10-key calculators, Cell phones, Computer scan...",2,11-1021.00 | General and Operations Managers |...
3,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...","Realistic, Investigative, Artistic, Social, En...","Administration and Management, Administration ...","Reading Comprehension, Active Listening, Writi...","Actuate BIRT, Adobe Acrobat, Adobe Acrobat Rea...","Computer data input scanners, Desktop computer...",3,11-2011.00 | Advertising and Promotions Manage...
4,11-2021.00,Marketing Managers,"Plan, direct, or coordinate marketing policies...","Realistic, Investigative, Artistic, Social, En...","Administration and Management, Administration ...","Reading Comprehension, Active Listening, Writi...","Adobe Acrobat, Adobe Acrobat Reader, Adobe Act...","Desktop computers, Laser facsimile machines, N...",4,"11-2021.00 | Marketing Managers | Plan, direct..."


In [38]:
import chromadb
# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path="./processed_data/chroma_db")

# Create a new collection for storing embeddings
collection = chroma_client.get_or_create_collection(name="career_recommendation")

In [45]:
# Add data to ChromaDB with metadata
collection.add(
    ids=data["id"].astype(str).tolist(),
    documents=data["combined_text"].tolist(), # Handle missing values
)

# Ensure all metadata columns are stored, not just Title & O*NET-SOC Code
print(f"Total records stored: {collection.count()}")

Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Insert of existing embedding ID: 11
Insert of existing embedding ID: 12
Insert of existing embedding ID: 13
Insert of existing embedding ID: 14
Insert of existing embedding ID: 15
Insert of existing embedding ID: 16
Insert of existing embedding ID: 17
Insert of existing embedding ID: 18
Insert of existing embedding ID: 19
Insert of existing embedding ID: 20
Insert of existing embedding ID: 21
Insert of existing embedding ID: 22
Insert of existing embedding ID: 23
Insert of existing embedding ID: 24
Insert of existing embedding ID: 25
Insert of existing embedding ID: 26
Insert of existing embedding ID: 27
In

Total records stored: 878


In [52]:
# Perform a similarity search for "Scientist" (retrieving top 3 results)
query_result = collection.query(
    query_texts=["I am good at art and design, and I enjoy working with my hands."],
    n_results=5  # Get top 3 matches
)

# Display results
for i, doc in enumerate(query_result["documents"][0]):  # Extract first query result
    print(f"🔹 Match {i+1}: {doc}\n")

🔹 Match 1: 29-1129.01 | Art Therapists | Plan or conduct art therapy sessions or programs to improve clients' physical, cognitive, or emotional well-being. | Realistic, Investigative, Artistic, Social, Enterprising, Conventional, First Interest High-Point, Second Interest High-Point, Third Interest High-Point | Administration and Management, Administration and Management, Administrative, Administrative, Economics and Accounting, Economics and Accounting, Sales and Marketing, Sales and Marketing, Customer and Personal Service, Customer and Personal Service, Personnel and Human Resources, Personnel and Human Resources, Production and Processing, Production and Processing, Food Production, Food Production, Computers and Electronics, Computers and Electronics, Engineering and Technology, Engineering and Technology, Design, Design, Building and Construction, Building and Construction, Mechanical, Mechanical, Mathematics, Mathematics, Physics, Physics, Chemistry, Chemistry, Biology, Biology,

In [59]:
# Memory for user responses
from langchain.prompts import PromptTemplate

user_memory = []

# Function to generate questions dynamically
def generate_question(previous_answers):
    prompt = PromptTemplate.from_template("""
    You are an intelligent career advisor AI that helps users find the most suitable career based on their responses. 
    Your job is to **ask one question at a time**, with four multiple-choice options.

    ### **Instructions:**
    1. The next question should be relevant based on the user's previous answers.
    2. Keep the question concise but informative.
    3. Make sure the four answer choices represent **distinct career-related preferences**.
    4. The answers should be **diverse** (e.g., different skills, interests, or work preferences).

    ### **User Responses So Far:**
    {previous_answers}

    ### **Output Format:**
    No Premble, just the question and options.
    Question: <your generated question>
    A) <Option 1>
    B) <Option 2>
    C) <Option 3>
    D) <Option 4>
    """)
    
    full_prompt = prompt.format(previous_answers=", ".join(user_memory))
    question = llm.invoke(full_prompt)
    print(question)

# Function to query ChromaDB for career matches
def get_career_recommendations(user_answers):
    query_text = " ".join(user_answers)  # Combine responses
    results = collection.query(query_texts=[query_text], n_results=3)  # Get top 3 matches
    
    return results["documents"][0] if results["documents"] else ["No match found"]

# **Main Interaction Loop**
for i in range(5):  # Change to 20 for full experience
    question = generate_question(user_memory)
    print("\n", question)
    
    # Simulate user selecting an answer (in real use case, get input from user)
    user_response = input("Choose an option (A, B, C, D): ")
    user_memory.append(user_response)  # Store answer

# **Get Career Recommendations**
career_matches = get_career_recommendations(user_memory)
print("\n🎯 Recommended Careers:", career_matches)

content="I'm excited to help you find a suitable career! Here's my first question:\n\nWhat type of work environment do you prefer?\n\nA) Collaborative office setting with a team\nB) Independent workspace with minimal distractions\nC) Outdoor or fieldwork setting, such as construction or research\nD) Home-based work with flexible hours" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 67, 'prompt_tokens': 198, 'total_tokens': 265, 'completion_time': 0.055833333, 'prompt_time': 0.025236089, 'queue_time': 0.021108900000000003, 'total_time': 0.081069422}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_179b0f92c9', 'finish_reason': 'stop', 'logprobs': None} id='run-beb06c6e-53af-4273-b028-c05e8a79a3c9-0' usage_metadata={'input_tokens': 198, 'output_tokens': 67, 'total_tokens': 265}

 None
content="Here's the next question:\n\nDo you enjoy working with people and building relationships?\n\nA) Yes, I thrive in social environments and enjoy helping others

In [None]:
# def get_best_career(user_answers):
#     """Retrieve the most relevant careers based on user responses."""
#     user_profile = " ".join(user_answers)  # Combine answers into a profile
#     results = vectorstore_careers.similarity_search(user_profile, k=3)  # Get top 3 matching careers
#     return [doc.page_content for doc in results]

In [None]:
# def generate_career_recommendation(user_answers):
#     """Uses RAG to retrieve career matches & LLaMA to generate recommendations."""
#     careers = get_best_career(user_answers)  # Fetch relevant careers

#     prompt = f"""
#     Based on the user's responses:
#     {user_answers}
    
#     Here are the most relevant careers:
#     {', '.join(careers)}

#     Provide a final career recommendation with an explanation.
#     """
    
#     response = llm.invoke(prompt)
#     return response.content

In [1]:
!which python


/opt/anaconda3/bin/python
