In [1]:
import gradio as gr
import pdfplumber
import nltk
from nltk.corpus import stopwords
import pandas as pd
import pandas as pd
from openai import OpenAI
import gradio as gr

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import os
import numpy as np
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
import warnings
from langchain_core._api import LangChainDeprecationWarning
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import LLMChain

In [2]:
import pickle
import numpy as np

# SECURITY: pickle.load executes whatever code the file embeds, so only load
# artifacts produced by a trusted training run. Pickled scikit-learn objects are
# presumably also sensitive to the installed library version — confirm.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
with open('model.pkl', 'rb') as model_file:
    career_model = pickle.load(model_file)

# Backward-compatible alias. A later cell (In [9]) rebinds the global `model` to
# the chat model name string, so Recommendations() must not look the classifier
# up under that name at call time; it uses `career_model` instead.
model = career_model

# NOTE(review): predict_proba column indices are mapped through this list, so its
# order presumably mirrors the classifier's class order — confirm against
# career_model.classes_.
class_names = ['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
               'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
               'Banker', 'Writer', 'Accountant', 'Designer',
               'Construction Engineer', 'Game Developer', 'Stock Investor',
               'Real Estate Developer']

def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score
                   ):
    """Rank the five most probable careers for a student profile.

    Args:
        gender: 'female' (any casing) is encoded as 1, every other string as 0.
        part_time_job: Truthy when the student has a part-time job.
        absence_days: Number of absence days.
        extracurricular_activities: Truthy when the student takes part in any.
        weekly_self_study_hours: Self-study hours per week.
        math_score, history_score, physics_score, chemistry_score,
        biology_score, english_score, geography_score: Subject scores.

    Returns:
        A list of at most five ``(class_name, probability)`` tuples, ordered
        from the most to the least probable class.

    Raises:
        ValueError: If gender or any numeric input is None (e.g. a form field
            left blank), instead of failing later with an unrelated error.
    """
    numeric_inputs = {
        "absence_days": absence_days,
        "weekly_self_study_hours": weekly_self_study_hours,
        "math_score": math_score,
        "history_score": history_score,
        "physics_score": physics_score,
        "chemistry_score": chemistry_score,
        "biology_score": biology_score,
        "english_score": english_score,
        "geography_score": geography_score,
    }
    missing = [name for name, value in numeric_inputs.items() if value is None]
    if gender is None:
        missing.insert(0, "gender")
    if missing:
        raise ValueError("Missing value(s) for: " + ", ".join(missing))

    total_score = (math_score + history_score + physics_score + chemistry_score
                   + biology_score + english_score + geography_score)
    average_score = total_score / 7

    # Encode categorical variables
    gender_encoded = 1 if gender.lower() == 'female' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0

    # Single-row feature matrix.
    # NOTE(review): column order presumably matches the order the scaler and
    # classifier were fitted with — confirm against the training code.
    feature_array = np.array([[gender_encoded, part_time_job_encoded, absence_days, extracurricular_activities_encoded,
                               weekly_self_study_hours, math_score, history_score, physics_score,
                               chemistry_score, biology_score, english_score, geography_score, total_score, average_score]])

    scaled_features = scaler.transform(feature_array)

    # Use the dedicated name: the global `model` is reassigned later in the notebook.
    probabilities = career_model.predict_proba(scaled_features)

    # Indices of the five highest probabilities, best first.
    top_classes_idx = np.argsort(-probabilities[0])[:5]
    top_classes_names_probs = [(class_names[idx], probabilities[0][idx]) for idx in top_classes_idx]

    return top_classes_names_probs

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
# Load the IT job roles dataset (job title, description, skills, certifications).
# Forward slashes keep the relative path valid on Windows, Linux and macOS; the
# previous "data\\..." literal only resolves on Windows.
jobs_df = pd.read_csv("data/IT_Job_Roles_Skills.csv", encoding="ISO-8859-1")
jobs_df.head()

Unnamed: 0,Job Title,Job Description,Skills,Certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management...",Certified Artificial Intelligence Practitioner...


In [4]:
# Load the Coursera course catalogue (title, organization, skills, ratings, ...).
# Forward slashes keep the relative path valid on Windows, Linux and macOS; the
# previous "data\\..." literal only resolves on Windows.
courses_df = pd.read_csv("data/coursera_course_dataset_v2_no_null.csv")
courses_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Organization,Skills,Ratings,Review counts,Metadata
0,0,Google Cybersecurity,Google,"Network Security, Python Programming, Linux, ...",4.8,4.8(20K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
1,1,Google Data Analytics,Google,"Data Analysis, R Programming, SQL, Business C...",4.8,4.8(137K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
2,2,Google Project Management:,Google,"Project Management, Strategy and Operations, ...",4.8,4.8(100K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
3,3,IBM Data Science,IBM,"Python Programming, Data Science, Machine Lea...",4.6,4.6(120K reviews),Beginner · Professional Certificate · 3 - 6 Mo...
4,4,Google Digital Marketing & E-commerce,Google,"Digital Marketing, Marketing, Marketing Manag...",4.8,4.8(23K reviews),Beginner · Professional Certificate · 3 - 6 Mo...


In [5]:
# Download the NLTK resources referenced by extract_skills_from_text below:
# 'punkt' for nltk.word_tokenize and 'stopwords' for stopwords.words('english').
# NOTE(review): recent NLTK releases reportedly look up 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pradu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pradu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Flat, de-duplicated vocabulary of every skill named in jobs_df['Skills'].
# - A set comprehension replaces sum(list_of_lists, []), which re-copies the
#   growing list for every row (quadratic in the number of rows).
# - dropna() keeps a row without skills from breaking the flattening.
# - Entries are stripped and empty fragments dropped, so " Linux" and "Linux"
#   collapse into one entry and a trailing comma cannot add "" to the vocabulary.
SKILLS_DB = list({
    skill.strip()
    for skills_cell in jobs_df['Skills'].dropna().astype(str)
    for skill in skills_cell.split(',')
    if skill.strip()
})

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, each page followed by one space.

    Args:
        file_path: Path (or file object) accepted by ``pdfplumber.open``.

    Returns:
        The concatenated page texts; pages for which ``extract_text()`` returns
        nothing are skipped. An empty string when no page yields text.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Extract once per page: the previous version called extract_text()
            # twice (once for the check, once for the append), doubling the work.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text + " ")
    return "".join(page_texts)

def extract_skills_from_text(text, skills_db=None):
    """Return the known skills mentioned in ``text``.

    Matching is case-insensitive and anchored on token boundaries, so a short
    skill such as "R" or "Go" is no longer reported merely because its letters
    occur inside another word ("experienced", "Google").

    Args:
        text: Free text to scan, e.g. the extracted text of a resume.
        skills_db: Optional iterable of skill names to look for. Defaults to the
            module-level ``SKILLS_DB``.

    Returns:
        A sorted list of the matched skill names (stripped, de-duplicated).
        Empty when ``text`` is empty or nothing matches.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if not text:
        return []
    if skills_db is None:
        skills_db = SKILLS_DB

    # The previous version also tokenized the text and filtered stop words, but
    # never used the result; that dead work (and its NLTK lookups) is removed.
    haystack = text.lower()
    found = set()
    for raw_skill in skills_db:
        skill = raw_skill.strip()
        if not skill:
            # "" is a substring of every text and would always "match".
            continue
        # Neighbours must not be word characters, '+' or '#', so "C" does not
        # match inside "C++" / "C#" and "Java" does not match "JavaScript".
        pattern = r"(?<![\w+#])" + re.escape(skill.lower()) + r"(?![\w+#])"
        if re.search(pattern, haystack):
            found.add(skill)
    return sorted(found)


In [7]:
# Pre-computed recommender artifacts consumed by recommend() below.
# Context managers close the file handles, which the bare open() calls left open.
# SECURITY: pickle.load executes whatever code the file embeds; only load
# artifacts produced by a trusted training run.
with open('similarity_matrix.pkl', 'rb') as similarity_file:
    similarity_matrix = pickle.load(similarity_file)
with open('svd.pkl', 'rb') as svd_file:
    svd = pickle.load(svd_file)

def recommend(skills_input, resume_file, job_role):
    """Run the skill-gap analysis for a job role and pick five courses.

    Skills come from the uploaded resume when one is given, otherwise from the
    comma-separated ``skills_input`` text.

    Args:
        skills_input: Comma-separated skills typed by the user (may be empty or None).
        resume_file: Uploaded PDF (a path string or an object exposing ``.name``), or None.
        job_role: Job title to compare against; must exist in ``jobs_df['Job Title']``.

    Returns:
        ``(markdown_text, course_list)``. ``course_list`` is a list of
        ``[title, rating]`` pairs, or None when the inputs are unusable, in
        which case ``markdown_text`` carries the message to show the user.
    """
    if not job_role:
        return "Please select a job role.", None

    # Tolerate None as well as "" for an untouched text box.
    typed_skills = (skills_input or "").strip()

    # Extract skills from either file or textbox
    resume_skills = []
    if resume_file is not None:
        # The upload may be a plain path string or a wrapper exposing `.name`.
        resume_path = getattr(resume_file, "name", resume_file)
        text = extract_text_from_pdf(resume_path)
        resume_skills = extract_skills_from_text(text)
    elif typed_skills:
        resume_skills = [s.strip() for s in typed_skills.split(',') if s.strip()]
    else:
        return "Please upload a resume or enter your skills.", None

    if not resume_skills:
        return "No skills found in the input. Please check the content.", None

    matching_rows = jobs_df.index[jobs_df['Job Title'] == job_role]
    if len(matching_rows) == 0:
        # Previously an IndexError from `.index[0]`.
        return "Unknown job role '{}'. Please pick one from the list.".format(job_role), None
    job_index = matching_rows[0]
    required_skills = [skill.strip() for skill in jobs_df.iloc[job_index]['Skills'].split(',') if skill.strip()]

    # Skill gap analysis, case-insensitive so that a typed "python" satisfies a
    # required "Python" instead of being reported as missing.
    known_skills = {skill.lower() for skill in resume_skills}
    missing_skills = [skill for skill in required_skills if skill.lower() not in known_skills]

    # Content-based recommendations: the five highest entries of this job's row.
    # NOTE(review): assumes similarity_matrix rows follow jobs_df order and its
    # columns follow courses_df order — confirm against the code that built it.
    similar_courses = list(enumerate(similarity_matrix[job_index]))
    similar_courses = sorted(similar_courses, key=lambda x: x[1], reverse=True)[:5]
    skill_based = courses_df.iloc[[i[0] for i in similar_courses]]

    # Collaborative filtering: score every course for the fixed user id "user".
    cf_recommend = courses_df.copy()
    cf_recommend['Predicted_Rating'] = cf_recommend['Title'].apply(lambda x: svd.predict(uid="user", iid=x).est)
    cf_recommend = cf_recommend.sort_values(by='Predicted_Rating', ascending=False).head(5)

    # Merge both recommendations; content-based rows win on duplicate titles.
    hybrid = pd.concat([skill_based, cf_recommend]).drop_duplicates(subset="Title").head(5)

    # Output formatting
    result = "### ✅ Extracted Skills:\n" + ", ".join(resume_skills) + "\n\n"
    result += "### ❌ Missing Skills for '{}':\n".format(job_role) + (", ".join(missing_skills) if missing_skills else "None") + "\n\n"
    result += "### 📚 Recommended Courses:\n"

    course_list = hybrid[['Title', 'Ratings']].values.tolist()
    return result, course_list

In [8]:
# Specialization -> required-skills table used to build the retrieval documents.
# Forward slashes keep the relative path valid on Windows, Linux and macOS; the
# previous "data\\..." literal only resolves on Windows.
df = pd.read_csv("data/large_skills_dataset.csv")
df.head()

Unnamed: 0,Specialization,Skills Required
0,SEO Consultant,"Hadoop, Agile, GCP, Spring Boot, Scrum, Redis,..."
1,SEO Architect,"PostgreSQL, Vue.js, Penetration Testing, Tenso..."
2,E-commerce Consultant,"MySQL, Git, Node.js, Big Data, Smart Contracts..."
3,Bioinformatics Specialist,"Linux, TensorFlow, Scikit-learn, MySQL, Data V..."
4,Cloud Specialist,"Firebase, Data Visualization, Deep Learning, A..."


In [9]:
# Name of the chat model, later passed to ChatOllama.
# NOTE: this rebinds the global `model`, which cell In [2] assigned to the
# unpickled classifier that Recommendations() reads at call time.
model = "llama3.2"

# OpenAI-compatible client pointed at a local endpoint.
# NOTE(review): the api_key looks like a placeholder for a local Ollama server — confirm.
openai = OpenAI(
    api_key="ollama",
    base_url="http://localhost:11434/v1",
)

In [10]:
# One two-line text block per row of df: the specialization, then its skills.
specs = [
    f"Specialization: {specialization}\nSkills Required: {skills_required}"
    for specialization, skills_required in zip(df['Specialization'], df['Skills Required'])
]

In [11]:
# Turn each spec text block into a Document whose metadata carries the
# specialization and its skills separately.
docs = []
for spec in specs:
    # Split on the full "\nSkills Required: " separator and strip only the
    # leading "Specialization: " label. The previous split("\n") + replace()
    # truncated values containing a newline and removed the label text anywhere
    # inside the value, not just at the start.
    head, _, skills = spec.partition("\nSkills Required: ")
    if head.startswith("Specialization: "):
        specialization = head[len("Specialization: "):]
    else:
        specialization = head
    specialization = specialization.strip()
    skills = skills.strip()

    doc = Document(
        page_content=spec,  # Full text
        metadata={
            "specialization": specialization,
            "skills": skills
        }
    )
    docs.append(doc)

texts = [doc.page_content for doc in docs]

In [12]:
# Display the current value of `model` (the chat model name set in In [9]).
model

'llama3.2'

In [13]:
# Job roles table used to build the job-detail retrieval documents.
# NOTE(review): presumably a re-encoded copy of the file loaded into jobs_df
# (the previews match) — confirm.
# Forward slashes keep the relative path valid on Windows, Linux and macOS; the
# previous "data\\..." literal only resolves on Windows.
it = pd.read_csv("data/encoded-IT_Job_Roles_Skills.csv")
it.head()

Unnamed: 0,Job Title,Job Description,Skills,Certifications
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War...","Cloudera Certified Professional (CCP), Hortonw..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C...",Red Hat Certified Specialist in Ansible Automa...
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr...","JFrog Artifactory Certification, DevOps Instit..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management...","AI-900: Microsoft Azure AI Fundamentals, Certi..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management...",Certified Artificial Intelligence Practitioner...


In [14]:
# Certifications are not used in the retrieval documents.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
it = it.drop(columns=['Certifications'], errors='ignore')
it.head()

Unnamed: 0,Job Title,Job Description,Skills
0,Admin Big Data,Responsible for managing and overseeing big da...,"Hadoop, Spark, MapReduce, Data Lakes, Data War..."
1,Ansible Operations Engineer,Focuses on automating IT processes using Ansib...,"Ansible, Linux, Automation, Cloud Platforms, C..."
2,Artifactory Administrator,Manages the Artifactory repository for build a...,"Artifactory, CI/CD, Jenkins, Docker, Maven, Gr..."
3,Artificial Intelligence / Machine Learning Leader,"Leads AI/ML projects and teams, defining strat...","AI Strategy, Machine Learning, Team Management..."
4,Artificial Intelligence / Machine Learning Sr....,Senior role overseeing multiple AI/ML initiati...,"AI Strategy, Machine Learning, Team Management..."


In [15]:
# One Document per job role: the page text lists title, description and skills,
# and the same three values are repeated as metadata.
jobdetails_doc = [
    Document(
        page_content=f"Job Title: {job_title}\nJob Description: {job_description}\nSkills: {job_skills}",
        metadata={
            "Job Title": job_title,
            "Job Description": job_description,
            "Skills": job_skills,
        },
    )
    for job_title, job_description, job_skills in zip(
        it['Job Title'], it['Job Description'], it['Skills']
    )
]

In [16]:
# Spot-check one generated Document (index 22 is an arbitrary sample).
print(jobdetails_doc[22])

page_content='Job Title: Machine learning Architect
Job Description: Designs machine learning systems architecture. Responsible for selecting appropriate frameworks, designing workflows, and ensuring scalability of ML models.
Skills: Machine Learning, Deep Learning, Cloud Computing, Data Science, System Design, AWS, Azure, Scalability, Security, Model Deployment' metadata={'Job Title': 'Machine learning Architect', 'Job Description': 'Designs machine learning systems architecture. Responsible for selecting appropriate frameworks, designing workflows, and ensuring scalability of ML models.', 'Skills': 'Machine Learning, Deep Learning, Cloud Computing, Data Science, System Design, AWS, Azure, Scalability, Security, Model Deployment'}


In [17]:
# Number of job-detail Documents built from `it`.
len(jobdetails_doc)

493

In [18]:
db_name="merged_db"

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Opens the Chroma collection persisted under db_name (created empty if absent).
merged_vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

# Nothing else in this notebook writes to the store, so on a fresh checkout the
# retriever would silently return no context. Populate it once when it is empty.
# NOTE(review): the retrieval output in In [20] shows both document formats, so
# the persisted store was presumably built from docs + jobdetails_doc — confirm.
# `docs` must still be the Document list from In [11] when this cell runs.
if merged_vectorstore._collection.count() == 0:
    merged_vectorstore.add_documents(docs + jobdetails_doc)

print(f"Vectorstore created with {merged_vectorstore._collection.count()} documents")


Vectorstore created with 713 documents


In [19]:
# Silence LangChain deprecation warnings for the rest of the session.
warnings.filterwarnings(action="ignore", category=LangChainDeprecationWarning)

# Chat model served by Ollama; `model` holds the model name string.
llm = ChatOllama(temperature=0.7, model=model)

# Retriever over the merged vector store, returning the 5 closest documents.
retriever = merged_vectorstore.as_retriever(search_kwargs=dict(k=5))

# Conversation memory plus a retrieval chain built on the objects above.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")
conversation_chain = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=retriever,
    llm=llm,
)

In [20]:
# Smoke test: show what the retriever returns for a sample question.
message = "Jobs I can get with Google Cloud as my skill"

# retriever.invoke() replaces the deprecated get_relevant_documents(). The
# result is bound to its own name so it no longer overwrites the global `docs`
# list of specialization Documents built in In [11].
retrieved_docs = retriever.invoke(message)
context = "\n\n".join(doc.page_content for doc in retrieved_docs)
print(context)

Specialization: Cloud Computing
Skills Required: Statistics, Git, Python, Hadoop, Adobe XD, SEO, PostgreSQL, Node.js

Specialization: Cloud Technician
Skills Required: Deep Learning, HTML, Redis, Content Marketing, Cryptography, MATLAB, Flask, Kubernetes, Linux, ElasticSearch

Specialization: Cloud Manager
Skills Required: Blockchain, SEO, NLP, Redis, PyTorch, Figma, Content Marketing, Machine Learning

Job Title: GCP DevOps Engineer
Job Description: Implements DevOps practices on Google Cloud Platform. Responsible for automating deployments, managing CI/CD pipelines, and ensuring system reliability.
Skills: Google Cloud, DevOps, CI/CD, Terraform, Ansible, Docker, Kubernetes, Linux, Automation, Scripting

Specialization: Cloud Developer
Skills Required: Kubernetes, Smart Contracts, Swift, CSS, PostgreSQL, Statistics, Angular, Computer Vision, Scrum


In [21]:
import gradio as gr

def student_form():
    """Build the student questionnaire and wire it to Recommendations().

    Creates the input components, a submit button and an output textbox in the
    currently active Gradio layout context, and registers the click handler.

    Returns:
        The created components as a list: the twelve inputs, the submit button
        and the output textbox.
    """
    gender = gr.Radio(["Male", "Female"], label="Gender")
    part_time_job = gr.Checkbox(label="Do you have a part-time job?")
    extracurricular_activities = gr.Checkbox(label="Do you participate in extracurricular activities?")
    absence_days = gr.Number(label="Number of Absence Days", precision=0)
    weekly_self_study_hours = gr.Number(label="Weekly Self Study Hours")

    math_score = gr.Number(label="Math")
    history_score = gr.Number(label="History")
    physics_score = gr.Number(label="Physics")
    chemistry_score = gr.Number(label="Chemistry")
    biology_score = gr.Number(label="Biology")
    english_score = gr.Number(label="English")
    geography_score = gr.Number(label="Geography")

    submit_btn = gr.Button("Get Career Recommendations")
    output = gr.Textbox(label="Top 5 Career Paths")

    def wrapped_recommendations(gender_value, has_part_time_job, absences, has_extracurriculars,
                                study_hours, math, history, physics,
                                chemistry, biology, english, geography):
        """Validate the form, call Recommendations() and format a numbered list.

        Mirrors wrapped_recommend in professional_form: the textbox gets
        readable lines instead of the raw repr of a list of tuples.
        """
        required_values = [gender_value, absences, study_hours, math, history,
                           physics, chemistry, biology, english, geography]
        if any(value is None for value in required_values):
            # A blank radio/number field arrives as None and would otherwise
            # surface as an opaque AttributeError/TypeError.
            raise gr.Error("Please select a gender and fill in every numeric field.")

        ranked_careers = Recommendations(gender_value, has_part_time_job, absences, has_extracurriculars,
                                         study_hours, math, history, physics,
                                         chemistry, biology, english, geography)
        return "\n".join(
            f"{rank}. {career} ({probability:.1%})"
            for rank, (career, probability) in enumerate(ranked_careers, start=1)
        )

    submit_btn.click(
        fn=wrapped_recommendations,
        inputs=[gender, part_time_job, absence_days, extracurricular_activities,
                weekly_self_study_hours, math_score, history_score, physics_score,
                chemistry_score, biology_score, english_score, geography_score],
        outputs=output
    )

    return [gender, part_time_job, extracurricular_activities, absence_days, weekly_self_study_hours,
            math_score, history_score, physics_score, chemistry_score, biology_score, english_score,
            geography_score, submit_btn, output]

def professional_form():
    """Build the skill-gap form for professionals and wire it to recommend().

    Creates a skills textbox, a PDF upload, a job-role dropdown, a submit button
    and two output components in the currently active Gradio layout context.

    Returns:
        The created components as a list: job role, skills input, resume file,
        submit button, result markdown and course textbox.
    """
    skills_input = gr.Textbox(label="Enter Skills (comma-separated)", lines=2)
    resume_file = gr.File(label="Upload Resume (optional)", file_types=[".pdf"])
    role_choices = jobs_df["Job Title"].unique().tolist()
    job_role = gr.Dropdown(choices=role_choices, label="Select Desired Job Role", value=None)

    submit_btn = gr.Button("Analyze & Recommend")
    result_textbox = gr.Markdown()
    course_textbox = gr.Textbox(label="Top 5 Course Recommendations")

    def wrapped_recommend(skills_input, resume_file, job_role):
        """Call recommend() and render its course list as numbered lines."""
        result_text, course_list = recommend(skills_input, resume_file, job_role)
        if not course_list:
            return result_text, "No courses found."
        numbered_courses = []
        for position, (title, rating) in enumerate(course_list, start=1):
            numbered_courses.append(f"{position}. {title} (⭐ {rating})")
        return result_text, "\n".join(numbered_courses)

    submit_btn.click(
        fn=wrapped_recommend,
        inputs=[skills_input, resume_file, job_role],
        outputs=[result_textbox, course_textbox],
    )

    return [job_role, skills_input, resume_file, submit_btn, result_textbox, course_textbox]

def chat4(message, history):
    """Answer a career question from retrieved context and the chat so far.

    Args:
        message: The user's latest message.
        history: Earlier turns as a list of ``{"role", "content"}`` dicts
            (Gradio "messages" format). It is read, never modified.

    Returns:
        A one-element list holding the assistant reply in messages format.
    """
    # retriever.invoke() replaces the deprecated get_relevant_documents().
    docs = retriever.invoke(message)
    context = "\n\n".join(doc.page_content for doc in docs)

    # Build the transcript on a copy: the previous version appended to the
    # list Gradio passed in, mutating the caller's state for no benefit.
    transcript = list(history or []) + [{"role": "user", "content": message}]
    conversation_history = "\n".join([f"{msg['role']}: {msg['content']}" for msg in transcript])

    prompt = f"""
You are a friendly chatbot, guiding the user about their career.
Use ONLY the following CONTEXT to answer the QUESTION.

Provide a friendly, motivating message about the user's career goal, along with the skills or job information they are asking for.

For skills:
- If the user asks about skills, provide all skills required for the specialization that matches the query. If no specialization is found, fall back to the job title’s context and provide those skills.
- Give a one-line idea about each skill mentioned, making them excited about acquiring the skills.
Example:
A Product Engineer bridges the gap between technical development and product design, using engineering skills to create scalable, user-focused products that solve real-world problems. This role blends coding, system design, data handling, and user experience to deliver top-tier tech products from concept to deployment.

Here’s a one-line exciting pitch for each required skill:

MySQL: Learn to speak the language of databases and unlock insights that drive impactful product decisions.

TypeScript: Build robust, maintainable apps with confidence using this powerful upgrade to JavaScript.

React: Craft sleek, responsive interfaces and bring products to life with one of the most in-demand front-end frameworks.

Java: Power backend systems with a battle-tested language known for performance, scalability, and reliability.

Hadoop: Dive into the world of big data and process massive information streams like a data wizard.

Computer Vision: Give your apps the ability to "see" and revolutionize user experience through intelligent image understanding.

For jobs:
- If the user asks about jobs, provide a list of all jobs and specializations that require the specified skills.
- Also give them job description and also *skills* for that job from job title context**(mention skills right below the specified job)**
Example:
1. **Machine Learning Engineer**: Build and deploy intelligent models that will revolutionize industries. Your dream job awaits!
Skills required: Python, Machine Learning, TensorFlow, Keras, PyTorch, Data Analysis, Deep Learning, Natural Language Processing, Computer Vision, Data Science, Algorithms

Do not repeat context as it is. Instead, motivate and inspire the user with the information you provide, and encourage them to keep learning.
Don't respond like: 'from your context', 'from given context', etc

Finally motivate them in there journey

CONTEXT:
{context}

HISTORY:
{conversation_history}

QUESTION:
{message}

STRICT ANSWER:
"""

    result = llm.invoke(prompt)

    # The messages format only knows the roles "user" and "assistant";
    # "bot" is not one of them.
    return [{"role": "assistant", "content": result.content}]

# Full UI: a radio selector on top and three initially hidden sections, of which
# only the one matching the current selection is shown.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Skill Gap & Career Recommendation System")

    user_type = gr.Radio(
        choices=["Student", "Professional", "Career Chat Assistant"],
        label="What option you are looking for?",
        value=None,
    )

    with gr.Group(visible=False) as student_section:
        student_ui_elements = student_form()

    with gr.Group(visible=False) as professional_section:
        professional_ui_elements = professional_form()

    with gr.Group(visible=False) as chatbot_section:
        gr.Markdown("### 💬 Career Guidance Chatbot")
        chatbot_ui = gr.ChatInterface(chat4, type="messages")

    def toggle_form(user_choice):
        """Return one visibility update per section, in output order."""
        section_labels = ("Student", "Professional", "Career Chat Assistant")
        return tuple(gr.update(visible=(user_choice == label)) for label in section_labels)

    user_type.change(
        fn=toggle_form,
        inputs=user_type,
        outputs=[student_section, professional_section, chatbot_section],
    )

demo.launch(inbrowser=True)


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


