In [None]:
import os

import pandas as pd

# Location of the job-postings CSV. Set the JOB_DETAILS_CSV environment variable
# to point at your own copy; the fallback is the path this notebook was written with.
DATA_PATH = os.environ.get("JOB_DETAILS_CSV", "C:/Users/hsahn/Downloads/job_details.csv")

df = pd.read_csv(DATA_PATH)

# Every later cell reads df['description'], so fail here with a clear message
# rather than with a KeyError several cells down.
assert 'description' in df.columns, "job_details CSV must contain a 'description' column"

print(df.head())


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora that the stop-word list and the WordNet lemmatizer read from.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared by preprocess_text below.
lemmatizer = WordNetLemmatizer()
# Held as a set because preprocess_text tests membership once per token.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise free text: lowercase, strip punctuation, drop stop words, lemmatize.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            ``NaN``/``None`` that pandas yields for an empty cell) is treated
            as missing text.

    Returns:
        One space-separated string of lemmatized tokens, or ``''`` when the
        input is missing or nothing survives the filtering.
    """
    # Without this guard a single empty cell aborts the whole .apply() with
    # "AttributeError: 'float' object has no attribute 'lower'".
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Keep only ASCII letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = (lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)
    return ' '.join(tokens)

# pandas reads an empty CSV cell as NaN (a float), which has no .lower().
# Normalise missing or non-text values to strings first so one blank row
# cannot abort the whole column.
df['cleaned_description'] = df['description'].fillna('').astype(str).apply(preprocess_text)


# Keyword Extraction 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned descriptions, with the vocabulary capped at 1000 terms.
# Note: the name `vectorizer` is bound again in the "Matching Algorithm" cell,
# so after that cell runs it no longer refers to this object.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df['cleaned_description'])

# Vocabulary terms, in the same order as the columns of tfidf_matrix.
keywords = vectorizer.get_feature_names_out()

# Dense copy of the sparse matrix, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=keywords)


In [None]:
import os

import openai

# Read the key from the environment so it never lives in the notebook.
# Any key that was ever pasted into this cell must be treated as leaked:
# revoke it in the OpenAI dashboard and issue a new one.
openai.api_key = os.environ["OPENAI_API_KEY"]

def extract_keywords_with_llm(text, engine="text-davinci-003", max_tokens=50):
    """Ask an OpenAI completion model for the main keywords of a job description.

    Args:
        text: The job description. Missing or blank values (``NaN``, ``None``,
            ``''``) are skipped without calling the API.
        engine: Completion model to query.
            NOTE(review): OpenAI has retired text-davinci-003; a later cell in
            this notebook uses gpt-3.5-turbo-instruct - confirm which is intended.
        max_tokens: Upper bound on the length of the generated completion.

    Returns:
        The model's completion with surrounding whitespace stripped, or ``''``
        when there was no text to send.
    """
    # Do not spend an API call (or send the literal string "nan") on an empty row.
    if not isinstance(text, str) or not text.strip():
        return ''
    prompt = f"Extract the main keywords from the following job description:\n\n{text}\n\nKeywords:"
    response = openai.Completion.create(engine=engine, prompt=prompt, max_tokens=max_tokens)
    return response.choices[0].text.strip()

# One API request per row.
df['llm_keywords'] = df['description'].apply(extract_keywords_with_llm)


In [None]:
# %pip installs into the environment of the running kernel; `!pip` can resolve to
# a different interpreter. The version is pinned because the cells in this notebook
# call openai.Completion.create, which the 1.x client no longer provides.
%pip install openai==0.28


In [None]:
# Diagnostic: show which version of the openai package this kernel imports.
import openai
print(openai.__version__)


In [None]:
# Diagnostic: inspect the openai install of the running kernel's environment.
# (%pip targets the kernel; `!pip` may report on a different interpreter.)
%pip show openai


In [None]:
# Diagnostic: print the path of the Python interpreter that runs this kernel,
# to compare against the environment pip installs into.
import sys
print(sys.executable)


In [None]:
# Diagnostic: show which version of the openai package this kernel imports.
import openai
print(openai.__version__)


In [None]:
import os

import openai

# Read the key from the environment so it never lives in the notebook.
# Any key that was ever pasted into this cell must be treated as leaked:
# revoke it in the OpenAI dashboard and issue a new one.
openai.api_key = os.environ["OPENAI_API_KEY"]

def extract_keywords_with_llm(text, engine="gpt-3.5-turbo-instruct", max_tokens=50):
    """Ask an OpenAI completion model for the main keywords of a job description.

    This redefines the function of the same name from the earlier cell; the
    only difference there is the default model.

    Args:
        text: The job description. Missing or blank values (``NaN``, ``None``,
            ``''``) are skipped without calling the API.
        engine: Completion model to query.
        max_tokens: Upper bound on the length of the generated completion.

    Returns:
        The model's completion with surrounding whitespace stripped, or ``''``
        when there was no text to send.
    """
    # Do not spend an API call (or send the literal string "nan") on an empty row.
    if not isinstance(text, str) or not text.strip():
        return ''
    prompt = f"Extract the main keywords from the following job description:\n\n{text}\n\nKeywords:"
    response = openai.Completion.create(engine=engine, prompt=prompt, max_tokens=max_tokens)
    return response.choices[0].text.strip()

# One API request per row.
df['llm_keywords'] = df['description'].apply(extract_keywords_with_llm)

In [None]:
# Explicit %pip magic: a bare `pip install ...` line only works while IPython's
# automagic is enabled. 0.28 is the release that still has openai.Completion.
%pip install openai==0.28


# Matching Algorithm

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Labels the matcher can return. The matching code looks a label up with
# categories[<row index of the best-scoring description>], so
# category_descriptions must end up with exactly one entry per label,
# in this same order.
categories = [
    'Data Science',
    'Marketing',
    'Sales',
    'HR',
    'Engineering',
    'Operations',
    'Clinical Research',
    'Product Management',
    'Other'
]

# One general description per category, aligned with `categories`.
category_descriptions = [
    "Data Science involves data analysis, machine learning, statistics, and data visualization.",
    "Marketing includes campaign management, content creation, and social media strategy.",
    "Sales professionals focus on selling products or services to customers.",
    "HR professionals manage human resources functions such as recruitment, training, and employee relations.",
    "Engineering covers various fields including software, hardware, and systems engineering.",
    "Operations management involves overseeing the production of goods and services.",
    "Clinical Research involves conducting medical research studies to evaluate the safety and efficacy of treatments.",
    "Product Management involves developing and managing products throughout their lifecycle.",
    "Other encompasses roles that do not fit into the predefined categories.",
]

# Extra, role-level wording for each category.
role_descriptions = {
    'Engineering': [
        "Web Development focuses on building and maintaining websites using technologies such as HTML, CSS, and JavaScript.",
        "App Development involves creating mobile applications for different platforms like iOS and Android.",
        "Frontend Developers specialize in creating user interfaces and experiences.",
        "Backend Developers focus on server-side development and database management.",
        "Machine Learning Engineers develop and deploy machine learning models for various applications.",
    ],
    # 'Operations' used to appear twice in this literal; a dict keeps only the
    # last value for a repeated key, so the Founders Office entry was silently lost.
    'Operations': [
        "Founders Office refers to roles within the executive team responsible for strategic decision-making and leadership.",
        "Operations Managers oversee the day-to-day operations of a business, ensuring efficiency and effectiveness.",
    ],
    'Data Science': [
        "Data Analysts analyze data to extract insights and inform business decisions.",
        "Data Scientists utilize advanced statistical techniques and machine learning algorithms to solve complex problems.",
    ],
    'Sales': [
        "Sales professionals focus on identifying and acquiring new customers, as well as maintaining relationships with existing ones.",
    ],
    'Marketing': [
        "Marketing Consultants provide expert advice on marketing strategies and campaigns.",
    ],
    'HR': [
        "HR Consultants offer HR-related services such as recruitment, performance management, and organizational development.",
    ],
    'Clinical Research': [
        "Clinical Research Coordinators manage clinical trials and ensure compliance with regulatory requirements.",
    ],
    'Product Management': [
        "Product Managers are responsible for the strategy, development, and launch of new products.",
    ],
    'Other': [
        "Roles that do not fit into the predefined categories.",
    ],
}

# Fold each category's role text into that category's own description instead of
# appending it as extra list entries. Appending made the list (and the vectors
# fitted from it) longer than `categories`, so the best-scoring row index could
# point past the end of `categories` or at the wrong label.
category_descriptions = [
    ' '.join([base_description, *role_descriptions.get(category, [])])
    for category, base_description in zip(categories, category_descriptions)
]

# Rebinds `vectorizer` (also used by the keyword-extraction cell) to a new
# TF-IDF model that ignores scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the reference descriptions and vectorize them:
# one row per entry of category_descriptions as it stands when this line runs.
category_vectors = vectorizer.fit_transform(category_descriptions)


def match_category(row, fallback='Other'):
    """Return the category whose reference description best matches a job description.

    Assumes row ``i`` of ``category_vectors`` describes ``categories[i]``.

    Args:
        row: Anything indexable by ``'description'`` (a DataFrame row or a
            plain dict). A missing or non-string description is treated as
            empty text.
        fallback: Label returned when the description shares no vocabulary
            with any reference description.

    Returns:
        The best-matching entry of ``categories``, or ``fallback``.
    """
    description = row['description']
    # pandas hands over NaN (a float) for an empty cell; the vectorizer needs text.
    if not isinstance(description, str):
        description = ''
    tfidf_vector = vectorizer.transform([description])
    similarities = cosine_similarity(tfidf_vector, category_vectors)
    # With no overlap at all every similarity is 0 and argmax would return
    # index 0, labelling the row as the first category for no reason.
    if similarities.max() <= 0:
        return fallback
    best_match = categories[similarities.argmax()]
    return best_match

df['predicted_category'] = df.apply(match_category, axis=1)


In [None]:
# Sanity check: the label list, the fitted vectors and the reference texts
# are expected to line up one-to-one.
print(f"Length of categories: {len(categories)}")
print(f"Shape of category_vectors: {category_vectors.shape}")
print(f"Length of category_descriptions: {len(category_descriptions)}")


# LLMOps

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Application object that the /predict handler(s) below are registered on.
app = FastAPI()

# Request body accepted by /predict: a JSON object with a single string field.
class InputData(BaseModel):
    # Job-description text to classify.
    description: str

@app.post("/predict")
def predict(input_data: InputData):
    """Classify one job description and return its predicted category.

    Returns ``{"predicted_category": <label>}``. Any failure is reported as an
    HTTP 500 whose detail is the error message.
    """
    try:
        description = preprocess_text(input_data.description)
        # match_category vectorizes the text itself and reads row['description'],
        # so it must be given a mapping. Passing an already-vectorized matrix
        # (as before) made every request fail.
        category = match_category({"description": description})
        return {"predicted_category": category}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


In [None]:
import logging

logging.basicConfig(level=logging.INFO)

# Routes are tried in registration order and the first match handles the request,
# so a handler added for a path that is already registered is never called.
# Drop any earlier "/predict" route (from a previous cell or a re-run of this
# one) so that the logging version below is the one that serves requests.
app.router.routes[:] = [
    route for route in app.router.routes
    if getattr(route, "path", None) != "/predict"
]

@app.post("/predict")
def predict(input_data: InputData):
    """Classify one job description, logging the input and the outcome.

    Returns ``{"predicted_category": <label>}``. Any failure is logged with
    its traceback and reported as an HTTP 500 whose detail is the error message.
    """
    try:
        logging.info("Received input: %s", input_data.description)
        description = preprocess_text(input_data.description)
        # match_category vectorizes the text itself and reads row['description'],
        # so it must be given a mapping rather than a pre-computed TF-IDF matrix.
        category = match_category({"description": description})
        logging.info("Predicted category: %s", category)
        return {"predicted_category": category}
    except Exception as e:
        # logging.exception records the traceback along with the message.
        logging.exception("Error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


# Testing the API 

In [None]:
import requests

# Address of the locally running API; the server must be started before this cell.
url = "http://127.0.0.1:8000/predict"

test_descriptions = [
    "We are hiring a data analyst with expertise in SQL and Excel to manage and analyze company datasets. The successful candidate will be responsible for extracting, transforming, and analyzing large volumes of data to provide actionable insights. They should have strong analytical skills and be proficient in SQL, Excel, and data visualization tools. Additionally, the candidate should have experience working with stakeholders to understand business requirements and translate them into analytical solutions. A background in statistics or mathematics is preferred.",
    "Looking for a marketing coordinator to assist with campaign management, content creation, and social media strategy. The ideal candidate will have experience in developing and executing marketing campaigns across various channels, including social media, email, and digital advertising. They should be creative, detail-oriented, and have excellent communication skills. Responsibilities include coordinating with internal teams to develop marketing materials, analyzing campaign performance, and optimizing strategies based on data insights."
]

for description in test_descriptions:
    # The timeout keeps the cell from hanging if the server is not reachable.
    response = requests.post(url, json={"description": description}, timeout=10)
    print(f"Input: {description}")
    if response.ok:
        print(f"Prediction: {response.json()['predicted_category']}\n")
    else:
        # Error responses carry no 'predicted_category'; show what the server
        # said instead of failing with a KeyError.
        print(f"Request failed ({response.status_code}): {response.text}\n")
