## **Import Libraries**

In [None]:
!pip install sentence-transformers
!pip install faiss-gpu

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import pandas as pd
import numpy as np
import os
import faiss
# from google.colab import drive
# drive.mount('/content/drive')

## **Read Dataset**

In [4]:
# Path to the CSV file
file_path = './Data/jobs_data.csv'

job_data = pd.read_csv(file_path)
job_data.head()

Unnamed: 0,job_title,description,requirements,career_level
0,Senior Developer Relations Manager,<p>Senior Developer Relations Manager page is ...,,Not specified
1,Costing Manager - Cairo,"<ul>\n<li>Supervise, design and implement a co...",<ul>\n<li>Bachelor’s degree in Accounting</li>...,Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,<p><b>About Us</b></p><br><p>Alfa Laval is a l...,,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified


## **Data Preparation**

### Data Cleansing

In [5]:
job_data.shape

(40000, 4)

In [6]:
from bs4 import BeautifulSoup
import re

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        # Remove HTML tags
        cleaned_text = BeautifulSoup(text, 'html.parser').get_text()
        # Remove extra whitespace and normalize line breaks
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        # Remove unnecessary punctuation
        cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
        # Remove leading and trailing spaces
        cleaned_text = cleaned_text.strip()
        return cleaned_text
    else:
        return text

# Clean the 'description' and 'requirements' columns
job_data['description'] = job_data['description'].apply(clean_text)
job_data['requirements'] = job_data['requirements'].apply(clean_text)


  cleaned_text = BeautifulSoup(text, 'html.parser').get_text()


In [7]:
job_data.head()

Unnamed: 0,job_title,description,requirements,career_level
0,Senior Developer Relations Manager,Senior Developer Relations Manager page is loa...,,Not specified
1,Costing Manager - Cairo,Supervise design and implement a consistently ...,Bachelors degree in Accounting 10 years progre...,Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,About UsAlfa Laval is a leading global provide...,,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified


### Concatenate columns with to create new one **job_details**

In [8]:
# Function to concatenate columns with specified format
def concatenate_columns(row):
    return f"[job_title] {row['job_title']} [description] {row['description']} [requirements] {row['requirements']} [career_level] {row['career_level']}"

# Create the new 'job_details' column
job_data['job_details'] = job_data.apply(concatenate_columns, axis=1)

In [9]:
job_data.to_csv("/content/saved_dataset/job_data_processed.csv", index=False)
job_data.head()

Unnamed: 0,job_title,description,requirements,career_level,job_details
0,Senior Developer Relations Manager,Senior Developer Relations Manager page is loa...,,Not specified,[job_title] Senior Developer Relations Manager...
1,Costing Manager - Cairo,Supervise design and implement a consistently ...,Bachelors degree in Accounting 10 years progre...,Manager,[job_title] Costing Manager - Cairo [descripti...
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager),[job_title] Banquet Supervisor [description] M...
3,Trade Finance & Credit Collection,About UsAlfa Laval is a leading global provide...,,Not specified,[job_title] Trade Finance & Credit Collection ...
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified,[job_title] Taste & Wellbeing Creative Marketi...


## **Document Embedding**

In [10]:
# Load pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the combined job details into dense vectors
job_data['embeddings'] = job_data['job_details'].apply(lambda x: model.encode(x))

# Convert embeddings to a numpy array
job_embeddings = np.vstack(job_data['embeddings'].values)

print(f"Number of job embeddings: {len(job_embeddings)}")
print(f"Embedding dimension: {job_embeddings.shape}")

# Convert embeddings to a DataFrame
embeddings_df = pd.DataFrame(job_embeddings)

# Display the embeddings DataFrame
embeddings_df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of job embeddings: 40000
Embedding dimension: (40000, 384)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.05584,-0.024423,-0.010135,-0.010457,-0.008871,-0.021036,-0.047775,-0.020302,-0.055645,-0.015645,...,-0.027673,0.000884,-0.02165,-0.027011,-0.011865,0.022204,0.093192,-0.090971,-0.007352,0.042767
1,-0.032493,0.054454,-0.051248,0.090706,-0.099144,0.01102,0.00992,0.017501,-0.027605,0.055007,...,0.041908,0.031774,0.00859,-0.011721,-0.001529,0.011831,0.063705,-0.044725,0.003046,0.015442
2,-0.002361,0.043782,0.014782,0.066233,-0.095586,0.031919,0.078575,-0.125943,-0.010117,-0.053071,...,-0.031051,0.025943,0.047285,-0.006249,0.053976,0.014085,0.023854,-0.010409,-0.06616,0.006581
3,-0.105171,-0.056978,-0.040802,0.002628,-0.001247,0.001723,8.2e-05,0.023778,0.018881,-0.085591,...,0.029452,0.04549,-0.021381,-0.004308,0.005761,-0.020008,0.047963,-0.062288,0.019927,-0.043167
4,-0.014408,-0.042388,0.05991,0.037302,-0.018109,0.013823,0.074219,-0.025937,-0.02551,-0.099077,...,-0.008969,-0.022591,-0.025541,0.022838,0.015753,0.050625,0.0736,-0.055761,-0.035343,-0.014509


In [11]:
# Display one text chunks corresponding with it's embedding.
i = 1
print(job_data['job_details'][i])
print(job_embeddings[i])

[job_title] Costing Manager - Cairo [description] Supervise design and implement a consistently effective cost control system and a company budget and reporting system Review and present budget proposals and prepare financial documents and reports to include suggestions on how to address and close financial gaps based on the data analyzed Manage and support performance achievement against function and individual KPIs of the team  Plan organize and direct all the activities of the cost accounting function Manage day to day operations of the existing cost accounting function from management of current unit costs to monthly close and analysis Manage the transition from the existing cost environment to a new cost system Identify hire and develop cost accountants Development and maintenance of standard costs Perform the monthly closing activities including preparation of the cost statement along with supporting analysis Proactive analysis of variances and communication of issues and opportu

In [12]:
# Save the array in compressed format
np.savez_compressed('/content/saved_dataset/job_embeddings_array.npz', array_data= job_embeddings)

## **Vector Database**

### Load the saved embeddings

In [14]:
# Load the compressed array
loaded_embeddings = np.load('/content/saved_dataset/job_embeddings_array.npz')

# Access the array by the name you specified ('my_array' in this case)
loaded_embeddings = loaded_embeddings['array_data']

loaded_embeddings.shape

(40000, 384)

### Create vector database object

In [15]:
embed_length = loaded_embeddings.shape[1]

index = faiss.IndexFlatL2(embed_length)

# Check if the index is trained.
# No training needed when using greedy search i.e. IndexFlatL2
print(index.is_trained)

# Add the embeddings to the index
index.add(loaded_embeddings)

# Check the total number of embeddings in the index
print(index.ntotal)

True
40000


In [16]:
# Load pre-trained Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load pre-trained Cross Encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Method to get career advice
def get_career_advice(query_text, top_k=5):
    # Encode query
    query_embedding = model.encode(query_text)
    query_embedding = np.expand_dims(query_embedding, axis=0)

    # Search in FAISS index
    scores, indices = index.search(query_embedding, top_k)

    # Retrieve job postings
    pred_list = list(indices[0])
    pred_strings_list = [job_data['job_details'].iloc[item] for item in pred_list]

    # Prepare input for cross-encoder
    cross_input_list = [[query_text, pred_text] for pred_text in pred_strings_list]

    # Score all retrieved passages using the cross_encoder
    cross_scores = cross_encoder.predict(cross_input_list)

    # Create DataFrame with results and scores
    df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])
    df['original_index'] = pred_list
    df['cross_scores'] = cross_scores

    # Sort the DataFrame in descending order based on the scores
    df_sorted = df.sort_values(by='cross_scores', ascending=False).reset_index(drop=True)

    return df_sorted.head(top_k)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [34]:
query_text = """"Machine Learning Engineer"""
results = get_career_advice(query_text, top_k = 3)
results

Unnamed: 0,query_text,pred_text,original_index,cross_scores
0,"""Machine Learning Engineer",[job_title] AI/ Machine Learning Sr. Engineer ...,11345,7.402164
1,"""Machine Learning Engineer",[job_title] Engineer - Machine Learning [descr...,34694,6.639116
2,"""Machine Learning Engineer",[job_title] Machine Learning Engineer [descrip...,1279,6.316038


## **Generative model to generate personalized career advice**

In [43]:
from huggingface_hub import login
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [48]:
from transformers import pipeline

# Load generative model
gen_model = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B")
# gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")

In [37]:
def generate_recommendations(query, job_results):
    prompt = f"User query: {query}\n\nRelevant Conext:\n"
    for _, job in job_results.iterrows():
        prompt += f"{job['pred_text']}\n\n"

    prompt += f"Based on your interest in the role of {query}, here are some personalized career advice:"

    response = gen_model(prompt, max_new_tokens=200)
    return response[0]['generated_text']

In [38]:
final_rec = generate_recommendations(query_text, results)
print(final_rec)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User query: "Machine Learning Engineer

Relevant Conext:
[job_title] AI/ Machine Learning Sr. Engineer [description] Being able to engage with the client to understand their pain points and requirementsCommunicate solutionspropositions effectively back to clientteamCollect and clean large datasets for machine learning projectsExplore data to identify patterns anomalies and potential insightsPreprocess data including feature engineering and normalizationDesign develop and implement machine learning algorithmsExperiment with various machine learning models and techniquesOptimize algorithms for accuracy efficiency and scalabilityTrain machine learning models using collected dataPerform testing to validate modelsEvaluate model performance using appropriate metrics and techniquesFinetune models to improve predictive accuracyCreate informative data visualizations to communicate insights effectivelyFamiliarity with visualization libraries like Matplotlib Seaborn or data visualization software