In [2]:
from transformers import DistilBertTokenizer, DistilBertModel, AutoTokenizer, AutoModel
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
import faiss
from rank_bm25 import BM25Okapi

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

from huggingface_hub import login

# Read the Hugging Face access token from the HF_TOKEN environment variable so
# the secret is never stored in (or committed with) the notebook. When the
# variable is unset or empty, pass None so that `login` asks for the token
# interactively instead of failing on an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [4]:
# Checkpoint used for both the tokenizer and the encoder. An earlier variant of
# this cell loaded 'distilbert-base-uncased' through DistilBertTokenizer /
# DistilBertModel instead.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Put the model in evaluation mode; as the cell's last expression this also
# displays the module tree.
model.eval()

Fetching 2 files: 100%|██████████| 2/2 [15:11<00:00, 455.86s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:34<00:00, 17.14s/it]


MistralModel(
  (embed_tokens): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x MistralDecoderLayer(
      (self_attn): MistralAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      )
      (mlp): MistralMLP(
        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
    )
  )
  (norm): MistralRMSNorm((4096,), eps=1e-05)
  (rotary_emb): MistralRotaryEmbedding()
)

In [5]:
# Reuse the end-of-sequence token as the padding token so that the
# `padding=True` tokenizer calls further down have a pad token to use
# (presumably this tokenizer ships without one -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Use the "mps" backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Move the model's weights onto the selected device.
model = model.to(device)


In [16]:
# Read the cleaned job postings and preview the first rows.
JOBS_CSV = 'data/job_postings_cleaned.csv'
df = pd.read_csv(JOBS_CSV)
df.head()

Unnamed: 0,company_name,title,description,salary,location,formatted_experience_level,skills_desc,work_type,zip_code,job_matching
0,Essentia Health,Long Term Care Administrator,Looking to Make a Difference in Someone’s Life...,110000.0,"Virginia, MN",Director,,FULL_TIME,55792.0,Long Term Care Administrator Looking to Make a...
1,STIIIZY,Retail Procurement Assistant,We are seeking a proactive and detail-oriented...,58240.0,"Los Angeles, CA",Associate,,FULL_TIME,90001.0,Retail Procurement Assistant We are seeking a ...
2,Net2Source Inc.,Quality Control Inspector,Title: Product Quality Inspector Location: Alp...,104000.0,"Alpharetta, GA",Associate,,CONTRACT,30004.0,Quality Control Inspector Title: Product Quali...
3,Swoon,Creative Project Manager - 79439,Our client is a Fortune 100 company & leading ...,70720.0,"Chicago, IL",Associate,,CONTRACT,60601.0,Creative Project Manager - 79439 Our client is...
4,Murphy USA,CASHIER (full-time & part-time opportunities),Job Posting\n\nAs one of the largest national ...,28080.0,"Lapeer, MI",Entry level,,PART_TIME,48446.0,CASHIER (full-time & part-time opportunities) ...


In [17]:
# Number of postings per experience level.
experience_levels = df['formatted_experience_level']
experience_levels.value_counts()

formatted_experience_level
Mid-Senior level    4625
Entry level         3263
Associate           1382
Director             447
Internship           144
Executive            139
Name: count, dtype: int64

In [18]:
def get_embedding(text):
    """Embed ``text`` as the final-layer hidden state of its first token.

    The text is tokenized (truncated to at most 512 tokens), moved to the
    module-level ``device``, run through the module-level ``model`` with
    gradient tracking disabled, and the hidden state at sequence position 0
    is returned.

    NOTE(review): first-token pooling is the BERT-style [CLS] convention. The
    model loaded in this notebook is a Mistral decoder; if its attention is
    causal, position 0 never sees the rest of the text and this vector would
    be the same for every input -- confirm before using it for retrieval.
    ``get_mistral_embedding`` mean-pools over the sequence instead.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: the first-token hidden state; for a single string the
        batch axis is squeezed away, leaving a 1-D vector.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Copy to host memory first: Tensor.numpy() only works on CPU tensors and
    # raises when the model runs on another device such as "mps".
    return outputs.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()

def get_mistral_embedding(text):
    """Embed ``text`` by mean-pooling the model's final-layer hidden states.

    The text is tokenized (truncated to at most 512 tokens), moved to the
    module-level ``device`` and run through the module-level ``model`` with
    gradient tracking disabled. The hidden states are then averaged over the
    sequence axis, counting only positions whose attention-mask entry is 1.
    For a single string nothing is padded, so this equals the plain mean over
    all tokens; for a list of strings it keeps padding positions from
    diluting the shorter texts.

    Args:
        text: The string to embed (a list of strings is pooled per item).

    Returns:
        numpy.ndarray: a 1-D vector of size ``hidden_dim`` for a single
        string (the batch axis is squeezed away); ``(batch, hidden_dim)``
        for a batch of more than one text.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Final-layer hidden states, shape (batch, seq_len, hidden_dim).
    hidden_states = outputs.last_hidden_state

    # Attention mask as (batch, seq_len, 1) in the hidden-state dtype so it
    # broadcasts over the hidden dimension and zeroes out padding positions.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)

    # Masked mean: sum the real tokens and divide by how many there are
    # (clamped to 1 so an all-padding row cannot divide by zero).
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden_states * mask).sum(dim=1) / token_counts

    # Drop the batch axis for single inputs and hand back a host-side array.
    return pooled.squeeze(0).cpu().numpy()


In [19]:
# Embed every posting's `job_matching` text (cast to str in case a cell holds a
# non-string value) and stack the resulting vectors into a single 2-D array.
job_embeddings = np.vstack(
    [get_mistral_embedding(str(posting_text)) for posting_text in tqdm(df['job_matching'])]
)

  0%|          | 1/10000 [02:19<388:19:07, 139.81s/it]


KeyboardInterrupt: 

In [12]:
# Work on a fresh float32 copy of the embeddings (astype always copies), so the
# in-place normalisation further down leaves `job_embeddings` itself untouched.
job_embeddings_np = np.asarray(job_embeddings).astype('float32')

# Persist the embeddings before they are normalised.
np.save('saved_data/MiniLML6/job_embeddings.npy', job_embeddings_np)

# Cosine similarity is the inner product of unit-length vectors: L2-normalise
# the rows in place, then add them to a flat inner-product index.
faiss.normalize_L2(job_embeddings_np)
embedding_dim = job_embeddings_np.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(job_embeddings_np)

# Give df a fresh 0..n-1 index so that position i in the FAISS index
# corresponds to df.iloc[i].
df.reset_index(drop=True, inplace=True)

In [13]:
# Persist the job metadata and the FAISS index side by side in the same folder.
df.to_pickle("saved_data/MiniLML6/job_metadata.pkl")
faiss.write_index(index, "saved_data/MiniLML6/job_faiss.index")

In [14]:
# Reload the persisted job metadata and FAISS index from disk.
# (read_pickle should only ever be pointed at files you produced yourself.)
df = pd.read_pickle("saved_data/MiniLML6/job_metadata.pkl")
index = faiss.read_index("saved_data/MiniLML6/job_faiss.index")

In [15]:
def match_jobs(resume_text, df_jobs, job_embeddings, top_k=10):
    """Return the rows of ``df_jobs`` most similar to ``resume_text``.

    The embeddings belonging to ``df_jobs`` are selected from
    ``job_embeddings`` by index label, L2-normalised and placed in a
    temporary FAISS inner-product index. The resume is embedded with
    ``get_mistral_embedding`` -- the same function this notebook uses to
    build the job embeddings, so both sides live in the same vector space --
    normalised, and searched against that index.

    Args:
        resume_text: Resume text to match.
        df_jobs: Candidate postings. Its index labels are used as row
            numbers into ``job_embeddings``, so a filtered subset of the
            full table works as long as it keeps its original integer labels.
        job_embeddings: Per-posting embedding vectors (2-D array, or a
            sequence of equally sized 1-D vectors). Not modified.
        top_k: Maximum number of matches to return. When fewer candidates
            exist, all of them are returned.

    Returns:
        pandas.DataFrame: a copy of the matching rows of ``df_jobs`` in the
        order FAISS returns them, with an added ``similarity`` column holding
        the inner product of the normalised vectors. Empty (but with the
        ``similarity`` column) when ``df_jobs`` has no rows.
    """
    # 1. Nothing to search: return an empty frame with the expected columns
    #    instead of failing while stacking zero vectors.
    if len(df_jobs) == 0:
        no_matches = df_jobs.iloc[0:0].copy()
        no_matches['similarity'] = np.array([], dtype='float32')
        return no_matches

    # 2. Pick the candidates' embeddings by index label in one indexing step.
    #    Fancy indexing returns a new array, so the in-place normalisation
    #    below never touches the caller's `job_embeddings`.
    row_labels = df_jobs.index.to_numpy()
    filtered_embeddings = np.ascontiguousarray(
        np.asarray(job_embeddings)[row_labels], dtype="float32"
    )

    # 3. Embed the resume exactly like the postings and normalise it.
    resume_embedding = get_mistral_embedding(resume_text).astype('float32').reshape(1, -1)
    faiss.normalize_L2(resume_embedding)

    # 4. Inner product on unit-length vectors is cosine similarity.
    index = faiss.IndexFlatIP(resume_embedding.shape[1])
    faiss.normalize_L2(filtered_embeddings)
    index.add(filtered_embeddings)

    # 5. Never ask for more neighbours than there are candidates: FAISS pads
    #    missing results with label -1, which `iloc` would silently resolve
    #    to the last row.
    k = min(top_k, len(df_jobs))
    distances, indices = index.search(resume_embedding, k)

    # 6. FAISS labels are positions within `filtered_embeddings`, i.e.
    #    positions within `df_jobs`.
    results_df = df_jobs.iloc[indices[0]].copy()
    results_df['similarity'] = distances[0]

    return results_df

In [16]:
# Load the resume queries and pull the texts from the 'Resume_str' column.
resume_df = pd.read_csv('data/resume_queries.csv')
resume_texts = resume_df['Resume_str'].tolist()

# Map each resume text to a DataFrame with its 10 best-matching postings.
# The whole of `df` is searched; no experience-level filter is applied here.
results = {
    text: match_jobs(text, df, job_embeddings, top_k=10)
    for text in tqdm(resume_texts)
}

100%|██████████| 20/20 [00:01<00:00, 17.27it/s]


In [17]:
def _with_resume_column(resume_text, matches_df):
    """Return a copy of ``matches_df`` with ``resume_text`` as its first column."""
    tagged = matches_df.copy()
    tagged.insert(0, 'resume_text', resume_text)
    return tagged


# One tagged frame per resume, in the iteration order of `results`.
flattened_rows = [
    _with_resume_column(text, matches) for text, matches in results.items()
]

# Stack everything into one table with a fresh 0..n-1 index and write it out.
final_df = pd.concat(flattened_rows, ignore_index=True)
final_df.to_csv('MiniLM_Cosine.csv', index=False)