In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m116.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [1]:
import pandas as pd
import random
import json

# --- CONFIGURATION ---
RESUME_FILE = 'master_resumes.jsonl'
JOBS_FILE = 'naukri_jobs.csv' # Ensure this matches your uploaded filename exactly

# --- STEP 1: LOAD & DIAGNOSE JOBS DATA ---
print("Loading Jobs Dataset...")
try:
    df_jobs = pd.read_csv(JOBS_FILE)
except Exception as e:
    print(f"❌ Error loading jobs file: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() ends the
    # session, whereas an exception stops "Run All" and shows the traceback.
    raise
print("✅ Jobs File Loaded Successfully.")
print(f"ACTUAL COLUMNS FOUND: {df_jobs.columns.tolist()}")

# --- STEP 2: SMART COLUMN MAPPING ---
# We look for keywords to identify the right columns.
# When several columns match the same role, the last one wins.
column_map = {}

for col in df_jobs.columns:
    col_lower = str(col).lower()  # str(): column labels are not guaranteed to be strings
    if 'title' in col_lower or 'role' in col_lower:
        column_map['title'] = col
    elif 'desc' in col_lower or 'job' in col_lower or 'summary' in col_lower:
        # Some datasets put description in a column named just "Job" or "Description";
        # skip identifier / URL columns that merely contain the word "job".
        if 'id' not in col_lower and 'link' not in col_lower:
            column_map['description'] = col
    elif 'skill' in col_lower:
        column_map['skills'] = col

print("\n--- MAPPING COLUMNS ---")
print(f"Mapped Title to: {column_map.get('title', 'NOT FOUND')}")
print(f"Mapped Description to: {column_map.get('description', 'NOT FOUND')}")

# Everything downstream (synthetic descriptions, title matching) needs a title
# column, so fail here with a clear message rather than with a KeyError later.
if 'title' not in column_map:
    raise ValueError(
        f"No job-title column found in {JOBS_FILE}. "
        f"Columns available: {df_jobs.columns.tolist()}"
    )
target_title_col = column_map['title']

# CRITICAL FIX: If no description column, create one from Title + Skills
if 'description' not in column_map:
    print("⚠️ No 'Description' column found! Creating synthetic description...")
    # fillna('') keeps missing values from propagating as NaN (and later as
    # the literal text "nan") into the synthetic description.
    titles = df_jobs[target_title_col].fillna('').astype(str)
    skill_col = column_map.get('skills')
    if skill_col:
        skills = df_jobs[skill_col].fillna('').astype(str)
        has_skills = skills.str.strip() != ''
        # Rows without skills fall back to the title alone.
        df_jobs['synthetic_desc'] = titles.where(
            ~has_skills, titles + " Requires skills: " + skills
        )
    else:
        df_jobs['synthetic_desc'] = titles
    target_desc_col = 'synthetic_desc'
else:
    target_desc_col = column_map['description']

# --- STEP 3: PROCESS RESUMES (From JSONL) ---
print("\nProcessing Resumes...")
processed_resumes = []

def flatten_resume(json_str):
    """Flatten one JSONL resume record into a text blob and a category label.

    Args:
        json_str: A single line of the resume file, expected to hold one JSON
            object.

    Returns:
        ``{"text": <lower-cased summary + technical skill names>,
        "category": <lower-cased title of the first experience entry>}``.
        The category falls back to ``"general"`` when no usable title exists.
        Returns ``None`` when the line is not valid JSON or is not a JSON
        object.

    Missing or ``null`` sections (``personal_info``, ``skills``, a ``null``
    title or skill name) are skipped instead of discarding the whole resume.
    """
    try:
        data = json.loads(json_str)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
        return None
    if not isinstance(data, dict):
        return None

    # Extract Category (Job Title) from the first experience entry.
    category = "General"
    experience = data.get('experience')
    if isinstance(experience, list) and experience and isinstance(experience[0], dict):
        title = experience[0].get('title')
        # A blank title would become the category '' which is a substring of
        # every job title, so treat it the same as a missing title.
        if isinstance(title, str) and title.strip():
            category = title.strip()

    # Extract Text
    text_parts = []
    personal_info = data.get('personal_info')
    if isinstance(personal_info, dict):
        summary = personal_info.get('summary')
        if isinstance(summary, str) and summary.strip():
            text_parts.append(summary)

    skills = data.get('skills')
    technical = skills.get('technical') if isinstance(skills, dict) else None
    if isinstance(technical, dict):
        for entries in technical.values():
            if not isinstance(entries, list):
                continue
            names = []
            for entry in entries:
                # Accept both {"name": "..."} objects and plain string entries.
                name = entry.get('name') if isinstance(entry, dict) else entry
                if isinstance(name, str) and name.strip():
                    names.append(name)
            if names:
                text_parts.append(", ".join(names))

    full_text = " ".join(text_parts).lower()
    return {"text": full_text, "category": category.lower()}

# Fixed seed so the randomly sampled pairs (and therefore train_data.csv)
# are identical on every run of this cell.
random.seed(42)

# Upper bound on positive (and on negative) pairs per resume category; keeps the run fast.
MAX_PAIRS_PER_CATEGORY = 20

# Read the JSONL with an explicit encoding so the result does not depend on
# the platform's default locale.
with open(RESUME_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        res = flatten_resume(line)
        if res:
            processed_resumes.append(res)

# Explicit columns so the frame is well-formed even if nothing was parsed.
df_resumes = pd.DataFrame(processed_resumes, columns=['text', 'category'])
if df_resumes.empty:
    raise ValueError(f"No resumes could be parsed from {RESUME_FILE}.")

# --- STEP 4: CREATE WEAK SUPERVISION PAIRS ---
print("Generating Training Pairs...")
training_pairs = []

# Prepare Job Lists
jobs_by_title = {}
# Clean job titles for matching
df_jobs['clean_title'] = df_jobs[target_title_col].astype(str).str.lower()
df_jobs['clean_desc'] = df_jobs[target_desc_col].astype(str).str.lower()

unique_resume_titles = df_resumes['category'].unique()

for title in unique_resume_titles:
    # An empty title is a substring of every job title and would label the
    # whole job table as a positive match, so skip it.
    # NOTE(review): resumes without a title carry the fallback category
    # "general" and are still matched against job titles containing that
    # word — confirm whether those should be excluded as well.
    if not title.strip():
        continue

    # 1. Find matching jobs (Positive Samples)
    # Logic: Job Title contains Resume Title (computed once, reused for negatives)
    is_match = df_jobs['clean_title'].str.contains(title, regex=False)
    matches = df_jobs[is_match]

    # 2. Find matching resumes
    resumes = df_resumes.loc[df_resumes['category'] == title, 'text'].tolist()

    if matches.empty or not resumes:
        continue

    job_texts = matches['clean_desc'].tolist()
    n_pairs = min(len(resumes), MAX_PAIRS_PER_CATEGORY)

    # Create POSITIVE Pairs (1.0)
    for _ in range(n_pairs):
        training_pairs.append([random.choice(resumes), random.choice(job_texts), 1.0])

    # Create NEGATIVE Pairs (0.0)
    # Pick jobs that explicitly DO NOT match this title
    non_match_texts = df_jobs.loc[~is_match, 'clean_desc'].tolist()
    if non_match_texts:
        for _ in range(n_pairs):
            training_pairs.append([random.choice(resumes), random.choice(non_match_texts), 0.0])

# --- STEP 5: SAVE ---
df_final = pd.DataFrame(training_pairs, columns=['resume', 'job', 'label'])
df_final.to_csv("train_data.csv", index=False)
print(f"\n✅ SUCCESS! Created 'train_data.csv' with {len(df_final)} pairs.")
print("You can now proceed to Fine-Tuning.")

Loading Jobs Dataset...
✅ Jobs File Loaded Successfully.
ACTUAL COLUMNS FOUND: ['Job_Titles', 'Company_Names', 'Experience_Required', 'Package_Details', 'Locations', 'Skills', 'Post_Url', 'Post_Time']

--- MAPPING COLUMNS ---
Mapped Title to: Job_Titles
Mapped Description to: NOT FOUND
⚠️ No 'Description' column found! Creating synthetic description...

Processing Resumes...
Generating Training Pairs...

✅ SUCCESS! Created 'train_data.csv' with 1044 pairs.
You can now proceed to Fine-Tuning.


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import math

# --- STEP 1: LOAD & CLEAN DATA ---
# Texts at or below this many characters are treated as placeholders
# (e.g. "not provided") rather than real resumes / job descriptions.
MIN_TEXT_CHARS = 50

print("1. Loading and Cleaning Data...")
df = pd.read_csv('train_data.csv')

# Drop rows where 'job', 'resume' or the 'label' is missing/NaN
# (a NaN label would turn the training loss into NaN).
initial_count = len(df)
df = df.dropna(subset=['job', 'resume', 'label'])

# Drop duplicates
df = df.drop_duplicates()

# Drop rows with "garbage" text (too short to be meaningful)
long_enough = (
    (df['resume'].astype(str).str.len() > MIN_TEXT_CHARS)
    & (df['job'].astype(str).str.len() > MIN_TEXT_CHARS)
)
df = df[long_enough]

clean_count = len(df)
print(f"   Original Rows: {initial_count}")
print(f"   Cleaned Rows:  {clean_count} (Dropped {initial_count - clean_count} bad rows)")

if clean_count < 10:
    raise ValueError("Data verification failed! Too few rows after cleaning.")

# --- STEP 2: PREPARE FOR AI TRAINING ---
print("\n2. Preparing Training Examples...")
# InputExample format: [Text1, Text2], Label (float).
# zip over the columns instead of iterrows(): no per-row Series is built.
train_examples = [
    InputExample(texts=[str(resume), str(job)], label=float(label))
    for resume, job, label in zip(df['resume'], df['job'], df['label'])
]

# Create a DataLoader (feeds data to the model in batches of 16)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# --- STEP 3: INITIALIZE MODEL ---
print("\n3. Loading Base Model (Generic Brain)...")
# 'all-MiniLM-L6-v2' is a small, fast sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define the Loss Function
# CosineSimilarityLoss regresses the cosine similarity of the two embeddings
# onto the float label (1.0 = match, 0.0 = non-match). It is not a
# contrastive/margin loss.
train_loss = losses.CosineSimilarityLoss(model)

# --- STEP 4: FINE-TUNE (The Training Loop) ---
print("\n4. Starting Training (Fine-Tuning)...")
print("   This may take 5-15 minutes depending on your hardware.")

# Warm up the learning rate over the first 10% of training steps
num_epochs = 1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True
)

# --- STEP 5: SAVE THE NEW MODEL ---
save_path = "my_custom_resume_model"
model.save(save_path)
print(f"\n✅ SUCCESS! Custom model saved to folder: '{save_path}'")
print("   You can now use this specific folder for your Matcher App.")

1. Loading and Cleaning Data...
   Original Rows: 1044
   Cleaned Rows:  886 (Dropped 158 bad rows)

2. Preparing Training Examples...

3. Loading Base Model (Generic Brain)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


4. Starting Training (Fine-Tuning)...
   This may take 5-15 minutes depending on your hardware.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


✅ SUCCESS! Custom model saved to folder: 'my_custom_resume_model'
   You can now use this specific folder for your Matcher App.


In [3]:
import pandas as pd
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer

# --- CONFIGURATION ---
JOBS_FILE = 'naukri_jobs.csv'
MODEL_PATH = 'my_custom_resume_model' # The folder you just created

# 1. LOAD & PREPARE JOBS DATA
print("1. Loading Job Database...")
df = pd.read_csv(JOBS_FILE)

# Build the single text field that represents each job. It uses the same
# "<title> Requires skills: <skills>" layout as the synthetic descriptions
# the model was fine-tuned on, so indexed text matches the training text.
print("   Creating searchable job descriptions...")
titles = df['Job_Titles'].fillna('').astype(str)
skills = df['Skills'].fillna('').astype(str)  # fillna: avoid the literal text "nan"
has_skills = skills.str.strip() != ''
df['search_text'] = titles.where(
    ~has_skills, titles + " Requires skills: " + skills
).str.lower()

# Keep only necessary columns for the App
# We save this as a separate file so the App is lightweight
df_clean = df[['Job_Titles', 'Skills', 'search_text']].copy()
df_clean['id'] = df_clean.index # Create a unique ID for every job

if df_clean.empty:
    raise ValueError(f"{JOBS_FILE} contains no jobs to index.")

print(f"   Database ready: {len(df_clean)} jobs.")

# 2. VECTORIZE (Using your Custom Brain)
print("\n2. Vectorizing Jobs (This takes time)...")
try:
    model = SentenceTransformer(MODEL_PATH)
    print("   ✅ Loaded Custom Siamese Model.")
except Exception as e:
    # Best-effort fallback to the base model, but report why the custom
    # model failed to load instead of hiding the error.
    print("   ⚠️ Custom model not found! Loading generic.")
    print(f"      Reason: {e}")
    model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all job descriptions into vectors (the expensive step).
# Unit-length vectors make the L2 ranking below equal to a cosine-similarity
# ranking, which is the quantity the model was fine-tuned on.
job_vectors = model.encode(
    df_clean['search_text'].tolist(),
    show_progress_bar=True,
    normalize_embeddings=True,
)

# 3. BUILD FAISS INDEX
print("\n3. Building FAISS Index...")
# FAISS requires a contiguous float32 matrix
job_vectors = np.ascontiguousarray(job_vectors, dtype='float32')

# Create the Index
dimension = job_vectors.shape[1] # Should be 384
index = faiss.IndexFlatL2(dimension)
index.add(job_vectors)

# Row i of the index must correspond to row i of the metadata table.
assert index.ntotal == len(df_clean), "FAISS index and job metadata are out of sync"

print(f"   Index built. Total vectors: {index.ntotal}")

# 4. SAVE EVERYTHING
print("\n4. Saving artifacts...")
faiss.write_index(index, "job_index.faiss")
df_clean.to_pickle("jobs_metadata.pkl")

print("\n✅ DONE! You have 2 new files:")
print("   1. 'job_index.faiss' (The Search Engine)")
print("   2. 'jobs_metadata.pkl' (The Text Database)")
print("   You are ready for Day 4 (XAI & UI).")

1. Loading Job Database...
   Creating searchable job descriptions...
   Database ready: 47191 jobs.

2. Vectorizing Jobs (This takes time)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

   ✅ Loaded Custom Siamese Model.


Batches:   0%|          | 0/1475 [00:00<?, ?it/s]


3. Building FAISS Index...
   Index built. Total vectors: 47191

4. Saving artifacts...

✅ DONE! You have 2 new files:
   1. 'job_index.faiss' (The Search Engine)
   2. 'jobs_metadata.pkl' (The Text Database)
   You are ready for Day 4 (XAI & UI).


In [6]:
import shutil

# Package the saved model folder as 'my_custom_resume_model.zip' in the
# working directory; the archive root is the folder's contents.
shutil.make_archive(
    base_name='my_custom_resume_model',
    format='zip',
    root_dir='my_custom_resume_model',
)
print("✅ Zipped! You can now download 'my_custom_resume_model.zip'")

✅ Zipped! You can now download 'my_custom_resume_model.zip'
