In [None]:
from google.colab import drive
drive.mount('/content/drive')

df = process_noran_csv("/content/drive/MyDrive/CareerMatch AI/Dataset.txt")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re
from dateutil import parser as date_parser
from datetime import datetime
import hashlib

# Extraction utils
ROLE_KEYWORDS = ["intern", "analyst", "engineer", "manager", "developer", "scientist", "assistant", "consultant", "associate", "specialist", "fellow", "co-op"]
DEGREE_KEYWORDS = ["bachelor", "master", "phd", "mba", "m.tech", "b.tech", "m.s.", "b.s.", "msc", "bsc"]

# 🔹 Extract start date if dates are mentioned
def extract_start_date(text):
    match = re.search(r'(\w+\.?\s?\d{4})\s?-', text, re.IGNORECASE)
    if not match:
        return None
    try:
        return date_parser.parse(match.group(1), default=datetime(1900, 1, 1))
    except:
        return None

# 🔹 Career Trajectory (sequential with dates preserved)
def extract_career_trajectory(text):
    results, seen = [], set()
    lines = text.splitlines()
    for i, line in enumerate(lines):
        lower = line.lower().strip()
        for kw in ROLE_KEYWORDS + DEGREE_KEYWORDS:
            if kw in lower and 1 < len(line.split()) < 16 and not line.strip().startswith(("•", "-")):
                context = line
                for j in range(1, 3):
                    if i + j < len(lines):
                        context += " " + lines[i + j].strip()
                start_date = extract_start_date(context) or datetime.min
                if line not in seen:
                    results.append((line.strip(), start_date))
                    seen.add(line)
    results.sort(key=lambda x: x[1])
    return [title for title, _ in results]  # return titles only in order

# 🔹 Skills Block
def extract_skills_block(text):
    match = re.search(r'SKILLS[\s\n]*((?:.*?)(?=(?:\n\n|\n[A-Z ]{2,}|\Z)))', text, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else ""

# 🔹 Generic block extractor by heading keyword
def extract_blocks(text, heading_keywords):
    lines = text.splitlines()
    blocks, current_block, capturing = [], [], False
    for line in lines:
        line_lower = line.lower().strip()
        is_heading = any(h in line_lower for h in heading_keywords) and len(line.strip().split()) <= 8
        if is_heading:
            if capturing and current_block:
                blocks.append("\n".join(current_block))
            capturing = True
            current_block = [line]
        elif capturing:
            if len(line.strip()) == 0 and current_block:
                blocks.append("\n".join(current_block))
                capturing = False
                current_block = []
            else:
                current_block.append(line)
    if current_block:
        blocks.append("\n".join(current_block))
    return "\n\n".join(blocks).strip() if blocks else None

def extract_education_block(text):
    return extract_blocks(text, ["education", "academics", "academic background"])

def extract_experience_block(text):
    return extract_blocks(text, ["experience", "employment", "work", "professional"])

# 🔹 Misc sections like projects, awards, etc.
def classify_section(header):
    header = header.lower()
    if "project" in header: return "project"
    if "volunteer" in header: return "volunteer"
    if "certificate" in header or "certification" in header: return "certification"
    if "award" in header or "achievement" in header: return "award"
    if "training" in header or "course" in header: return "training"
    return "misc"

def extract_misc_sections_labeled(text):
    lines = text.splitlines()
    trajectory = extract_career_trajectory(text)
    used = set(trajectory)
    skip_keywords = ["education", "experience", "work", "professional", "skills", "teaching", "training", "academic"]

    sections, current_header, current_block = [], None, []
    for line in lines:
        line = line.strip()
        if not line: continue
        if len(line.split()) <= 10 and any(h in line.lower() for h in ["project", "volunteer", "certification", "award", "achievement", "training"]):
            if current_block and current_header and not any(skip in current_header.lower() for skip in skip_keywords):
                joined = "\n".join(current_block).strip()
                if joined not in used:
                    sections.append({"type": classify_section(current_header), "text": joined})
            current_header = line
            current_block = []
        else:
            current_block.append(line)

    if current_block and current_header and not any(skip in current_header.lower() for skip in skip_keywords):
        joined = "\n".join(current_block).strip()
        if joined not in used:
            sections.append({"type": classify_section(current_header), "text": joined})

    return sections

# 🔹 Residual text using hash-based line comparison

def normalize_lines(text):
    return [line.strip() for line in text.splitlines() if line.strip()]

def hash_line(line):
    return hashlib.md5(line.encode("utf-8")).hexdigest()

def extract_residual_text(row):
    original_lines = normalize_lines(row["text"])
    removal_lines = []

    for col in ["education_block", "experience_block", "skills"]:
        if isinstance(row[col], str):
            removal_lines.extend(normalize_lines(row[col]))

    if isinstance(row.get("project_misc"), list):
        for block in row["project_misc"]:
            if isinstance(block, dict) and "text" in block:
                removal_lines.extend(normalize_lines(block["text"]))

    removal_hashes = set(hash_line(line) for line in removal_lines)
    residual_lines = [line for line in original_lines if hash_line(line) not in removal_hashes]

    return "\n".join(residual_lines).strip()

# 🔹 Pretty Print Single Resume Breakdown
def print_resume_sections(row):
    print("📌 RESUME CATEGORY:", row.get("category", "N/A"))
    print("\n📝 FULL TEXT\n" + "="*80 + f"\n{row.get('text', '')}\n")
    print("💙 EDUCATION\n" + "-"*80 + f"\n{row.get('education_block', '').strip()}\n")
    print("💼 EXPERIENCE\n" + "-"*80 + f"\n{row.get('experience_block', '').strip()}\n")
    print("🧠 SKILLS\n" + "-"*80 + f"\n{row.get('skills', '').strip()}\n")
    print("🌟 CAREER TRAJECTORY\n" + "-"*80)
    for title in row.get("career_trajectory", []):
        print(f"- {title}")
    print("\n🧹 PROJECTS & MISC\n" + "-"*80)
    for block in row.get("project_misc", []):
        print(f"[{block['type'].upper()}]\n{block['text']}\n")
    print("📄 RESIDUAL TEXT\n" + "-"*80 + f"\n{row.get('residual_text', '').strip()}")

# 🔹 Full Noran text resume processing
def process_noran_csv(csv_path):
    df = pd.read_csv(csv_path)
    df.columns = ["category", "text"]

    df["career_trajectory"] = df["text"].apply(extract_career_trajectory)
    df["skills"] = df["text"].apply(extract_skills_block)
    df["project_misc"] = df["text"].apply(extract_misc_sections_labeled)
    df["education_block"] = df["text"].apply(extract_education_block)
    df["experience_block"] = df["text"].apply(extract_experience_block)
    df["residual_text"] = df.apply(extract_residual_text, axis=1)

    return df

# Example usage:
df = process_noran_csv("/content/drive/MyDrive/CareerMatch AI/Dataset.txt")

In [None]:
df.head()

Unnamed: 0,category,text,career_trajectory,skills,project_misc,education_block,experience_block,residual_text
0,Accountant,﻿________________\r\n\r\nEDUCATION\r\nOMBA - E...,[OMBA - Executive Leadership University of Tex...,Quickbooks,"[{'type': 'award', 'text': 'Speaker | Bringing...",EDUCATION\nOMBA - Executive Leadership Univers...,TEACHING EXPERIENCE\nOnline Teacher - Udemy (2...,﻿________________
1,Accountant,﻿________________\r\n\r\nHOWARD GERRARD\r\nAcc...,[],Course details,[],EDUCATION\nUniversity\nCourse details\n2010-20...,WORK EXPERIENCE\nCompany name\nACCOUNTANT - Lo...,﻿________________\nHOWARD GERRARD\nAccountant\...
2,Accountant,﻿________________\r\n\r\nKevin Frank\r\nSENIOR...,"[Bachelor Of Commerce, Masters in Business Adm...",Misrosoft Office,[],Education\nBachelor Of Commerce\nSan Jose Stat...,Experience\nSenior Accountant & Customer Refer...,﻿________________\nKevin Frank\nSENIOR ACCOUNT...
3,Accountant,﻿________________\r\n\r\nPlace of birth\r\nNat...,"[Accountant I, Cobb & Associates, Association ...",,[],EDUCATION\nApr 2016 - Aug 2016\nAssociation fo...,EMPLOYMENT HISTORY\nMay 2017 Apr 2019\n—\nExpe...,﻿________________\nPlace of birth\nNationality...
4,Accountant,"﻿________________\r\n\r\nStephen Greet, CPA\r\...","[MBA Accounting 2012, Bachelor of Arts Account...",QuickBooks,[],EDUCATION\nUniversity of Pittsburgh\nMBA Accou...,WORK EXPERIENCE\nTeachers Pay Teachers\nSenior...,"﻿________________\nStephen Greet, CPA\nThrough..."


In [None]:
def show_sample_row(df, row_idx=0, max_chars=1000000):
    print("Sample Row (Index =", row_idx, "):\n")

    if row_idx >= len(df):
        print(f"Row index {row_idx} out of bounds. Total rows: {len(df)}")
        return

    row = df.iloc[row_idx]

    print(f"CATEGORY\n{row.get('category', 'N/A')}")
    print("=" * 80)

    print("FULL TEXT")
    print("-" * 80)
    print(row.get('text', '')[:max_chars])
    print("=" * 80)

    print("EDUCATION")
    print("-" * 80)
    print(row.get('education_block', '').strip())
    print("=" * 80)

    print("💼 EXPERIENCE")
    print("-" * 80)
    print(row.get('experience_block', '').strip())
    print("=" * 80)

    print("SKILLS")
    print("-" * 80)
    print(row.get('skills', '').strip())
    print("=" * 80)

    print("CAREER TRAJECTORY")
    print("-" * 80)
    trajectory = row.get("career_trajectory", [])
    if isinstance(trajectory, list) and len(trajectory) > 0:
        for title in trajectory:
            print(f"- {title}")
    else:
        print("(None)")
    print("=" * 80)

    print("PROJECTS & MISC")
    print("-" * 80)
    for block in row.get("project_misc", []):
        print(f"[{block.get('type', 'misc').upper()}]\n{block.get('text', '')}\n")
    print("=" * 80)

    print("RESIDUAL TEXT")
    print("-" * 80)
    print(row.get('residual_text', '').strip())
    print("=" * 80)
show_sample_row(df=df)

📌 Sample Row (Index = 0 ):

🟩 CATEGORY
Accountant
📝 FULL TEXT
--------------------------------------------------------------------------------
﻿________________

EDUCATION
OMBA - Executive Leadership University of Texas
2016-2018
O Bachelor of Science in Accounting Richland College
2005-2008
TRAINING & CERTIFICATIONS
Certified Management Accountant (CMA)
Certified Financial Modeling and Valuation Analyst
Compliance and Anti-Money Laundering (09/2016) American Institute of Banking
Certified Public Account (CPA)
Lean Six Sigma Green Belt Certified
Trade Products Financial Regulations (08/2016) American Institute of Banking
ACHIEVEMENTS
Speaker | Bringing out the Leader within YOU (08/2019)
Successfully presented an empowering speech on leadership to a 500+ participants.
Speaker | Dallas Convention of CPAs (03/2019) Successfully delivered a seminar to 3K+ CPAs and convention guests
TEACHING EXPERIENCE
Online Teacher - Udemy (2017)
Taught Online Accounting for Non-Accou

In [None]:
import pandas as pd
import numpy as np

def custom_isnull(val):
    try:
        if val is None:
            return True
        if isinstance(val, float) and np.isnan(val):
            return True
        if isinstance(val, str) and val.strip() == "":
            return True
        if isinstance(val, (list, dict, set)) and len(val) == 0:
            return True
        return False
    except:
        return False

def run_custom_diagnostics(df):
    print("Resume Data Diagnostics Summary\n")

    total_rows = len(df)
    print(f"Total Rows: {total_rows:,}\n")

    null_counts = {}
    for col in df.columns:
        null_counts[col] = df[col].map(custom_isnull).sum()

    null_percent = {col: round((count / total_rows) * 100, 2) for col, count in null_counts.items()}
    non_null = {col: total_rows - count for col, count in null_counts.items()}

    diagnostics_df = pd.DataFrame({
        "Null Count": null_counts,
        "Null %": null_percent,
        "Non-Null Count": non_null
    })

    print(diagnostics_df)

run_custom_diagnostics(df)

📊 Resume Data Diagnostics Summary

Total Rows: 13,389

                   Null Count  Null %  Non-Null Count
category                    0    0.00           13389
text                        0    0.00           13389
career_trajectory        1170    8.74           12219
skills                   1143    8.54           12246
project_misc             6974   52.09            6415
education_block          1709   12.76           11680
experience_block          262    1.96           13127
residual_text               0    0.00           13389


In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import hashlib
import logging
import os

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 1: Prepare section-wise text and label
logger.info("Step 1: Preparing data for embedding and model input...")
section_cols = ["skills", "project_misc", "education_block", "experience_block", "residual_text"]

def deduplicated_combine_sections(row):
    seen_hashes = set()
    section_texts = []

    # Add skills, project_misc, education, experience
    for col in section_cols[:-1]:
        val = row.get(col)
        if isinstance(val, str):
            block = val.strip()
            block_hash = hashlib.md5(block.encode()).hexdigest()
            if block and block_hash not in seen_hashes:
                section_texts.append(block)
                seen_hashes.add(block_hash)
        elif isinstance(val, list):
            for b in val:
                if isinstance(b, dict) and "text" in b:
                    block = b["text"].strip()
                    block_hash = hashlib.md5(block.encode()).hexdigest()
                    if block and block_hash not in seen_hashes:
                        section_texts.append(block)
                        seen_hashes.add(block_hash)

    # Add unique lines from residual_text
    residual = row.get("residual_text", "")
    unique_lines = []
    for line in residual.splitlines():
        line = line.strip()
        if not line:
            continue
        line_hash = hashlib.md5(line.encode()).hexdigest()
        if line_hash not in seen_hashes:
            unique_lines.append(line)
            seen_hashes.add(line_hash)

    if unique_lines:
        section_texts.append("\n".join(unique_lines))

    return "\n\n".join(section_texts).strip()

df["combined_text"] = df.apply(deduplicated_combine_sections, axis=1)
df = df[df["combined_text"].str.strip().astype(bool)]

# Encode labels
le = LabelEncoder()
df["label_encoded"] = le.fit_transform(df["category"])

# Step 2: Train-test split
logger.info("Step 2: Performing stratified train-test split...")
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label_encoded"], random_state=42)

# Step 3: Embedding preparation
logger.info("Step 3: Generating embeddings using SentenceTransformer...")
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

train_embeddings = embedder.encode(train_df["combined_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
test_embeddings = embedder.encode(test_df["combined_text"].tolist(), convert_to_tensor=True, show_progress_bar=True)

train_labels = torch.tensor(train_df["label_encoded"].values)
test_labels = torch.tensor(test_df["label_encoded"].values)

# ✅ Save embeddings and metadata
os.makedirs("output_embeddings", exist_ok=True)
torch.save(train_embeddings, "output_embeddings/train_embeddings.pt")
torch.save(test_embeddings, "output_embeddings/test_embeddings.pt")
train_df[["label_encoded", "category", "combined_text"]].to_parquet("output_embeddings/train_metadata.parquet", index=False)
test_df[["label_encoded", "category", "combined_text"]].to_parquet("output_embeddings/test_metadata.parquet", index=False)
logger.info("✅ Saved embeddings and metadata to 'output_embeddings/'")

# Step 4: Define model
logger.info("Step 4: Defining neural network...")
class ResumeClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ResumeClassifier, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        return self.net(x)

input_dim = train_embeddings.shape[1]
num_classes = len(le.classes_)
model = ResumeClassifier(input_dim=input_dim, hidden_dim=256, output_dim=num_classes)

# Step 5: Training loop
logger.info("Step 5: Training the model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
batch_size = 64
epochs = 5

class ResumeDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

train_loader = DataLoader(ResumeDataset(train_embeddings, train_labels), batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    logger.info(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")

# Step 6: Compute query embedding and find top 10 matches
logger.info("Step 6: Finding top 10 matches for a query...")

custom_query = """
Looking for a Data Scientist with 3 years of experience in Python, SQL, and deep learning frameworks.
Must have worked on customer analytics, churn prediction, and recommendation engines.
Experience with cloud platforms like AWS or GCP, and a solid understanding of ML pipelines and productionizing models.
Open-source contributions or technical blogs will be a bonus.
"""
query_embedding = embedder.encode(custom_query, convert_to_tensor=True)

train_embeddings_np = train_embeddings.cpu().numpy()
query_embedding_np = query_embedding.cpu().numpy()
cosine_scores = np.dot(train_embeddings_np, query_embedding_np) / (
    np.linalg.norm(train_embeddings_np, axis=1) * np.linalg.norm(query_embedding_np)
)

top_k = 10
top_indices = np.argsort(cosine_scores)[-top_k:][::-1]
top_matches = train_df.iloc[top_indices]

logger.info("🔍 Top 10 matches for the query:")
for rank, idx in enumerate(top_indices, 1):
    row = train_df.iloc[idx]
    logger.info(f"{rank}. Category: {row['category']} | Score: {cosine_scores[idx]:.4f}")
    logger.debug(f"Text Preview:\n{row['combined_text'][:300]}...\n")

logger.info("\n🔍 Best Matching Resume Preview:\n" + "-"*50)
print(top_matches.iloc[0]["combined_text"])

# Step 7: Evaluate model on test set
logger.info("Step 7: Evaluating on test set...")
model.eval()
with torch.no_grad():
    test_outputs = model(test_embeddings.to(device))
    predictions = torch.argmax(test_outputs, dim=1)
    accuracy = (predictions.cpu() == test_labels).float().mean().item()

logger.info(f"✅ Test Accuracy: {accuracy:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/335 [00:00<?, ?it/s]

Batches:   0%|          | 0/84 [00:00<?, ?it/s]

Programming Languages & Tools

EDUCATION
B.S. Data Science
Stanford University
•
2012-2016
Palo Alto, CA
Data Science Major in Foundational courses in Mathematics, Computing and Statistics
Part of the Maths & Statistics Society for 3 years
•
Took additional courses in Big Data Ecosystems and Data Visualisation GPA 3.8/4.0
STRENGTHS
Not afraid of wrong ideas Getting it wrong is almost always part of getting it right. My bad idea can give you the spark needed to help us succeed, or vice versa. We can work better by helping each other spot flaws and suggest improvements.
Keen learner
Always learning in order to keep up-to-date with technology enhancements. Still in my early years as a Data scientist and
therefore keen to learn as much as possible.
* Flexible
Fine to work overtime or
complete extra training if needed.
EXPERIENCE
Junior Data Scientist
Herman LLC
Herman LLC is an NGO gathering data about internal displacement
• Working in a team of 6 people
•
•
2016 - Ongoing
Santa Ana, CA
D

In [6]:
# --- Mount Google Drive to access files ---
from google.colab import drive

# Mount the user's Google Drive at '/content/drive'
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!ls "/content/drive/MyDrive"

 136004274_Paperbreakdown.pptx	 Resume_2025.gdoc
'CareerMatch AI'		'Resume_Summer (1).gdoc'
 Classroom			 STAT650-Project-2-Report.gdoc
'Colab Notebooks'		 step_sizes.gdoc
 coverLetter_Oracle.gdoc	'Untitled document (1).gdoc'
 Fitness_Dashboard_Report.gdoc	'Untitled document (2).gdoc'
'modified newton.gdoc'		'Untitled document (3).gdoc'
'Newtons method.gdoc'		'Untitled document (4).gdoc'
'Prachi resume.pdf'		'Untitled document (5).gdoc'
'Prachi Stat 650 Report.gdoc'	'Untitled document.gdoc'
'Prachi Summary - 2.gdoc'


In [8]:
!find /content/drive/MyDrive -name "Dataset.txt"

/content/drive/MyDrive/CareerMatch AI/Dataset.txt


In [9]:
# --- Imports ---
import pandas as pd
import re
from dateutil import parser as date_parser
from datetime import datetime

# --- Constants ---
# Keywords to detect job roles and degrees
ROLE_KEYWORDS = ["intern", "analyst", "engineer", "manager", "developer", "scientist",
                 "assistant", "consultant", "associate", "specialist", "fellow", "co-op"]
DEGREE_KEYWORDS = ["bachelor", "master", "phd", "mba", "m.tech", "b.tech",
                   "m.s.", "b.s.", "msc", "bsc"]

# --- Helper Functions ---

# Extract a start date (year) from a line of text if present
def extract_start_date(text):
    match = re.search(r'(\w+\.?\s?\d{4})\s?-', text, re.IGNORECASE)
    if not match:
        return None
    try:
        return date_parser.parse(match.group(1), default=datetime(1900, 1, 1))
    except:
        return None

# Extract career trajectory titles (roles/degrees) from resume text
def extract_career_trajectory(text):
    results, seen = [], set()
    lines = text.splitlines()
    for i, line in enumerate(lines):
        lower = line.lower().strip()
        for kw in ROLE_KEYWORDS + DEGREE_KEYWORDS:
            if kw in lower and 1 < len(line.split()) < 16 and not line.strip().startswith(("•", "-")):
                # Capture line and nearby context (next 2 lines)
                context = line
                for j in range(1, 3):
                    if i + j < len(lines):
                        context += " " + lines[i + j].strip()
                start_date = extract_start_date(context) or datetime.min
                if line not in seen:
                    results.append((line.strip(), start_date))
                    seen.add(line)
    results.sort(key=lambda x: x[1])  # Sort by start date
    return [title for title, _ in results]

# Extract 'Skills' block of text from the resume
def extract_skills_block(text):
    match = re.search(r'SKILLS[\s\n]*((?:.*?)(?=(?:\n\n|\n[A-Z ]{2,}|\Z)))', text, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else ""

# Generic function to extract sections based on heading keywords
def extract_blocks(text, heading_keywords):
    lines = text.splitlines()
    blocks, current_block, capturing = [], [], False
    for line in lines:
        line_lower = line.lower().strip()
        is_heading = any(h in line_lower for h in heading_keywords) and len(line.strip().split()) <= 8
        if is_heading:
            if capturing and current_block:
                blocks.append("\n".join(current_block))
            capturing = True
            current_block = [line]
        elif capturing:
            if len(line.strip()) == 0 and current_block:
                blocks.append("\n".join(current_block))
                capturing = False
                current_block = []
            else:
                current_block.append(line)
    if current_block:
        blocks.append("\n".join(current_block))
    return "\n\n".join(blocks).strip() if blocks else None

# Extract 'Education' section specifically
def extract_education_block(text):
    return extract_blocks(text, ["education", "academics", "academic background"])

# Extract 'Experience' section specifically
def extract_experience_block(text):
    return extract_blocks(text, ["experience", "employment", "work", "professional"])

# --- Main Function to Process the Resume Dataset ---

def process_noran_csv(csv_path):
    # Read CSV file
    df = pd.read_csv(csv_path)
    df.columns = ["category", "text"]  # Rename columns

    # Apply extraction functions
    df["career_trajectory"] = df["text"].apply(extract_career_trajectory)
    df["skills"] = df["text"].apply(extract_skills_block)
    df["education_block"] = df["text"].apply(extract_education_block)
    df["experience_block"] = df["text"].apply(extract_experience_block)

    return df


In [10]:
# --- Ensure the 'df' (DataFrame) is loaded ---

try:
    # If df already exists (previously loaded), just use it
    df
except NameError:
    # If df is not yet defined, load it from the specified path
    df = process_noran_csv("/content/drive/MyDrive/CareerMatch AI/Dataset.txt")


In [11]:
df.head()


Unnamed: 0,category,text,career_trajectory,skills,education_block,experience_block
0,Accountant,﻿________________\r\n\r\nEDUCATION\r\nOMBA - E...,[OMBA - Executive Leadership University of Tex...,Quickbooks,EDUCATION\nOMBA - Executive Leadership Univers...,TEACHING EXPERIENCE\nOnline Teacher - Udemy (2...
1,Accountant,﻿________________\r\n\r\nHOWARD GERRARD\r\nAcc...,[],Course details,EDUCATION\nUniversity\nCourse details\n2010-20...,WORK EXPERIENCE\nCompany name\nACCOUNTANT - Lo...
2,Accountant,﻿________________\r\n\r\nKevin Frank\r\nSENIOR...,"[Bachelor Of Commerce, Masters in Business Adm...",Misrosoft Office,Education\nBachelor Of Commerce\nSan Jose Stat...,Experience\nSenior Accountant & Customer Refer...
3,Accountant,﻿________________\r\n\r\nPlace of birth\r\nNat...,"[Accountant I, Cobb & Associates, Association ...",,EDUCATION\nApr 2016 - Aug 2016\nAssociation fo...,EMPLOYMENT HISTORY\nMay 2017 Apr 2019\n—\nExpe...
4,Accountant,"﻿________________\r\n\r\nStephen Greet, CPA\r\...","[MBA Accounting 2012, Bachelor of Arts Account...",QuickBooks,EDUCATION\nUniversity of Pittsburgh\nMBA Accou...,WORK EXPERIENCE\nTeachers Pay Teachers\nSenior...


In [12]:
# --- Imports ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from collections import Counter
import re

# --- Utility Functions ---

# Function to clean titles: removes years, dates, punctuation, and extra spaces
def clean_title(title):
    title = title.lower()
    title = re.sub(r'\b\d{4}\b', '', title)          # Remove 4-digit years like 2020
    title = re.sub(r'\b(0?[1-9]|1[0-2])\/\b', '', title)  # Remove MM/ patterns
    title = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', '', title) # Remove full dates
    title = re.sub(r'\d{1,2}/\d{4}', '', title)       # Remove MM/YYYY patterns
    title = re.sub(r'\d{1,2}-\d{1,2}', '', title)      # Remove ranges like 01-02
    title = re.sub(r'to.*', '', title)                # Remove "to Present" etc.
    title = re.sub(r'[\(\)\[\]\.,:;\'"•\-]', '', title) # Remove punctuation
    title = re.sub(r'\s+', ' ', title)                # Normalize spaces
    title = re.sub(r'^(0[1-9]|1[0-2])$', '', title)    # Remove single MM numbers
    return title.strip().title()

# Function to standardize job titles into broader categories
def standardize_education_titles(title):
    title = title.lower()
    if "computer science" in title: return "Computer Science"
    if "business" in title: return "Business"
    if "engineering" in title: return "Engineering"
    if "arts" in title or "psychology" in title or "sociology" in title: return "Humanities"
    if "accounting" in title or "finance" in title or "commerce" in title: return "Accounting/Finance"
    if "developer" in title or "engineer" in title: return "Software/IT"
    return title.title()  # Default: capitalize nicely

# Function to create input-output training pairs from sequential trajectories
def create_training_pairs(trajectories):
    X, y = [], []
    for path in trajectories:
        if isinstance(path, list) and len(path) >= 2:
            for i in range(1, len(path)):
                X.append(" > ".join(path[:i]))  # Input: history up to i
                y.append(path[i])               # Output: next job
    return X, y

# --- Data Preparation ---

# Step 1: Extract training samples from career trajectories
X_raw, y_raw = create_training_pairs(df["career_trajectory"].dropna())
print(f"🔹 Total samples before cleaning: {len(X_raw)}")

# Step 2: Clean and standardize job titles
y_cleaned = [standardize_education_titles(clean_title(title)) for title in y_raw]

# Optional: Print some examples
for i in range(3):
    print(f"Original: {y_raw[i]}")
    print(f"Cleaned:  {y_cleaned[i]}\n")

# Step 3: Filter out samples where input text is too long (>300 characters)
X_filtered, y_filtered = zip(*[(x, y) for x, y in zip(X_raw, y_cleaned) if len(x) <= 300])
print(f"🔹 Total samples after filtering: {len(X_filtered)}")

# Step 4: Keep only the top 200 most frequent cleaned job titles
top_titles = {title for title, _ in Counter(y_filtered).most_common(200)}
X_final, y_final = zip(*[(x, y) for x, y in zip(X_filtered, y_filtered) if y in top_titles])
print(f"Samples after limiting to top 200 titles: {len(X_final)}")
print(f"Unique cleaned job titles: {len(set(y_final))}")

# --- Model Building ---

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# Step 6: Create a machine learning pipeline (TF-IDF + Logistic Regression)
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), max_features=5000),  # Text feature extraction
    LogisticRegression(max_iter=300, solver='lbfgs', multi_class='multinomial')  # Classifier
)

# Step 7: Train the model
pipeline.fit(X_train, y_train)

# --- Model Evaluation ---

# Step 8: Predict on the test set and print classification metrics
y_pred = pipeline.predict(X_test)
print("\nCareer trajectory model performance:")
print(classification_report(y_test, y_pred))

# --- Prediction on Full Data ---

# Step 9: Function to predict next job given a career history
def get_next_prediction(traj_list):
    if not traj_list or not isinstance(traj_list, list) or len(traj_list) < 1:
        return "Not enough history"
    input_str = " > ".join(traj_list)
    return pipeline.predict([input_str])[0]

# Step 10: Apply prediction function to the original DataFrame
df["predicted_next_title"] = df["career_trajectory"].apply(get_next_prediction)

# Step 11: View first few predictions
df[["career_trajectory", "predicted_next_title"]].head()


🔹 Total samples before cleaning: 31864
Original: O Bachelor of Science in Accounting Richland College
Cleaned:  Accounting/Finance

Original: Certified Financial Modeling and Valuation Analyst
Cleaned:  Certified Financial Modeling And Valuation Analyst

Original: Fellow Chartered Accountant (2011 - Present)
Cleaned:  Fellow Chartered Accountant Present

🔹 Total samples after filtering: 29285
Samples after limiting to top 200 titles: 16582
Unique cleaned job titles: 200
Training samples: 13265
Test samples: 3317





Career trajectory model performance:
                                                                                                                     precision    recall  f1-score   support

                                                                                                                          0.00      0.00      0.00         3
                                                                                                                01/       0.25      0.08      0.12        52
                                                                                                                02/       0.00      0.00      0.00        21
                                                                                                                03/       0.00      0.00      0.00        35
                                                                                                                04/       0.00      0.00      0.00        28
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,career_trajectory,predicted_next_title
0,[OMBA - Executive Leadership University of Tex...,Accounting/Finance
1,[],Not enough history
2,"[Bachelor Of Commerce, Masters in Business Adm...",Business
3,"[Accountant I, Cobb & Associates, Association ...",Accounting/Finance
4,"[MBA Accounting 2012, Bachelor of Arts Account...",Business
