In [1]:
pip install sentence-transformers torch-directml pandas scikit-learn ace_tools yake dask


Collecting torch-directml
  Downloading torch_directml-0.2.5.dev240914-cp311-cp311-win_amd64.whl.metadata (6.2 kB)
Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.4.1-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting torchvision==0.19.1 (from torch-directml)
  Downloading torchvision-0.19.1-cp311-cp311-win_amd64.whl.metadata (6.1 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading torch_directml-0.2.5.dev240914-cp311-cp311-win_amd64.whl (9.0 MB)
   ---------------------------------------- 0.0/9.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.0 MB 3.4 MB/s eta 0:00:03
    --------------------------------------- 0.2/9.0 MB 2.6 MB/s eta 0:00:04
   -

In [2]:
import torch_directml
import torch

# Create a DirectML device
dml_device = torch_directml.device()
print("DirectML device:", dml_device)

# Create a tensor on the DirectML device
x = torch.tensor([1.0, 2.0, 3.0], device=dml_device)
print("Tensor on DirectML device:", x)

# Simple computation
y = x * 2
print("Computed tensor:", y)


DirectML device: privateuseone:0
Tensor on DirectML device: tensor([1., 2., 3.], device='privateuseone:0')
Computed tensor: tensor([2., 4., 6.], device='privateuseone:0')


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import yake
import dask.dataframe as dd

# Load resumes & jobs
resumes_df = pd.read_csv('combined_resume_data.csv')
jobs_df = pd.read_csv('computing_desc_job_posting.csv')

# Convert Pandas DataFrames to Dask DataFrames
d_resumes = dd.from_pandas(resumes_df, npartitions=20)
d_jobs = dd.from_pandas(jobs_df, npartitions=20)

# Ensure required columns exist
assert "raw_text" in resumes_df.columns, "Missing 'resume_text' in resumes.csv"
assert "description" in jobs_df.columns, "Missing 'description' in jobs.csv"

# Use TF-IDF for feature extraction
def extract_keywords_tfidf(text, top_n=10):
    vectorizer = TfidfVectorizer(stop_words="english", max_features=50)
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    keywords = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
    return [word for word, _ in keywords[:top_n]]

# Use YAKE for keyword extraction
def extract_keywords_yake(text, top_n=10):
    kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=top_n)
    keywords = kw_extractor.extract_keywords(text)
    return [kw[0] for kw in keywords]

# Apply keyword extraction functions in parallel for resumes
d_resumes["keywords"] = d_resumes["raw_text"].apply(
    lambda x: extract_keywords_tfidf(x) + extract_keywords_yake(x),
    meta=('keywords', 'object')
)

# For job postings
d_jobs["keywords"] = d_jobs["description"].apply(
    lambda x: extract_keywords_tfidf(x) + extract_keywords_yake(x),
    meta=('keywords', 'object')
)


# ---- ADD DUMMY KEY FOR CROSS JOIN BELOW ----
d_resumes["key"] = 1
d_jobs["key"] = 1

# Perform the cross join (Cartesian product) using the dummy key
d_cross = d_resumes.merge(d_jobs, on="key", suffixes=("_resume", "_job"))

# Now, d_cross contains every combination of a resume and a job posting.
# You can then proceed to compute similarities on d_cross.

# For example, you could show the first few rows to verify:
print(d_cross.head().compute())


KeyboardInterrupt: 

In [21]:
print(d_cross)

Dask DataFrame Structure:
               raw_text    role clean_text source_file keywords_resume    key job_id company_name   title description max_salary pay_period location company_id    views med_salary min_salary formatted_work_type  applies original_listed_time remote_allowed job_posting_url application_url application_type   expiry closed_time formatted_experience_level skills_desc listed_time posting_domain sponsored work_type currency compensation_type normalized_salary zip_code     fips matched_category standardized_salary description_length cleaned_description keywords_job
npartitions=20                                                                                                                                                                                                                                                                                                                                                                                                            

In [3]:
print("Resumes shape:", resumes_df.shape)
print("Jobs shape:", jobs_df.shape)

NameError: name 'resumes_df' is not defined

In [47]:
print(resumes_df.head())
print(jobs_df.head())

                                            raw_text                    role  \
0  C:\Workspace\java\scrape_indeed\dba_part_1\1.h...  Database Administrator   
1  C:\Workspace\java\scrape_indeed\dba_part_1\10....  Database Administrator   
2  C:\Workspace\java\scrape_indeed\dba_part_1\100...  Database Administrator   
3  C:\Workspace\java\scrape_indeed\dba_part_1\100...  Database Administrator   
4  C:\Workspace\java\scrape_indeed\dba_part_1\100...  Database Administrator   

                                          clean_text  \
0  database administrator database administrator ...   
1  database administrator sql microsoft powerpoin...   
2  oracle database administrator oracle database ...   
3  amazon redshift administrator and etl develope...   
4  scrum master oracle database administrator scr...   

                  source_file  
0  Database_Administrator.csv  
1  Database_Administrator.csv  
2  Database_Administrator.csv  
3  Database_Administrator.csv  
4  Database_Administra

In [31]:
print(d_resumes[['raw_text', 'keywords']].head())
print(d_jobs[['description', 'keywords']].head())

                                            raw_text  \
0  C:\Workspace\java\scrape_indeed\dba_part_1\1.h...   
1  C:\Workspace\java\scrape_indeed\dba_part_1\10....   
2  C:\Workspace\java\scrape_indeed\dba_part_1\100...   
3  C:\Workspace\java\scrape_indeed\dba_part_1\100...   
4  C:\Workspace\java\scrape_indeed\dba_part_1\100...   

                                            keywords  
0  [sql, database, server, databases, backup, 200...  
1  [database, microsoft, administrator, sql, mana...  
2  [oracle, database, databases, experience, 11g,...  
3  [database, data, years, administrator, etl, re...  
4  [oracle, scrum, database, databases, master, t...  
                                         description  \
0  PGAV Destinations is seeking a self-motivated ...   
1  A leading pharmaceutical company committed to ...   
2  Education Bachelor's degree in software, math,...   
3  Job Description:GOYT is seeking a skilled and ...   
4  Are you driven by the thrill of solving proble... 

In [49]:
num_rows = d_cross.shape[0].compute()
print("Number of rows:", num_rows)

Number of rows: 396106782


In [None]:
import pandas as pd
import dask.dataframe as dd
import gc
import numpy as np
from dask.diagnostics import ProgressBar

# Define jaccard_similarity function
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0.0

# Function to process a pandas DataFrame chunk
def process_df_chunk(df):
    """Process a pandas DataFrame chunk."""
    # If the DataFrame is empty, return an empty DataFrame with correct columns
    if df.empty:
        return pd.DataFrame(columns=["resume", "job_posting", "similarity_score", "label"])
    
    # Create a copy to ensure we don't modify the original
    df = df.copy()
    
    # Calculate Jaccard similarity one row at a time
    similarities = []
    for _, row in df.iterrows():
        # Convert keywords to sets
        resume_kws = set(row["keywords_resume"].split() if isinstance(row["keywords_resume"], str) else row["keywords_resume"])
        job_kws = set(row["keywords_job"].split() if isinstance(row["keywords_job"], str) else row["keywords_job"])
        
        # Calculate similarity
        sim = jaccard_similarity(resume_kws, job_kws)
        similarities.append(sim)
    
    # Add the similarity column
    df["jaccard_similarity"] = similarities
    
    # Create binary label
    df["label"] = df["jaccard_similarity"].apply(lambda x: 1 if x > 0.2 else 0)
    
    # Select and rename columns
    result = df[[
        "raw_text", "description", "jaccard_similarity", "label"
    ]].rename(columns={
        "raw_text": "resume",
        "description": "job_posting", 
        "jaccard_similarity": "similarity_score"
    })
    
    return result

# Process data in chunks
def process_data(d_cross):
    """Process the data in chunks to avoid memory issues."""
    print("Examining dataset structure...")
    
    # Get column names to verify structure
    try:
        columns = d_cross.columns
        print(f"Columns in dataset: {columns}")
        
        # Find a non-empty partition for testing
        found_non_empty = False
        for i in range(min(10, d_cross.npartitions)):
            sample = d_cross.get_partition(i).head(1, compute=True)
            if not sample.empty:
                print(f"Found non-empty partition at index {i}")
                found_non_empty = True
                break
        
        if not found_non_empty:
            print("Warning: First 10 partitions are empty. Dataset might be empty or incorrectly structured.")
            
        # Check total size
        print(f"Number of partitions: {d_cross.npartitions}")
    except Exception as e:
        print(f"Error examining dataset: {str(e)}")
    
    # Process partitions
    results = []
    total_partitions = d_cross.npartitions
    
    print(f"Processing {total_partitions} partitions...")
    
    with ProgressBar():
        for i in range(total_partitions):
            try:
                print(f"Processing partition {i+1}/{total_partitions}")
                
                # Get a single partition
                partition = d_cross.get_partition(i)
                
                # Compute the partition to pandas DataFrame
                df_partition = partition.compute()
                
                # Skip empty partitions
                if df_partition.empty:
                    print(f"Partition {i+1} is empty, skipping...")
                    continue
                
                # Process the pandas DataFrame
                processed_df = process_df_chunk(df_partition)
                
                # Skip if result is empty
                if processed_df.empty:
                    print(f"Result for partition {i+1} is empty, skipping...")
                    continue
                
                # Save this chunk
                chunk_filename = f"ground_truth_keywords_part_{i}.csv"
                processed_df.to_csv(chunk_filename, index=False)
                print(f"Saved partition {i+1} to {chunk_filename}")
                
                # Add to results list
                results.append(chunk_filename)
                
                # Clean up
                del df_partition
                del processed_df
                gc.collect()
                
            except Exception as e:
                print(f"Error processing partition {i+1}: {str(e)}")
                continue
    
    # Combine all results
    print("Combining results...")
    
    if not results:
        print("No results to combine! All partitions were empty or failed.")
        return pd.DataFrame(columns=["resume", "job_posting", "similarity_score", "label"])
    
    # Combine files incrementally
    combined_df = pd.read_csv(results[0])
    
    for i, file in enumerate(results[1:], 1):
        print(f"Adding file {i+1}/{len(results)}...")
        try:
            chunk = pd.read_csv(file)
            combined_df = pd.concat([combined_df, chunk], ignore_index=True)
            del chunk
            gc.collect()
        except Exception as e:
            print(f"Error adding file {file}: {str(e)}")
    
    # Save final result
    final_filename = "ground_truth_keywords.csv"
    combined_df.to_csv(final_filename, index=False)
    print(f"Saved final dataset to {final_filename} with {len(combined_df)} rows")
    
    return combined_df

# Diagnose the dataset
def diagnose_dataset(d_cross):
    """Diagnose issues with the dataset."""
    print("Diagnosing dataset issues...")
    
    # Check if dataset is empty
    try:
        # Try to get the first non-empty row
        sample = d_cross.head(1, compute=True)
        if sample.empty:
            print("Dataset appears to be empty!")
        else:
            print("Dataset has data. First row:")
            print(sample)
    except Exception as e:
        print(f"Error checking dataset: {str(e)}")
    
    # Check dataset structure
    try:
        # Get column names
        columns = d_cross.columns
        print(f"Dataset columns: {columns}")
        
        # Check if required columns exist
        required_columns = ["keywords_resume", "keywords_job", "raw_text", "description"]
        missing_columns = [col for col in required_columns if col not in columns]
        
        if missing_columns:
            print(f"Missing required columns: {missing_columns}")
        else:
            print("All required columns present.")
    except Exception as e:
        print(f"Error checking columns: {str(e)}")
    
    # Check number of partitions
    try:
        num_partitions = d_cross.npartitions
        print(f"Number of partitions: {num_partitions}")
    except Exception as e:
        print(f"Error checking partitions: {str(e)}")
    
    # Try to get size info
    try:
        # Try to get length - this might be slow for large datasets
        size = len(d_cross)
        print(f"Dataset size: {size} rows")
    except:
        print("Cannot determine dataset size.")

# Run the diagnosis first
diagnose_dataset(d_cross)

# Now process the data
print("\nProcessing dataset...")
train_df = process_data(d_cross)

# Display sample of results
print("\nSample of processed data:")
print(train_df.head())

Diagnosing dataset issues...




Dataset appears to be empty!
Dataset columns: Index(['raw_text', 'role', 'clean_text', 'source_file', 'keywords_resume',
       'key', 'job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips',
       'matched_category', 'standardized_salary', 'description_length',
       'cleaned_description', 'keywords_job'],
      dtype='object')
All required columns present.
Number of partitions: 20
Dataset size: 396106782 rows

Processing dataset...
Examining dataset structure...
Columns in dataset: Index(['raw_text', 'role', 'clean_text'

In [None]:
print(d_train.head())

In [78]:
print(d_train)
print(d_train['similarity_score'].head())

Dask DataFrame Structure:
                resume job_posting similarity_score  label
npartitions=20                                            
                string      string          float64  int32
                   ...         ...              ...    ...
...                ...         ...              ...    ...
                   ...         ...              ...    ...
                   ...         ...              ...    ...
Dask Name: operation, 63 expressions
Expr=RenameFrame(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Merge(a001357)))))))))))))))))))))[['raw_text', 'description', 'jaccard_similarity', 'label']], columns={'raw_text': 'resume', 'description': 'job_posting', 'jaccard_similarity': 'similarity_score'})




ValueError: Cannot set a DataFrame with multiple columns to the single column jaccard_similarity

In [None]:
#refines training data set by computing similarity score between each resume and job using pretrained bert
#deeper semantic meaning, improved label quality

from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load pre-trained BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def bert_similarity(resume_text, job_text):
    """Compute cosine similarity between resume & job using BERT"""
    resume_embedding = model.encode(resume_text, convert_to_tensor=True)
    job_embedding = model.encode(job_text, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(resume_embedding, job_embedding).item()
    return similarity_score

# Apply BERT-based similarity
train_df["bert_similarity"] = train_df.apply(lambda row: bert_similarity(row["resume"], row["job_posting"]), axis=1)

# Convert BERT similarity to binary label
train_df["label"] = (train_df["bert_similarity"] > 0.7).astype(int)  # Threshold for a match

# Save refined dataset
train_df.to_csv("ground_truth_bert.csv", index=False)

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Load labeled dataset
df = pd.read_csv('ground_truth_bert.csv')

# Ensure correct column names
assert "resume" in df.columns and "job_posting" in df.columns and "bert_similarity" in df.columns, "Columns missing in dataset!"

# Convert dataset into SentenceTransformer format
train_examples = [
    InputExample(texts=[row["resume"], row["job_posting"]], label=float(row["bert_similarity"]))
    for _, row in df.iterrows()
]

# Define DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Load pre-trained BERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Define loss function (CosineSimilarityLoss)
train_loss = losses.CosineSimilarityLoss(model)

# Fine-tune BERT
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=3, warmup_steps=100)

# Save fine-tuned model
model.save("fine_tuned_bert_resume_job")
print("✅ Fine-tuned BERT model saved at 'fine_tuned_bert_resume_job'")


In [None]:
import pickle

# Load fine-tuned model
model = SentenceTransformer('fine_tuned_bert_resume_job')

# Load job postings
jobs_df = pd.read_csv('computing_desc_job_posting.csv')

# Compute and store job embeddings
jobs_df['job_embedding'] = jobs_df['description'].apply(lambda x: model.encode(x))

# Save job embeddings for future use
with open('job_embeddings.pkl', 'wb') as f:
    pickle.dump(jobs_df[['title', 'description', 'job_embedding']], f)

print("✅ Job embeddings saved successfully!")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def match_resume_to_jobs(resume_text, top_n=10):
    """
    Given a new resume, find the top N matching job postings.
    """
    # Load stored job embeddings
    with open('job_embeddings.pkl', 'rb') as f:
        job_data = pickle.load(f)
    
    # Load trained model
    model = SentenceTransformer('fine_tuned_bert_resume_job')

    # Compute embedding for new resume
    resume_embedding = model.encode([resume_text])

    # Extract stored job embeddings
    job_embeddings = np.stack(job_data['job_embedding'].values)

    # Compute cosine similarity
    similarities = cosine_similarity(resume_embedding, job_embeddings).flatten()

    # Add similarity scores to job postings
    job_data['Similarity Score'] = similarities

    # Get top N matches
    top_jobs = job_data.nlargest(top_n, 'Similarity Score')

    return top_jobs


In [None]:
# Example new resume
new_resume = """
Experienced software engineer with strong skills in Python, machine learning, 
and cloud computing. Worked on large-scale data processing and model deployment.
"""

# Get the top 10 matching jobs
top_matches = match_resume_to_jobs(new_resume)

# Display results
import ace_tools as tools
tools.display_dataframe_to_user(name="Top 10 Matching Job Postings", dataframe=top_matches)
