# Parsing and preparing set of tags by categories

## Imports

In [1]:
import pandas as pd
df_resume_data = pd.read_csv("resume_data.csv")
df_dating_app_dataset = pd.read_csv("Dating App Dataset.csv")
df_resume = pd.read_csv("Resume.csv")
df_updated_resume = pd.read_csv("UpdatedResumeDataSet.csv")

In [2]:
tags_list = []

### resume_data processing

In [3]:
df_resume_data.head(5)

Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,﻿job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,,Big data analytics working and database wareho...,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],...,,,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.85
1,,Fresher looking to join as a data analyst and ...,"['Data Analysis', 'Data Analytics', 'Business ...","['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],...,,,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.75
2,,,"['Software Development', 'Machine Learning', '...","['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],...,,,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667
3,,To obtain a position in a fast-paced business ...,"['accounts payables', 'accounts receivables', ...","['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...",...,,,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.76
4,,Professional accountant with an outstanding wo...,"['Analytical reasoning', 'Compliance testing k...",['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...",...,[None],[None],"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.65


skills, major_field_of_studies, related_skils_in_job

In [4]:
import ast

unique_tags_skills = set()
unique_tags_majors = set()

for idx, row in df_resume_data.iterrows():
    try:
        skills = ast.literal_eval(row['skills']) if pd.notnull(row['skills']) else []
        unique_tags_skills.update([s.strip().lower() for s in skills if len(s.strip()) <= 20])
    except Exception:
        pass

    try:
        majors = ast.literal_eval(row['major_field_of_studies']) if pd.notnull(row['major_field_of_studies']) else []
        unique_tags_majors.update([m.strip().lower() for m in majors if len(m.strip()) <= 20])
    except Exception:
        pass

    try:
        related = ast.literal_eval(row['related_skils_in_job']) if pd.notnull(row['related_skils_in_job']) else []
        for item in related:
            if isinstance(item, list):
                unique_tags_skills.update([str(s).strip().lower() for s in item if len(str(s).strip()) <= 20])
            elif isinstance(item, str):
                if len(item.strip()) <= 20:
                    unique_tags_skills.add(item.strip().lower())
    except Exception:
        pass

## Resume.csv
only category, also this csv, after correction can be used for model learning

In [5]:
unique_tags_job_categories = set([tag.strip().lower() for tag in df_resume['Category'].dropna().unique() if len(tag.strip()) <= 20])

## UpdatedResumeDataSet.csv

Also only categories, but also can be used for model learning

In [6]:
unique_tags_job_categories.update([tag.strip().lower() for tag in df_updated_resume['Category'].dropna().unique() if len(tag.strip()) <= 20])

## Dating app dataset
Interests column only

In [8]:
unique_tags_interests = set()
for idx, row in df_dating_app_dataset.iterrows():
    try:
        interests = ast.literal_eval(row['Interests']) if pd.notnull(row['Interests']) else []
        unique_tags_interests.update([s.strip().lower() for s in interests if len(s.strip()) <= 20])
    except Exception:
        pass

In [9]:
unique_tags_interests

{'cooking', 'hiking', 'movies', 'music', 'reading', 'sports', 'travel'}

## All unique tags, from different categories

In [10]:
print("length of unique tags in skills: ", len(unique_tags_skills))
print("length of unique tags in majors: ", len(unique_tags_majors))
print("length of unique tags in job categories: ", len(unique_tags_job_categories))
print("length of unique tags in interests: ", len(unique_tags_interests))

length of unique tags in skills:  2862
length of unique tags in majors:  86
length of unique tags in job categories:  42
length of unique tags in interests:  7


## A lot of skills, we will group it clusters based on embeddings

In [12]:
tags = list(unique_tags_skills)

In [13]:
tags = [t for t in tags if isinstance(t, str) and t.strip()]

In [14]:
tags

['router',
 'of sales',
 'needs analysis',
 'kronos',
 'electronics',
 'design engineer',
 'equipment inventory',
 'web developer',
 'sublime 2',
 'monthly sales',
 'team supervision',
 'phone',
 'user support',
 'eda',
 'pro e',
 'direction',
 'venue',
 'cold calling',
 'cost accounting',
 'cash movement',
 'theoretical research',
 'ngs systems',
 'marketing plan',
 'budget forecasting',
 'quality control',
 'game theory',
 'lucidchart',
 'pillow',
 'iso 14001',
 'strategic plan',
 'repairs',
 'anytime scheduler',
 'alteryx',
 'forecasts',
 'memtest86',
 'cerner',
 'calibrations',
 'multimedia',
 '5s expert',
 'issue resolution',
 'cd',
 'compound synthesis',
 'appfolio',
 'webscrapping',
 'vendor pay system',
 'credit cards',
 'dnn',
 'managed care',
 'switch',
 'audio editing',
 'windows nt',
 'white box testing',
 'customer relations',
 'marketing department',
 'guided mechanisms',
 'sales processes',
 'compensation/payroll',
 'e-mail',
 'budget 5',
 'job plan library',
 'business 

## Model

In [15]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np


model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(tags)


  from .autonotebook import tqdm as notebook_tqdm


## Lets try different clustering algorithms

### DBSCAN

In [16]:
from sklearn.cluster import DBSCAN

clustering = DBSCAN(eps=0.4, min_samples=2, metric='cosine')
labels = clustering.fit_predict(embeddings)

from collections import defaultdict
clusters = defaultdict(list)
for tag, label in zip(tags, labels):
    clusters[label].append(tag)

for cluster_id in sorted(clusters.keys()):
    print(f"Cluster {cluster_id}: {clusters[cluster_id]}")

Cluster -1: ['kronos', 'sublime 2', 'venue', 'cold calling', 'ngs systems', 'game theory', 'lucidchart', 'pillow', 'alteryx', 'appfolio', 'webscrapping', 'guided mechanisms', 'job plan library', 'ssl', 'taguchi analysis', 'accountable for', 'transmission', 'firmware development', 'geocoding', 'gearbox design', 'haskell', 'matlab', 'caffe 2', 'vpn', 'dvb', 'jira', 'snowflake', 'conferences', 'audience', 'council', 'correspondence', 'file maps', 'water treatment', 'cbmet', 'opening new accounts', 'keane care', 'gd&t', 'promis software', 'anaconda', 'facet', 'sonet', 'approach', 'newsletter', 'xpert', 'lucene', 'macro media director', 'catalog', 'kotlin', 'energetic', 'dialysis', 'yolo', 'gr&r practices', '.ney(vb, asp)', 'gdb', 'white papers', 'mail merging', 'xamarin studio', 'next', 'bounded rationality', 'dvp&r', 'procedure writing', 'mlops', 'weighing', '3-d noise mapping', 'secretarial', 'permit acquisition', 'geopak', 'microwave', 'backup', 'marconi', 'easy power', 'satellite image

  ret = a @ b
  ret = a @ b
  ret = a @ b


### Aglomerative Clustering

In [17]:
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0.65, metric='cosine', linkage='average')
labels = clustering.fit_predict(embeddings)


clusters = defaultdict(list)
for tag, label in zip(tags, labels):
    clusters[label].append(tag)

for cluster_id in sorted(clusters.keys()):
    print(f"Cluster {cluster_id}: {clusters[cluster_id]}")

Cluster 0: ['multimedia', 'audio editing', 'film', 'camera', 'videoscribe software', 'video conferencing', 'dvd', 'video', 'opencv', 'mpeg', 'mp4', 'h.264', 'mpeg-dash', 'video effects', 'image', 'graphic', 'video editing', 'oculus vr', 'recording']
Cluster 1: ['xpert', 'wap', 'eit', 'echimp', 'etap', 'owasp zap']
Cluster 2: ['cash movement', 'audit', 'petty cash', 'financial reports', 'banking', 'accounting manager', 'financial statements', 'accounting oversight', 'corporate finance', 'fund accounting', 'cash', 'accounts payable', 'public accounting', 'payables', 'daily cash receipts', 'financials', 'financial files', 'accountant', 'financial controls', 'accounting documents', 'interest payable', 'audit compliance', 'auditing services', 'finance', 'billing', 'cash counts', 'accounting system', 'financial accounting', 'financial department', 'financial training', 'auditing', 'financial planner', 'handling cash', 'cash flow', 'cash register', 'financial records', 'accounting duties', 'f

So we grouped using embeddings and aglomerative clustering, since it does not characterize different points as noise (they are like several clusters). Lets get categories

In [18]:
import transformers
print(transformers.__version__)

4.52.4


In [20]:
from transformers import pipeline
generator = pipeline(
    "text-generation",
    model="microsoft/phi-2",
    device_map="cpu"
)

def get_cluster_name(tags):
    prompt = (
        "Instruct: Given a set of professional skills: "
        + ", ".join(tags)
        + ". Summarize what this person knows or can do using one general word or a short phrase (no more than 2-3 words). Do not list the skills. Only summarize. Do not include any introductory phrases.\nOutput:"
    )
    
    result = generator(prompt, max_new_tokens=8, do_sample=True, temperature=0.3)
    name = result[0]['generated_text'].split("Output:")[-1].strip()
    while len(name.split(" ")) > 4:
        print(f"Generated name '{name}' is too long, regenerating...")
        prompt = (
            "Instruct: Given a set of professional skills: "
            + ", ".join(tags)
            + ". Summarize what this person knows or can do using one general word or a short phrase (no more than 2-3 words). Do not list the skills. Only summarize. Do not include any introductory phrases.\nOutput:"
        )
        result = generator(prompt, max_new_tokens=8, do_sample=True, temperature=0.3)
        name = result[0]['generated_text'].split("Output:")[-1].strip()
    return name

        

cluster_names = {}
for cluster_id in sorted(clusters.keys()):
    print(f"Generating name for cluster {cluster_id} with tags: {clusters[cluster_id]}")
    cluster_name = get_cluster_name(clusters[cluster_id])
    print(f"Cluster {cluster_id}: {clusters[cluster_id]} -> Cluster Name: {cluster_name}")
    cluster_names[cluster_id] = cluster_name

Loading checkpoint shards:   0%|          | 0/2 [00:06<?, ?it/s]


KeyboardInterrupt: 

## All in

In [None]:
def load_tags(filename):
    with open(filename, "r") as f:
        tags = set(line.strip().lower() for line in f if line.strip())
    return tags

skills = load_tags("datasets/skills_cluster_names.txt")
majors = load_tags("datasets/unique_tags_majors.txt")
job_categories = load_tags("datasets/unique_tags_job_categories.txt")
interests = load_tags("datasets/unique_tags_interests.txt")

all_tags = skills | majors | job_categories | interests
all_tags_list = sorted(all_tags)

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(all_tags_list)

In [27]:

clustering = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.48,
    metric='cosine',
    linkage='average'
)
labels = clustering.fit_predict(embeddings)


clusters = defaultdict(list)
for tag, label in zip(all_tags_list, labels):
    clusters[label].append(tag)

for cluster_id in sorted(clusters.keys()):
    print(f"Cluster {cluster_id}: {clusters[cluster_id]}")

Cluster 0: ['bio-tech', 'cell technology', 'medical technology']
Cluster 1: ['circuit simulation', 'process simulation']
Cluster 2: ['broadcast media', 'digital-media', 'multimedia production']
Cluster 3: ['technical documentation', 'technical requests', 'technical support', 'technical writing']
Cluster 4: ['erp software', 'erp systems', 'sap', 'sap developer', 'sap erp']
Cluster 5: ['chemical', 'chemical engineering', 'chemical synthesis', 'chemical technician']
Cluster 6: ['nlp models', 'nlp tools', 'text analysis', 'text processing']
Cluster 7: ['data migration', 'data transfer', 'file transfer']
Cluster 8: ['coaching', 'sports']
Cluster 9: ['cybersecurity', 'firewall security', 'network security', 'security software']
Cluster 10: ['event coordination', 'event planning', 'planning systems']
Cluster 11: ['hvac systems', 'power systems', 'utility systems']
Cluster 12: ['itil framework', 'itsm tools']
Cluster 13: ['data retrieval', 'indexing', 'search technologies']
Cluster 14: ['fiber

## After using llm model (microsoft/phi-2) created categories for clusters, u can find in parsed_tags