In [1]:
import requests
import pandas as pd
import pickle 
import re
from tqdm import tqdm 

In [2]:
def read_pickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.

    WARNING: ``pickle.load`` can execute arbitrary code while unpickling, so
    only call this on files you created yourself or otherwise trust.
    """
    with open(path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload
def get_title_esco(text):
    """Return the stripped contents of the first ``<title>...</title>`` pair in ``text``.

    The tags are matched case-insensitively and the contents may span several
    lines.  The opening tag must be a bare ``<title>`` (no attributes).

    Returns:
        The text between the tags with surrounding whitespace removed, or
        ``None`` (after printing "Title not found.") when no pair is present.
    """
    found = re.search(r'<title>(.*?)</title>', text, flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        print("Title not found.")
        return None
    return found.group(1).strip()

## Load data from JobBERT-evaluation-dataset

In [3]:
# Directory containing the JobBERT evaluation CSVs.  The default is the
# location this notebook was originally run from; edit this single constant to
# point the notebook at a different checkout.
DATA_DIR = '/home/nicky/project/JDAN-acl-2024-github/data'

# The two splits used by the cells below.  The scraping cell reads the
# `vacancyTitle` and `conceptUri` columns from each frame.
val = pd.read_csv(f'{DATA_DIR}/titles.csv')
test = pd.read_csv(f'{DATA_DIR}/titles.test.csv')

In [4]:
# Show the first three rows of `test` as a quick look at its columns.
test.head(n=3)

Unnamed: 0.1,Unnamed: 0,vacancyTitle,conceptUri
0,0,Marketing,http://data.europa.eu/esco/occupation/47e81c7f...
1,1,Marketing Advisor,http://data.europa.eu/esco/occupation/47e81c7f...
2,2,Admin Marketing,http://data.europa.eu/esco/occupation/47e81c7f...


## Scraping the corresponding label of each job title from the ESCO website

In [None]:
# Seconds to wait for the ESCO server before giving up on a single request.
# Without a timeout `requests.get` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_S = 30


def _scrape_labels(frame, timeout=REQUEST_TIMEOUT_S):
    """Fetch the ESCO page title for every row of ``frame``.

    For each row the last path segment of ``conceptUri`` is appended to the
    ESCO occupation URL, the page is downloaded, and the part of its
    ``<title>`` before the first ``|`` is taken as the label.

    Args:
        frame: DataFrame with ``vacancyTitle`` and ``conceptUri`` columns.
        timeout: Per-request timeout in seconds.

    Returns:
        A list of ``[job_title, label]`` pairs, one per successfully scraped
        row.  Rows that fail are skipped (best effort): their positional index
        and the reason are printed so they can be retried.
    """
    pairs = []
    for i in tqdm(range(len(frame))):
        try:
            row = frame.iloc[i]
            jobtitle = row['vacancyTitle']
            concept_id = row['conceptUri'].split('/')[-1]
            esco_url = f'https://esco.ec.europa.eu/en/classification/occupation?uri=http%3A%2F%2Fdata.europa.eu%2Fesco%2Foccupation%2F{concept_id}'
            response = requests.get(esco_url, timeout=timeout)
            # Fail on 4xx/5xx so the title of an error page is never stored as a label.
            response.raise_for_status()
            title = get_title_esco(response.text)
            if title is None:
                raise ValueError('no <title> element in the ESCO response')
            label = title.split('|')[0]
            pairs.append([jobtitle, label.strip()])
        except Exception as exc:
            # `Exception` (not a bare `except`) so Ctrl-C / kernel interrupt
            # still stops the long-running scrape.
            print(i, repr(exc))
    return pairs


val_labels = _scrape_labels(val)
test_labels = _scrape_labels(test)


## Convert each job title and its corresponding label to the retrieval format (used later with the pytrec_eval library)

In [None]:
# The scraping cell leaves `val_labels` / `test_labels` as lists of
# [job_title, label] pairs.  (`val` / `test` are the raw DataFrames; iterating
# those yields column names, not pairs.)  Keep the pairs under their own names
# because `val_labels` / `test_labels` are rebound below to the unique labels.
val_pairs = list(val_labels)
test_pairs = list(test_labels)

# The rebinding below makes this cell non-idempotent: on a second run the
# inputs would already be plain label strings.  Fail loudly instead of
# silently mis-unpacking them.
if any(isinstance(item, str) for item in val_pairs + test_pairs):
    raise RuntimeError(
        'val_labels/test_labels no longer hold [job_title, label] pairs; '
        're-run the scraping cell before re-running this cell.'
    )


def _to_retrieval_format(pairs, labels):
    """Build the per-title relevance mapping and the ordered title list.

    Args:
        pairs: Iterable of ``[job_title, label]`` pairs.
        labels: Candidate labels to score every job title against.

    Returns:
        ``(relevance, job_titles)`` where ``relevance[job_title]`` maps every
        candidate label to 1 if it equals that title's label and 0 otherwise,
        and ``job_titles`` lists the titles in input order.  A title that
        occurs more than once appears repeatedly in ``job_titles`` but keeps
        only its last mapping in ``relevance``.
    """
    relevance, job_titles = dict(), []
    for job_title, label in pairs:
        relevance[job_title] = {candidate: int(candidate == label) for candidate in labels}
        job_titles.append(job_title)
    return relevance, job_titles


# Sorted unique non-empty labels per split.
val_labels = sorted({label for _, label in val_pairs if label})
test_labels = sorted({label for _, label in test_pairs if label})

val_rqel, val_jts = _to_retrieval_format(val_pairs, val_labels)
test_rqel, test_jts = _to_retrieval_format(test_pairs, test_labels)

## Save files

In [None]:
def _write_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# All outputs go to the relative `data/` directory, which must already exist
# (opening a file for writing does not create missing directories).
#
# NOTE(review): 'data/jobbert-val.pkl' and 'data/jobbert-val-labels.pkl' are
# written from the same object (`val_labels`), and likewise for the test pair.
# Confirm whether the first file of each pair was meant to hold something else.
_write_pickle(val_labels, 'data/jobbert-val.pkl')
_write_pickle(test_labels, 'data/jobbert-test.pkl')

# Per-title relevance dictionaries.
_write_pickle(val_rqel, 'data/jobbert-val-dict.pkl')
_write_pickle(test_rqel, 'data/jobbert-test-dict.pkl')

# Job-title lists.
_write_pickle(val_jts, 'data/jobbert-val-jt.pkl')
_write_pickle(test_jts, 'data/jobbert-test-jt.pkl')

# Label lists.
_write_pickle(val_labels, 'data/jobbert-val-labels.pkl')
_write_pickle(test_labels, 'data/jobbert-test-labels.pkl')