# Data preparation

In [2]:
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
import sys

import tiktoken
import openai

from langdetect import detect, detect_langs, DetectorFactory

from tqdm.asyncio import tqdm
from tqdm.asyncio import tqdm as atqdm

import logging
import asyncio
import aiofiles
import time
import json, ast
import re

from rapidfuzz import fuzz, utils
from collections import Counter
from itertools import chain


In [3]:
load_dotenv()
client = openai.OpenAI()
client_async = openai.AsyncOpenAI()

csv = pd.read_csv('./jobs_data_clean.csv')

sample_sections_test = csv.copy()
sample_sections_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer


## Determine the language of vacancies

### `langdetect` library

In [4]:
DetectorFactory.seed = 0  # We record the result so that there are no accidental changes

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"
    
def detect_language_with_confidence(text):
    try:
        lang_probs = detect_langs(text)
        if lang_probs:
            return str(lang_probs[0])  # Format: 'en:0.99'
    except:
        return "unknown"

#df["Language langdetect"] = df["Job Description"].apply(detect_language)
#df["Language langdetect confidence"] = df["Job Description"].apply(detect_language_with_confidence)

In [5]:
#df.to_csv('./jobs_data_langdetect.csv', index=False)
csv_1 = pd.read_csv('./jobs_data_langdetect.csv')
sample_sections_test = csv_1.copy()

In [6]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    # Derive column 'derivedCol' from column: 'Language langdetect confidence'
    # Transform based on the following examples:
    #    Language langdetect confidence    Output
    # 1: "en:0.9999973661975282"        => "0.9999973661975282"
    df.insert(20, "derivedCol", df["Language langdetect confidence"].str.split(":").str[-1])
    # Change column type to float64 for column: 'derivedCol'
    df = df.astype({'derivedCol': 'float64'})
    # Drop column: 'Language langdetect confidence'
    df = df.drop(columns=['Language langdetect confidence'])
    # Rename column 'derivedCol' to 'Language langdetect confidence'
    df = df.rename(columns={'derivedCol': 'Language langdetect confidence'})
    return df

sample_sections_test = clean_data(sample_sections_test.copy())
sample_sections_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language langdetect,Language langdetect confidence
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en,0.999997
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999996
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999994
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999997
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999996


### Confidence lower than 0.99

In [7]:
"""
Cell generated by Data Wrangler.
"""
def low_confidence(df):
    # Sort by column: 'Language langdetect confidence' (ascending)
    df = df.sort_values(['Language langdetect confidence'])
    # Filter rows based on column: 'Language langdetect confidence'
    df = df[df['Language langdetect confidence'] < 0.99]
    return df

not_enough_confidence = low_confidence(sample_sections_test.copy())
sample_sections_test = sample_sections_test.drop(not_enough_confidence.index)


In [8]:
"""
Cell generated by Data Wrangler.
"""
def clean_data1(df):
    # Drop column: 'Language langdetect confidence'
    df = df.drop(columns=['Language langdetect confidence'])
    # Rename column 'Language langdetect' to 'Language'
    df = df.rename(columns={'Language langdetect': 'Language'})
    return df

sample_sections_test = clean_data1(sample_sections_test.copy())
sample_sections_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en


### Chat GPT API

In [9]:
# Configure logging
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")

# Suppress OpenAI API HTTP logs
logging.getLogger("httpx").setLevel(logging.WARNING)


async def chatgpt_async(
    *,
    input_column_name=None,
    output_column_name=None,
    input_text_length=None,
    output_text_length=5,
    num_rows=None,
    df=None,
    system_prompt=None,
    user_prompt=None,
    gpt_model=None,
    client=None,
    batch_size=None,
    cache_file=None,
    concurrency_limit=10,
    max_retries=5,          # ⬅️ применяется только к фатальным ошибкам
    retry_delay=0.25,       # fallback-задержка
):
    """
    Обрабатывает DataFrame батчами, отправляя тексты в OpenAI-Chat-API
    и заполняя колонку с ответами.  Rate-limit-429 игнорируется —
    функция будет ждать столько, сколько попросит сервер, пока не получит ответ.
    """

    # ---------- ВАЛИДАЦИЯ ----------
    if df is None:
        raise ValueError("df (dataframe) must be provided.")
    if input_column_name is None:
        raise ValueError("input_column_name must be provided.")
    if input_column_name not in df.columns:
        raise ValueError(f"Column «{input_column_name}» нет в DataFrame.")
    if output_column_name is None:
        raise ValueError("output_column_name must be provided.")
    if gpt_model is None:
        raise ValueError("gpt_model must be provided.")
    if user_prompt is None:
        raise ValueError("user_prompt must be provided.")
    if client is None:
        raise ValueError("client must be provided.")

    # ---------- ПОДГОТОВКА ДАННЫХ ----------
    df = df.head(num_rows).copy() if num_rows else df.copy()
    if output_column_name not in df.columns:
        df[output_column_name] = ""

    # ---------- КЭШ ----------
    cache = {}
    if cache_file and os.path.exists(cache_file):
        async with aiofiles.open(cache_file, "r", encoding="utf-8") as f:
            try:
                cache = json.loads(await f.read())
            except json.JSONDecodeError:
                logging.warning("Кэш повреждён или пуст — продолжаем без него.")

    semaphore = asyncio.Semaphore(concurrency_limit)  # ограничиваем параллелизм

    # ---------- ХЕЛПЕР: классифицируем ошибку ----------
    def _is_transient(err_msg: str) -> bool:
        """True → временная (rate-limit / network), False → фатальная."""
        em = err_msg.lower()
        # типичные сигналы, что просто надо подождать
        transient_keys = [
            "rate limit", "please try again", "quota reached soon",
            "tokens per min", "requests per min", "rpm", "tpm",
            "server overloaded", "timeout", "connection reset"
        ]
        return any(k in em for k in transient_keys)

    # ---------- ОБРАБОТКА ОДНОЙ СТРОКИ ----------
    async def process_row(index, row):
        async with semaphore:
            source_text = row[input_column_name]
            truncated = (
                " ".join(source_text.split()[:input_text_length])
                if input_text_length else source_text
            )

            # ----- кэш -----
            if cache_file and truncated in cache:
                df.at[index, output_column_name] = cache[truncated]
                return

            prompt = f"{user_prompt} {truncated}"
            messages = [{"role": "user", "content": prompt}]
            if system_prompt:
                messages.insert(0, {"role": "system", "content": system_prompt})

            fatal_attempts = 0  # считаем только фатальные сбои

            while True:  # бесконечно крутимся, пока rate-limit не пропадёт
                try:
                    resp = await client.chat.completions.with_raw_response.create(
                        model=gpt_model,
                        messages=messages,
                        max_tokens=output_text_length,
                        temperature=0,
                    )
                    answer = resp.parse().choices[0].message.content

                    if cache_file:
                        cache[truncated] = answer
                    df.at[index, output_column_name] = answer
                    return  # ✅ успех

                except Exception as e:
                    msg = str(e)

                    if _is_transient(msg):
                        # ---------- ВРЕМЕННАЯ / RATE-LIMIT ----------
                        # вытаскиваем delay
                        wait = None
                        m = re.search(r"in\s+(\d+(?:\.\d+)?)\s*(ms|s)", msg, re.I)
                        if m:
                            val = float(m.group(1))
                            wait = val / 1000 if m.group(2).lower() == "ms" else val
                        elif getattr(e, "response", None):
                            ra = e.response.headers.get("Retry-After")
                            if ra:
                                try:
                                    wait = float(ra)
                                except ValueError:
                                    pass
                        if wait is None:
                            wait = retry_delay
                        wait += 0.005          # маленький буфер
                        await asyncio.sleep(wait)
                        # никаких логов — это штатная пауза
                        continue

                    # ---------- ФАТАЛЬНАЯ ОШИБКА ----------
                    fatal_attempts += 1
                    if fatal_attempts < max_retries:
                        await asyncio.sleep(retry_delay)
                        continue

                    logging.error(
                        f"Row {index}: unrecoverable error after {max_retries} tries → {msg}"
                    )
                    # ячейку НЕ трогаем (остаётся пустая)
                    return

    # ---------- ПАКЕТЫ ----------
    total_rows = len(df)
    if batch_size and batch_size < total_rows:
        idx_split = np.array_split(range(total_rows),
                                   total_rows // batch_size + 1)
        batches = [df.iloc[idx] for idx in idx_split]
    else:
        batches = [df]

    # ---------- ОБРАБОТКА БАТЧЕЙ ----------
    with atqdm(total=len(batches), desc="Processing Batches", leave=True) as bar:
        for batch in batches:
            await asyncio.gather(
                *[process_row(i, r) for i, r in batch.iterrows()]
            )

            # сохраняем кэш
            if cache_file:
                async with aiofiles.open(cache_file, "w", encoding="utf-8") as f:
                    await f.write(json.dumps(cache, ensure_ascii=False, indent=4))

            bar.update(1)

    return df

propmpt structure:
"Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.). Text:{cell text}"

### Dealing with not enought cinfidence

In [10]:
df_result = await chatgpt_async(
    input_column_name="Job Description", 
    output_column_name="Language gpt-4o-2024-11-20",
    input_text_length=None,
    output_text_length=1,
    num_rows=None,  
    df=not_enough_confidence.copy(), 
    user_prompt="Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.). Text:",
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=10,
    concurrency_limit=10,
    cache_file="./cache/language_cache_gpt-4o-2024-11-20.json"
)

Processing Batches: 100%|██████████| 2/2 [00:00<00:00, 41.69it/s]


In [11]:
df_result['Manual check'] = ["drop", "cs", "drop", "ru", "drop", "fr", "en", "pl", "en", "drop", "fr", "en", "pl", "sv", "zh", "fr"]
df_result['Drop'] = [True, False, True, False, True, False, False, False, False, True, False, False, False, False, False, False]

df_result_final = df_result.copy()
df_result_final = df_result_final.loc[~df_result_final['Drop']]

In [12]:
"""
Cell generated by Data Wrangler.
"""
def clean_data2(df_result_final):
    # Drop columns: 'Drop', 'Language langdetect' and 2 other columns
    df_result_final = df_result_final.drop(columns=['Drop', 'Language langdetect', 'Language langdetect confidence', 'Language gpt-4o-2024-11-20'])
    # Rename column 'Manual check' to 'Language'
    df_result_final = df_result_final.rename(columns={'Manual check': 'Language'})
    return df_result_final

df_result_final_clean = clean_data2(df_result_final.copy())
df_result_final_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
528,Czechia,Europe,True,True,default,google.com,Android Developer (part-time: 2h/week) @ Exper...,Experis Polska,Anywhere,"Jooble, Jobs Trabajo.org",O pozici / o projektu\r\nPůvodní popisek. Andr...,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciAocG...,2025-01-13 12:09:27 UTC,Android developer,cs
780,Germany,Europe,True,True,default,google.com,JUNIOR IOS DEVELOPER,Check24,Germany,Layboard,Требования 1-2 years of experience with iOS de...,,€4.5K a month,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJKVU5JT1IgSU9TIERFVkVMT1BFUi...,2025-01-13 12:22:03 UTC,iOS developer,ru
325,Canada,Northern America,False,False,default,google.com,"Développeur senior, Android / Senior Android D...",Cerence,"Montreal, Quebec, Canada","Indeed, Built In, Eluta.ca, Glassdoor, Adzuna,...",A Moving Experience.\r\n\r\n(English version b...,,,Full-time,,eyJqb2JfdGl0bGUiOiJEw6l2ZWxvcHBldXIgc2VuaW9yLC...,2025-01-13 12:09:10 UTC,Android developer,fr
2105,Sweden,Europe,True,True,default,google.com,Konsultuppdrag Ios and Android Developers - Of...,Senterprise,ستوكهولم، السويد,Emprego.pt,To one of our clients we are now looking for 1...,,,دوام كامل,,eyJqb2JfdGl0bGUiOiJLb25zdWx0dXBwZHJhZyBJb3MgYW...,2025-01-13 12:11:47 UTC,Android developer,en
1591,Poland,Europe,True,True,default,google.com,Android Developer,ALAN Systems,"Silesian Voivodeship, Poland",JobLeads,"Cześć odkrywco kodu!\r\n\r\nCzy jesteś gotowy,...",,PLN 180K–PLN 240K a year,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:11:12 UTC,Android developer,pl


### Merging

In [13]:
df_language = pd.concat([sample_sections_test, df_result_final_clean], ignore_index=True)
df_language = df_language.sort_values(['Location'])
df_language.reset_index(inplace=True, drop=True)
df_language.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",Lead Mobile Developer | Hyper Growth Startup |...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJNb2JpbGUgQXBwbGljYXRpb24gRG...,2025-01-13 12:08:51 UTC,Android developer,en
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,Overview:\r\nThe Android Developer – Kotlin po...,,,Contractor,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciDigJ...,2025-01-13 12:08:48 UTC,Android developer,en
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,Was sind deine Aufgaben?\r\n• Du gestaltest at...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",Do you want to push the frontier of digital se...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIFVwMyAoZi...,2025-01-13 12:20:39 UTC,iOS developer,en


## Translate Job Descriptions into English 

Original:
This is a [SRC] to [TGT]
translation, please provide
the [TGT] translation for these
sentences:


My version:
This is a [SRC] to [TGT]
translation, please provide
the [TGT] translation for this
job description:

In [14]:
df_language = df_language.copy()

df_language['Language'].unique()

array(['en', 'de', 'it', 'fr', 'nl', 'es', 'hr', 'ru', 'bg', 'cs', 'pl',
       'uk', 'da', 'fi', 'hu', 'lv', 'zh', 'pt', 'ro', 'sk', 'sl', 'sv'],
      dtype=object)

In [15]:
language_mapping = {
    'en': 'English',
    'de': 'German',
    'it': 'Italian',
    'fr': 'French',
    'nl': 'Dutch',
    'es': 'Spanish',
    'hr': 'Croatian',
    'ru': 'Russian',
    'bg': 'Bulgarian',
    'cs': 'Czech',
    'pl': 'Polish',
    'uk': 'Ukrainian',
    'da': 'Danish',
    'fi': 'Finnish',
    'hu': 'Hungarian',
    'lv': 'Latvian',
    'zh': 'Chinese',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'sv': 'Swedish'
}

df_language['Language Name'] = df_language['Language'].map(language_mapping)


In [16]:
async def translate_non_english_descriptions(df, language_col, job_desc_col, translated_col, client, gpt_model, language_mapping, batch_size=10, concurrency_limit=10, cache_file="./cache/job_description_english_cache.json"):
    """
    Translates non-English job descriptions into English using chatgpt_async().
    Copies English job descriptions directly without translation.

    Parameters:
    - df: DataFrame containing job descriptions and their languages.
    - language_col: Column name for ISO language codes.
    - job_desc_col: Column name for job descriptions.
    - translated_col: Column name where translated descriptions will be stored.
    - client: OpenAI async client instance.
    - gpt_model: GPT model name.
    - batch_size: Number of rows to process per batch.
    - concurrency_limit: Max concurrent API requests.
    - cache_file: Path to store cache.

    Returns:
    - df: Updated DataFrame with translated job descriptions.
    """
    if language_mapping is None:
        raise ValueError("language_mapping(key-value pairs) must be provided.")
    # Language mapping dictionary

    # Ensure the translated column exists
    if translated_col not in df.columns:
        df[translated_col] = ""

    # Step 1: Copy English job descriptions (skip API calls for them)
    df.loc[df[language_col] == "en", translated_col] = df.loc[df[language_col] == "en", job_desc_col]

    # Step 2: Filter only non-English rows that still need translation
    mask_non_english = (df[language_col] != "en") & (df[translated_col] == "")
    
    if not mask_non_english.any():
        return df  # Nothing to translate

    # Step 3: Generate dynamic prompts directly using .loc[] (avoiding SettingWithCopyWarning)
    df.loc[mask_non_english, "user_prompt"] = df.loc[mask_non_english, language_col].map(
        lambda lang: f"This is a {language_mapping.get(lang, 'Unknown')} to English translation, please provide the English translation for this job description:"
    )

    # Step 4: Run chatgpt_async() for translation
    df_translated = await chatgpt_async(
        input_column_name=job_desc_col,
        output_column_name=translated_col,
        df=df.loc[mask_non_english],  # No .copy(), keeping reference to df
        user_prompt="user_prompt",  # Dynamic per row
        gpt_model=gpt_model,
        client=client,
        batch_size=batch_size,
        concurrency_limit=concurrency_limit,
        cache_file=cache_file
    )

    # Step 5: Merge results back into original DataFrame
    df.update(df_translated)

    # Step 6: Remove the "user_prompt" column
    df.drop(columns=["user_prompt"], inplace=True, errors="ignore")

    return df

In [17]:
df_translated = await translate_non_english_descriptions(
    df=df_language.copy(),
    language_col="Language",
    job_desc_col="Job Description",
    translated_col="Job Description English",
    client=client_async,
    gpt_model="gpt-4o-2024-11-20",
    batch_size=10,
    concurrency_limit=10,
    language_mapping=language_mapping,
    cache_file="./cache/job_description_english_cache_V2_gpt-4o-2024-11-20.json"
)

Processing Batches: 100%|██████████| 42/42 [00:01<00:00, 41.42it/s]


### Drop too short Job Descriptions

In [18]:
df_translated = df_translated[df_translated['Job Description English'].str.split().str.len() >= 14].reset_index(drop=True)

## Extract the information from the "Job Description in English" into logical sections

### Ground Truth testing
#### Dataframe into Text

In [19]:
sample = df_translated.sample(n=30, random_state=42)

def column_to_txt(column, output, df):
    separator = "\n\n\n" + "-" * 100 + "\n\n\n"
    with open(output, "w", encoding="utf-8") as f:
        f.write(separator.join(df[column].tolist()))

#column_to_txt(column='Job Description English', output="./ground truth/job_descriptions_sample.txt", df=sample)

####  Load Ground Truth into a DataFrame

In [20]:
def txt_to_column(txt_path, df):
    df = df.copy()
    with open(txt_path, "r", encoding="utf-8") as f:
        content = f.read().strip()  # Read and remove extra spaces
    ground_truth_labels = content.split("-" * 100)
    ground_truth_labels = [label.strip("\n") for label in ground_truth_labels]
    
    if len(ground_truth_labels) > len(df):
        ground_truth_labels = ground_truth_labels[:len(df)]
        df["Ground Truth"] = ground_truth_labels
    else:
        df["Ground Truth"] = ground_truth_labels
    return df

test = txt_to_column(txt_path="./ground truth/job_descriptions_ground_truth.txt", df=sample)
test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Language Name,Job Description English,Ground Truth
1575,Poland,Europe,True,True,default,google.com,Android Developer,Connectis_,"Kórnik, Poland",HitPraca.pl,...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:11:12 UTC,Android developer,pl,Polish,**Android Developer** \n**Location:** Poznań ...,1. Platform: Android\n2. Salary: Not mentioned...
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIChSZW1vdG...,2025-01-13 12:21:02 UTC,iOS developer,en,English,Role OverviewMcAfee is searching for an interm...,1. Platform: iOS\n2. Salary: Not mentioned\n3....
1491,Netherlands,Europe,True,True,default,google.com,IOS developer,StarApple,"Eindhoven, Netherlands",Werkzoeken.nl,...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJJT1MgZGV2ZWxvcGVyIiwiY29tcG...,2025-01-13 12:22:59 UTC,iOS developer,nl,Dutch,**Organization & Department**\n\nAre you inter...,"1. Platform: iOS\n2. Salary: €3,200 to €5,000\..."
1815,Romania,Europe,True,True,default,google.com,Android Developer / CTS Expert,Luxoft,Romania,"Indeed, Jooble, LinkedIn",...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciAvIE...,2025-01-13 12:11:28 UTC,Android developer,en,English,"Remote Romania, Romania\r\n\r\nAndroid\r\n\r\n...",1. Platform: Android\n2. Salary: Not mentioned...
2174,United Kingdom,Europe,False,False,default,google.com,Embedded Software Developer (Android),Integrity,"Chesterton, United Kingdom",SitePoint,...,,Full-time,,eyJqb2JfdGl0bGUiOiJFbWJlZGRlZCBTb2Z0d2FyZSBEZX...,2025-01-13 12:12:03 UTC,Android developer,en,English,If you have experience developing embedded rea...,1. Platform: Android\n2. Salary: £40-50K plus ...


#### Extract sections from Ground Truth

In [21]:
# Function to extract each section
def extract_section(text, section_name):
    pattern = rf"{section_name}:\s*(.*?)(?=\n\d+\.|\Z)"
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None

#Apply to all rowsr
test["Platform_GT"] = test["Ground Truth"].apply(lambda x: extract_section(x, r"1\. Platform"))
test["Salary_GT"] = test["Ground Truth"].apply(lambda x: extract_section(x, r"2\. Salary"))
test["Requirements_GT"] = test["Ground Truth"].apply(lambda x: extract_section(x, r"3\. Requirements"))
test["Nice to have_GT"] = test["Ground Truth"].apply(lambda x: extract_section(x, r"4\. Nice to have"))
test["Responsibilities_GT"] = test["Ground Truth"].apply(lambda x: extract_section(x, r"5\. Responsibilities"))
test["Benefits_GT"] = test["Ground Truth"].apply(lambda x: extract_section(x, r"6\. Benefits"))


test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Language Name,Job Description English,Ground Truth,Platform_GT,Salary_GT,Requirements_GT,Nice to have_GT,Responsibilities_GT,Benefits_GT
1575,Poland,Europe,True,True,default,google.com,Android Developer,Connectis_,"Kórnik, Poland",HitPraca.pl,...,pl,Polish,**Android Developer** \n**Location:** Poznań ...,1. Platform: Android\n2. Salary: Not mentioned...,Android,Not mentioned,- Minimum 3 years of commercial experience as ...,- Familiarity with technologies and tools such...,- Designing and implementing new screens in th...,- Opportunity to participate in integration ev...
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,en,English,Role OverviewMcAfee is searching for an interm...,1. Platform: iOS\n2. Salary: Not mentioned\n3....,iOS,Not mentioned,- You can develop iOS applications at an inter...,- Diving deep into lower-level libraries like ...,"(list out or say ""Not mentioned"")\n - Your wo...",Not mentioned
1491,Netherlands,Europe,True,True,default,google.com,IOS developer,StarApple,"Eindhoven, Netherlands",Werkzoeken.nl,...,nl,Dutch,**Organization & Department**\n\nAre you inter...,"1. Platform: iOS\n2. Salary: €3,200 to €5,000\...",iOS,"€3,200 to €5,000",- 3 years of work experience with Swift\n - B...,- Experience in an environment where cybersecu...,- Primarily focus on backend development for t...,- 28 vacation days + the option to purchase 10...
1815,Romania,Europe,True,True,default,google.com,Android Developer / CTS Expert,Luxoft,Romania,"Indeed, Jooble, LinkedIn",...,en,English,"Remote Romania, Romania\r\n\r\nAndroid\r\n\r\n...",1. Platform: Android\n2. Salary: Not mentioned...,Android,Not mentioned,- Deep expertise in Google CTS tests\n - Stro...,- Docker\n - AUTOSAR (AUTomotive Open System ...,- Work inside an intercultural team of many la...,Not mentioned
2174,United Kingdom,Europe,False,False,default,google.com,Embedded Software Developer (Android),Integrity,"Chesterton, United Kingdom",SitePoint,...,en,English,If you have experience developing embedded rea...,1. Platform: Android\n2. Salary: £40-50K plus ...,Android,£40-50K plus benefits,- Degree in relevant subject\n - Embedded rea...,- Experience of Linux kernel and system progra...,Not mentioned,- An early finish on Fridays\n - Bonus Privat...


#### Estract Sections From GPT's version

In [22]:
system_prompt = """
You are an AI assistant. Your role is to extract specific information from job descriptions and format them in a strict structure.
"""

user_prompt = """
I will provide a job description. Please extract and present the information in **this exact order**:

1. Platform: (Android/iOS/Cross-platform)
2. Salary: (If stated; otherwise "Not mentioned")
3. Requirements: (verbatim from the job description or "Not mentioned")
4. Nice to have: (verbatim or "Not mentioned")
5. Responsibilities: (verbatim or "Not mentioned")
6. Benefits: (verbatim or "Not mentioned")

**Guidelines**:
- **DO NOT reword, paraphrase, or summarize** any part of the job description. Copy the sentences exactly as they appear.
- Combine all mandatory or required skill sections (e.g., "Requirements," "Skills," "Key Technologies," "About You") under **Requirements**.
- If the job description specifically says something is "a plus," "beneficial," or otherwise indicates it’s optional, place it under "Nice to have" even if it appears under a "Requirements" heading in the job description.
- If there is an "About you" or "About Role" section (or similar) that describs duties or tasks, include those under "Responsibilities".
- If the information is not in the job description, write "Not mentioned" for that section.
- For multiple platforms (e.g., Android, iOS), list them all in **Platform** and use headings under Requirements (and other sections, if needed) like "General Requirements:", "For Android Developers:", "For iOS Developers:".
- Present your answer **only** in the format above.

---
Here is the job description:
"""

df_extracted_test = await chatgpt_async(
    input_column_name="Job Description English", 
    output_column_name="Job Description Extracted",
    input_text_length=None,
    output_text_length=None,
    num_rows=None,  
    df=test.copy(), 
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=5,
    concurrency_limit=10,
    cache_file="./cache/job_description_extracted_cache_gpt-4o-2024-11-20.json"
)

Processing Batches: 100%|██████████| 7/7 [00:01<00:00,  6.51it/s]


In [23]:
df_extracted_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language Name,Job Description English,Ground Truth,Platform_GT,Salary_GT,Requirements_GT,Nice to have_GT,Responsibilities_GT,Benefits_GT,Job Description Extracted
1575,Poland,Europe,True,True,default,google.com,Android Developer,Connectis_,"Kórnik, Poland",HitPraca.pl,...,Polish,**Android Developer** \n**Location:** Poznań ...,1. Platform: Android\n2. Salary: Not mentioned...,Android,Not mentioned,- Minimum 3 years of commercial experience as ...,- Familiarity with technologies and tools such...,- Designing and implementing new screens in th...,- Opportunity to participate in integration ev...,1. Platform: Android \n2. Salary: Not mention...
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,English,Role OverviewMcAfee is searching for an interm...,1. Platform: iOS\n2. Salary: Not mentioned\n3....,iOS,Not mentioned,- You can develop iOS applications at an inter...,- Diving deep into lower-level libraries like ...,"(list out or say ""Not mentioned"")\n - Your wo...",Not mentioned,1. Platform: iOS/macOS \n2. Salary: Not menti...
1491,Netherlands,Europe,True,True,default,google.com,IOS developer,StarApple,"Eindhoven, Netherlands",Werkzoeken.nl,...,Dutch,**Organization & Department**\n\nAre you inter...,"1. Platform: iOS\n2. Salary: €3,200 to €5,000\...",iOS,"€3,200 to €5,000",- 3 years of work experience with Swift\n - B...,- Experience in an environment where cybersecu...,- Primarily focus on backend development for t...,- 28 vacation days + the option to purchase 10...,"1. Platform: iOS \n2. Salary: €3,200 to €5,00..."
1815,Romania,Europe,True,True,default,google.com,Android Developer / CTS Expert,Luxoft,Romania,"Indeed, Jooble, LinkedIn",...,English,"Remote Romania, Romania\r\n\r\nAndroid\r\n\r\n...",1. Platform: Android\n2. Salary: Not mentioned...,Android,Not mentioned,- Deep expertise in Google CTS tests\n - Stro...,- Docker\n - AUTOSAR (AUTomotive Open System ...,- Work inside an intercultural team of many la...,Not mentioned,1. Platform: Android \n2. Salary: Not mention...
2174,United Kingdom,Europe,False,False,default,google.com,Embedded Software Developer (Android),Integrity,"Chesterton, United Kingdom",SitePoint,...,English,If you have experience developing embedded rea...,1. Platform: Android\n2. Salary: £40-50K plus ...,Android,£40-50K plus benefits,- Degree in relevant subject\n - Embedded rea...,- Experience of Linux kernel and system progra...,Not mentioned,- An early finish on Fridays\n - Bonus Privat...,1. Platform: Android \n2. Salary: £40-50K plu...


In [24]:
#Apply to all rowsr
df_extracted_test["Platform_GPT"] = df_extracted_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"1\. Platform"))
df_extracted_test["Salary_GPT"] = df_extracted_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"2\. Salary"))
df_extracted_test["Requirements_GPT"] = df_extracted_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"3\. Requirements"))
df_extracted_test["Nice to have_GPT"] = df_extracted_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"4\. Nice to have"))
df_extracted_test["Responsibilities_GPT"] = df_extracted_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"5\. Responsibilities"))
df_extracted_test["Benefits_GPT"] = df_extracted_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"6\. Benefits"))


df_extracted_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Nice to have_GT,Responsibilities_GT,Benefits_GT,Job Description Extracted,Platform_GPT,Salary_GPT,Requirements_GPT,Nice to have_GPT,Responsibilities_GPT,Benefits_GPT
1575,Poland,Europe,True,True,default,google.com,Android Developer,Connectis_,"Kórnik, Poland",HitPraca.pl,...,- Familiarity with technologies and tools such...,- Designing and implementing new screens in th...,- Opportunity to participate in integration ev...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,- Minimum 3 years of commercial experience as ...,- Familiarity with technologies and tools such...,- Designing and implementing new screens in th...,- Opportunity to participate in integration ev...
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,- Diving deep into lower-level libraries like ...,"(list out or say ""Not mentioned"")\n - Your wo...",Not mentioned,1. Platform: iOS/macOS \n2. Salary: Not menti...,iOS/macOS,Not mentioned,- You can develop iOS applications at an inter...,- Diving deep into lower-level libraries like ...,- Dive deep into anti-censorship technologies ...,Not mentioned
1491,Netherlands,Europe,True,True,default,google.com,IOS developer,StarApple,"Eindhoven, Netherlands",Werkzoeken.nl,...,- Experience in an environment where cybersecu...,- Primarily focus on backend development for t...,- 28 vacation days + the option to purchase 10...,"1. Platform: iOS \n2. Salary: €3,200 to €5,00...",iOS,"€3,200 to €5,000",- 3 years of work experience with Swift \n ...,- Experience in an environment where cybersecu...,- You’ll primarily focus on backend developmen...,- 28 vacation days + the option to purchase 10...
1815,Romania,Europe,True,True,default,google.com,Android Developer / CTS Expert,Luxoft,Romania,"Indeed, Jooble, LinkedIn",...,- Docker\n - AUTOSAR (AUTomotive Open System ...,- Work inside an intercultural team of many la...,Not mentioned,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,- Deep expertise in Google CTS tests \n - S...,- Docker \n - AUTOSAR (AUTomotive Open Syst...,- Work inside an intercultural team of many la...,Not mentioned
2174,United Kingdom,Europe,False,False,default,google.com,Embedded Software Developer (Android),Integrity,"Chesterton, United Kingdom",SitePoint,...,- Experience of Linux kernel and system progra...,Not mentioned,- An early finish on Fridays\n - Bonus Privat...,1. Platform: Android \n2. Salary: £40-50K plu...,Android,£40-50K plus benefits,- Degree in relevant subject \n - Embedded ...,- Experience of Linux kernel and system progra...,- Develop and support the software running on ...,- An early finish on Fridays \n - Bonus \n...


#### Calculate Fuzzy Scores

1. Token Sort Ratio (Handling Word Order Differences)<br>
If two sentences contain the same words but in different order, Levenshtein distance alone might give a low similarity score. Instead, we:
- Split both sentences into words.
- Sort them alphabetically.
- Recalculate Levenshtein Distance.

2. Token Set Ratio (Handling Partial Overlaps)<br>
If one string is a subset of another, Token Set Ratio helps:
- Convert both sentences into sets of unique words.
- Compare only the common words.

In [25]:
def compare_fuzzy_sections(df):
    sections = ['Platform', 'Salary', 'Requirements', 'Nice to have', 'Responsibilities', 'Benefits']
    
    results = []

    for section in sections:
        col_gt = f"{section}_GT"
        col_gpt = f"{section}_GPT"
        
        df[f"{section} Token Set Ratio"] = df.apply(
            lambda row: fuzz.token_set_ratio(str(row[col_gt]), str(row[col_gpt])), axis=1
        ).round(2)
        
        avg_score = df[f"{section} Token Set Ratio"].mean().round(2)
        results.append((section, avg_score))

    print("Average Token Set Ratio per section:")
    for section, score in results:
        print(f"{section}: {score}")

    values = [value for _, value in results]
    print(f"\nOverall Average Token Set Ratio: {np.mean(values):.2f}")

compare_fuzzy_sections(df_extracted_test)


Average Token Set Ratio per section:
Platform: 98.33
Salary: 97.35
Requirements: 98.61
Nice to have: 93.38
Responsibilities: 90.88
Benefits: 91.85

Overall Average Token Set Ratio: 95.07


### Full Extraction 

In [26]:
df_extracted = await chatgpt_async(
    input_column_name="Job Description English", 
    output_column_name="Job Description Extracted",
    input_text_length=None,
    output_text_length=None,
    num_rows=None,  
    df=df_translated.copy(), 
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=30,
    concurrency_limit=35,
    cache_file="./cache/job_description_extracted_cache_gpt-4o-2024-11-20.json"
)

Processing Batches: 100%|██████████| 95/95 [00:14<00:00,  6.72it/s]


In [27]:
df_extracted.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Language Name,Job Description English,Job Description Extracted
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en,English,We are a dynamic FinTech company headquartered...,"1. Platform: Android, iOS, Cross-platform \n2..."
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJNb2JpbGUgQXBwbGljYXRpb24gRG...,2025-01-13 12:08:51 UTC,Android developer,en,English,Lead Mobile Developer | Hyper Growth Startup |...,1. Platform: Cross-platform \n2. Salary: $160...
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,,Contractor,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciDigJ...,2025-01-13 12:08:48 UTC,Android developer,en,English,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de,German,**What are your tasks?** \n- You design attra...,1. Platform: Android/iOS \n2. Salary: Not men...
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",...,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIFVwMyAoZi...,2025-01-13 12:20:39 UTC,iOS developer,en,English,Do you want to push the frontier of digital se...,1. Platform: iOS \n2. Salary: The gross annua...


## Extracting sections into columns

### Testing

In [28]:
df_sections = df_extracted.copy()

sample_sections_test = df_sections.sample(n=50, random_state=42)

In [29]:
#Apply to all rowsr
sample_sections_test["Platform"] = sample_sections_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"1\. Platform"))
sample_sections_test["Salary_E"] = sample_sections_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"2\. Salary"))
sample_sections_test["Requirements"] = sample_sections_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"3\. Requirements"))
sample_sections_test["Nice to have"] = sample_sections_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"4\. Nice to have"))
sample_sections_test["Responsibilities"] = sample_sections_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"5\. Responsibilities"))
sample_sections_test["Benefits"] = sample_sections_test["Job Description Extracted"].apply(lambda x: extract_section(x, r"6\. Benefits"))


sample_sections_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Language Name,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
1575,Poland,Europe,True,True,default,google.com,Android Developer,Connectis_,"Kórnik, Poland",HitPraca.pl,...,pl,Polish,**Android Developer** \n**Location:** Poznań ...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,- Minimum 3 years of commercial experience as ...,- Familiarity with technologies and tools such...,- Designing and implementing new screens in th...,- Opportunity to participate in integration ev...
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,en,English,Role OverviewMcAfee is searching for an interm...,1. Platform: iOS/macOS \n2. Salary: Not menti...,iOS/macOS,Not mentioned,- You can develop iOS applications at an inter...,- Diving deep into lower-level libraries like ...,- Dive deep into anti-censorship technologies ...,Not mentioned
1491,Netherlands,Europe,True,True,default,google.com,IOS developer,StarApple,"Eindhoven, Netherlands",Werkzoeken.nl,...,nl,Dutch,**Organization & Department**\n\nAre you inter...,"1. Platform: iOS \n2. Salary: €3,200 to €5,00...",iOS,"€3,200 to €5,000",- 3 years of work experience with Swift \n ...,- Experience in an environment where cybersecu...,- You’ll primarily focus on backend developmen...,- 28 vacation days + the option to purchase 10...
1815,Romania,Europe,True,True,default,google.com,Android Developer / CTS Expert,Luxoft,Romania,"Indeed, Jooble, LinkedIn",...,en,English,"Remote Romania, Romania\r\n\r\nAndroid\r\n\r\n...",1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,- Deep expertise in Google CTS tests \n - S...,- Docker \n - AUTOSAR (AUTomotive Open Syst...,- Work inside an intercultural team of many la...,Not mentioned
2174,United Kingdom,Europe,False,False,default,google.com,Embedded Software Developer (Android),Integrity,"Chesterton, United Kingdom",SitePoint,...,en,English,If you have experience developing embedded rea...,1. Platform: Android \n2. Salary: £40-50K plu...,Android,£40-50K plus benefits,- Degree in relevant subject \n - Embedded ...,- Experience of Linux kernel and system progra...,- Develop and support the software running on ...,- An early finish on Fridays \n - Bonus \n...


### Deployment

In [30]:
df_sections["Platform"] = df_sections["Job Description Extracted"].apply(lambda x: extract_section(x, r"1\. Platform"))
df_sections["Salary_E"] = df_sections["Job Description Extracted"].apply(lambda x: extract_section(x, r"2\. Salary"))
df_sections["Requirements"] = df_sections["Job Description Extracted"].apply(lambda x: extract_section(x, r"3\. Requirements"))
df_sections["Nice to have"] = df_sections["Job Description Extracted"].apply(lambda x: extract_section(x, r"4\. Nice to have"))
df_sections["Responsibilities"] = df_sections["Job Description Extracted"].apply(lambda x: extract_section(x, r"5\. Responsibilities"))
df_sections["Benefits"] = df_sections["Job Description Extracted"].apply(lambda x: extract_section(x, r"6\. Benefits"))


df_sections.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Language Name,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,en,English,We are a dynamic FinTech company headquartered...,"1. Platform: Android, iOS, Cross-platform \n2...","Android, iOS, Cross-platform",Not mentioned,- Educational Background: Bachelor's degree in...,- Flutter experience is a plus. \n - Progra...,"- Develop, test, and deploy high quality mobil...",- Young & dynamic workplace & culture (with of...
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,en,English,Lead Mobile Developer | Hyper Growth Startup |...,1. Platform: Cross-platform \n2. Salary: $160...,Cross-platform,"$160,000 + Super",Not mentioned,Not mentioned,Not mentioned,Not mentioned
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,en,English,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,...,de,German,**What are your tasks?** \n- You design attra...,1. Platform: Android/iOS \n2. Salary: Not men...,Android/iOS,Not mentioned,- A completed IT education (HTL/FH/University)...,- Professional experience in developing native...,- You design attractive modules and software p...,"- We are a stable, owner-managed company with ..."
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",...,en,English,Do you want to push the frontier of digital se...,1. Platform: iOS \n2. Salary: The gross annua...,iOS,The gross annual salary according to the colle...,• At least 3 years of experience as iOS develo...,Not mentioned,• Develop the iOS app of our fully digital pro...,• Top mobile phone of your choice incl. employ...


## Take only iOS and Android vacancies

In [31]:
platform_counts = df_sections["Platform"].value_counts().reset_index()
platform_counts.columns = ['Platform', 'Count']
print(platform_counts)

                        Platform  Count
0                            iOS   1094
1                        Android   1080
2                   Android, iOS    156
3   Android, iOS, Cross-platform    109
4                  Not mentioned     77
..                           ...    ...
67   iOS, Android, Windows Phone      1
68        Android/Cross-platform      1
69       Android, iOS, Web-based      1
70               Android, Huawei      1
71           Android, macOS, iOS      1

[72 rows x 2 columns]


In [32]:
df_sections[df_sections['Platform'] == 'Not mentioned']

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Language Name,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
11,Austria,Europe,True,True,default,google.com,Senior Django+Angular Developer (Full-Stack or...,iRonin - Web DevOps Mobile Developers: Ruby on...,Austria,StudySmarter - Talents,...,en,English,Senior Django+Angular Developer (Full-Stack or...,1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,- Experience: Minimum 5 years in software deve...,Not mentioned,- Enhance our legacy product catalog applicati...,- Flexible working hours. The most important t...
142,Bulgaria,Europe,True,True,local,google.bg,Mid/Senior Android Developer,Easy Consult Ltd,Anywhere,Jobgether,...,en,English,"This a Full Remote job, the offer is available...",1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,- 4+ years of experience in Kotlin development...,Not mentioned,- ﻿﻿Actively participate in development relate...,- Simple recruitment process as well as quick ...
319,Canada,Northern America,False,False,default,google.com,IOS Developer,Bounteous,"Calgary, AB, Canada","Startup Jobs, Jobs Trabajo.org",...,en,English,Opening from Canada\r\nBounteous x Accolite ma...,1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned
383,Canada,Northern America,False,False,default,google.com,iPhone / iPad & Android Developer,eMacity Leads,"Ontario, Canada","Indeed, Glassdoor, SimplyHired",...,en,English,Level of experience: Middle/High\r\n\r\nPositi...,1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,To analyse and implement highly efficient webs...,"Joomla, WordPress as a plus.",Not mentioned,Not mentioned
395,Canada,Northern America,False,False,default,google.com,Experienced iOS Developer Wanted for Large-Sca...,"LGS, une Société IBM an IBM Company","Montreal, Quebec, Canada",Jobs Trabajo.org,...,en,English,"LGS, an IBM Company, is committed to providing...","1. Platform: Not mentioned \n2. Salary: $120,...",Not mentioned,"$120,000 - $180,000 per year, depending on exp...",Not mentioned,Not mentioned,- Participate in the development of client pro...,Not mentioned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,United States,Northern America,False,False,default,google.com,SWE - Tools & Automation Engineer - Developer ...,Apple,"San Diego, CA","Careers At Apple, AARP Job Board, ZipRecruiter...",...,en,English,"Summary\r\nPosted: Oct 16, 2024\r\nWeekly Hour...","1. Platform: Not mentioned \n2. Salary: $129,...",Not mentioned,"$129,600 - $236,300",• Excellent understanding of the software deve...,"• Knowledge of iOS, macOS, and Xcode. \n • ...",- Collaborate closely with software developers...,"• At Apple, base pay is one part of our total ..."
2724,United States,Northern America,False,False,default,google.com,Android Developer/ Ios Developer,Keylent Inc,"Detroit, MI","ZipRecruiter, Smart Recruiters Jobs, Ladders",...,en,English,Company Description\r\n\r\nWe established Keyl...,1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,• Strong Object Oriented development backgroun...,Not mentioned,Not mentioned,Not mentioned
2753,United States,Northern America,False,False,default,google.com,Sr. iOS Developer,FlexIT Inc,"Beaverton, OR","Indeed, ZipRecruiter, SimplyHired, BeBee, Lear...",...,en,English,RESPONSIBILITIES\r\n• Work with talented engin...,1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,Not mentioned,Not mentioned,• Work with talented engineers for the technic...,Not mentioned
2782,United States,Northern America,False,False,default,google.com,"Senior iOS Developer, Payments",Score Media and Gaming Inc.,"Philadelphia, PA","Startup Jobs, Ladders",...,en,English,"theScore, a wholly-owned subsidiary of PENN En...",1. Platform: Not mentioned \n2. Salary: Not m...,Not mentioned,Not mentioned,"• A solid foundation in computer science, with...","• Experience with Kubernetes, Kafka, gRPC, exp...",• Working with our preferred technology stack ...,• Competitive compensation package. \n • Co...


In [33]:
df_sections[(df_sections['Platform'] == 'iOS/macOS') | (df_sections['Platform'] == 'iOS, tvOS')]

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Language Name,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,en,English,Role OverviewMcAfee is searching for an interm...,1. Platform: iOS/macOS \n2. Salary: Not menti...,iOS/macOS,Not mentioned,- You can develop iOS applications at an inter...,- Diving deep into lower-level libraries like ...,- Dive deep into anti-censorship technologies ...,Not mentioned
385,Canada,Northern America,False,False,default,google.com,"Software Developer in Test, Creativity Apps",Apple,"Vancouver, BC, Canada",Careers At Apple,...,en,English,"Summary\r\nPosted: Sep 4, 2024\r\n\r\nRole Num...","1. Platform: iOS/macOS \n2. Salary: $113,400 ...",iOS/macOS,"$113,400 and $215,300",• 5+ years experience in QA/QE \n • Minimum...,• 2+ years of Full-stack Software Developer in...,"In this role, you will be responsible for plan...","• At Apple, base pay is one part of our total ..."
534,Czechia,Europe,True,True,default,google.com,iOS Developer,Sledovanitv.cz,"Brno, Czechia",Indeed.cz,...,cs,Czech,We are SledovaniTV.cz - the most technological...,"1. Platform: iOS, tvOS \n2. Salary: Not menti...","iOS, tvOS",Not mentioned,- Advanced knowledge of iOS development (exper...,- Your own project to showcase,- Have the opportunity to dive into our iOS an...,"- The opportunity to work in a young, inspirin..."
1273,Mexico,Northern America,False,False,default,google.com,Senior Ios Developer,Bhuvi It Solutions,"Guadalajara, Jalisco, Mexico","BeBee, Trabajo.org - Vacantes De Empleo, Traba...",...,en,English,Job Title: iOS Developer\r\n\r\nWe are seeking...,"1. Platform: iOS, tvOS \n2. Salary: $120,000 ...","iOS, tvOS","$120,000 - $180,000 per year.",• 4+ years of professional software developmen...,Not mentioned,"• Produce a reliable, performant, configurable...",• TN Visa Sponsorship. \n • USDPay. \n •...


### Rename rows with 'iOS/macOS' or 'iOS, tvOS' to iOS

In [34]:
# Replace the first two rows with 'iOS, tvOS'
df_sections.loc[df_sections[df_sections['Platform'] == 'iOS, tvOS'].iloc[:2].index, 'Platform'] = 'iOS'

# Replace the first two rows with 'iOS/macOS'
df_sections.loc[df_sections[df_sections['Platform'] == 'iOS/macOS'].iloc[:2].index, 'Platform'] = 'iOS'


### Extraction only iOS and Android Vacancies

In [35]:
df_final = df_sections.copy()

df_final = df_final[df_final['Platform'].isin(['iOS', 'Android'])]
df_final = df_final.sort_values(by='Location').reset_index(drop=True)

df_final.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Language Name,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,en,English,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,de,German,**iOS Developer 80–100% w/m/d**\n\n**Job Descr...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera..."
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,en,English,Looking for a iOS Developer. Playing well in a...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,de,German,**Your Role in the Team** \n- You contribute ...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,en,English,We are searching for iOS Software Engineers wi...,1. Platform: iOS \n2. Salary: Depending on qu...,iOS,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...


## Extract technologies and tools

### Ground Truth testing

In [36]:
df_test = df_final.copy()

df_extract_sample = df_test.sample(n=40, random_state=42)

In [37]:
df_extract_sample["Full Requirements"] = (
    "3. Requirements:\n" + df_extract_sample["Requirements"].astype(str) + "\n\n" +
    "4. Nice to have:\n" + df_extract_sample["Nice to have"].astype(str) + "\n\n" +
    "5. Responsibilities:\n" + df_extract_sample["Responsibilities"].astype(str)
)
df_extract_sample

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language Name,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements
282,Canada,Northern America,False,False,default,google.com,Développeur iOS senior -Senior iOS Developer,OneSpan,"Montreal, Quebec, Canada","Greenhouse, Indeed, Kovasys IT Recruitment Age...",...,French,The English translation for the job descriptio...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nNot mentioned\n\n4. Nice to ...
479,France,Europe,True,True,default,google.com,Senior Android Developer - Video Expert (All G...,Dailymotion,"Paris, France",Ivy Exec,...,English,Company Description\r\n\r\nDailymotion is the ...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Excellent knowledge of Android SDK \n • A...,• Proven experience in developing video-based ...,• Design and build advanced applications for t...,Not mentioned,3. Requirements:\n• Excellent knowledge of And...
1307,Romania,Europe,True,True,local,google.ro,iOS Developer,ASSIST SOFTWARE,"Suceava, Romania",IOS Jobs,...,English,Are you an iOS mobile innovator? We need you! ...,1. Platform: iOS \n2. Salary: Competitive sal...,iOS,Competitive salary,• Ability to design and implement iOS applicat...,"• Experience with Phonegap, HTML5 \n • Adep...",Not mentioned,Relocation package for 2 months \n Employme...,3. Requirements:\n• Ability to design and impl...
1254,Portugal,Europe,True,True,default,google.com,Principal iOS Software Engineer,Sky,"Aveiro, Portugal",Empregos Trabajo.org,...,English,Role Overview\r\n\r\nSky Portugal is a leading...,"1. Platform: iOS \n2. Salary: $120,000 - $180...",iOS,"$120,000 - $180,000 per annum",• 5+ years of experience in native iOS develop...,Not mentioned,"• Design, develop, and deliver high-quality so...",• Above market salary \n • Yearly bonus \n...,3. Requirements:\n• 5+ years of experience in ...
507,Germany,Europe,True,True,default,google.com,iOS Developer - Health Care App (m/w/d) Trier,HIBA GmbH,"Trier, Germany",XING,...,German,**iOS Developer - Health Care App (m/f/d) Trie...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- Completed degree in computer science or a co...,Not mentioned,- Development and maintenance of iOS applicati...,- Continuous training opportunities to ensure ...,3. Requirements:\n- Completed degree in comput...
495,France,Europe,True,True,default,google.com,Android (Kotlin) Developer - Freelance,Veepee,"Nantes, France",Okko Jobs,...,English,The vente-privee group has consolidated its va...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• At least 3 years of experience in building A...,Not mentioned,• Propose and work on new solutions with respe...,• Large amount of good quality data (over 1M v...,3. Requirements:\n• At least 3 years of experi...
416,Estonia,Europe,True,True,default,google.com,iOS Software Development Expert,Addendum,"Tallinn, Estonia","BeBee, Trabajo.org",...,English,About the Company\r\n\r\nAddendum is a global ...,1. Platform: iOS \n2. Salary: $40-$80 per hou...,iOS,$40-$80 per hour,• Proven experience in developing native iOS a...,Not mentioned,• Develop high-quality native iOS applications...,• Flexible working hours. \n • Top talent c...,3. Requirements:\n• Proven experience in devel...
2009,United States,Northern America,False,False,default,google.com,iOS Developer,CloudBerg Tec,"Salt Lake City, UT","Indeed, Glassdoor, Jooble, OPTnation, LinkedIn...",...,English,Job Overviews\r\n\r\nLocation:\r\nSalt Lake Ci...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- We are looking for a passionate iOS develope...,Not mentioned,- Designing and building mobile applications f...,Not mentioned,3. Requirements:\n- We are looking for a passi...
1727,United Kingdom,Europe,False,False,default,google.com,Senior Android Engineer,ASOS.com,"London, United Kingdom","LinkedIn, Lever, Smart Recruiters Jobs, Greenh...",...,English,"Company Description\r\n\r\nWe’re ASOS, the onl...",1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,- You are a passionate Senior Android develope...,Not mentioned,- Deliver top quality projects made using best...,- Employee discount (hello ASOS discount!) \n...,3. Requirements:\n- You are a passionate Senio...
1716,United Kingdom,Europe,False,False,default,google.com,Android Developer,AES,"Cambridge, United Kingdom",BeBee,...,English,Android Developer (Hybrid)\r\n\r\nWe are seeki...,"1. Platform: Android \n2. Salary: £75,000 - £...",Android,"£75,000 - £85,000 per annum + Excellent Benefi...",• A HNC/D / Degree in Software Engineering (or...,Not mentioned,"• Developing, configuring, and optimizing the ...","• £75,000 - £85,000 per annum \n • Excellen...",3. Requirements:\n• A HNC/D / Degree in Softwa...


#### Dataframe into Text

In [38]:
#column_to_txt(column='Full Requirements', output="./ground truth/Full_requirements_sample.txt", df=df_extract_sample)

####  Load Ground Truth into a DataFrame

In [39]:
df_extract_sample = txt_to_column(txt_path="./ground truth/Full_requirements_ground_truth.txt", df=df_extract_sample)
df_extract_sample.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Ground Truth
282,Canada,Northern America,False,False,default,google.com,Développeur iOS senior -Senior iOS Developer,OneSpan,"Montreal, Quebec, Canada","Greenhouse, Indeed, Kovasys IT Recruitment Age...",...,The English translation for the job descriptio...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nNot mentioned\n\n4. Nice to ...,{}
479,France,Europe,True,True,default,google.com,Senior Android Developer - Video Expert (All G...,Dailymotion,"Paris, France",Ivy Exec,...,Company Description\r\n\r\nDailymotion is the ...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Excellent knowledge of Android SDK \n • A...,• Proven experience in developing video-based ...,• Design and build advanced applications for t...,Not mentioned,3. Requirements:\n• Excellent knowledge of And...,"{\n ""languages_and_runtimes"": [""Kotlin"", ""co..."
1307,Romania,Europe,True,True,local,google.ro,iOS Developer,ASSIST SOFTWARE,"Suceava, Romania",IOS Jobs,...,Are you an iOS mobile innovator? We need you! ...,1. Platform: iOS \n2. Salary: Competitive sal...,iOS,Competitive salary,• Ability to design and implement iOS applicat...,"• Experience with Phonegap, HTML5 \n • Adep...",Not mentioned,Relocation package for 2 months \n Employme...,3. Requirements:\n• Ability to design and impl...,"{\n ""languages_and_runtimes"": [""Objective-C""..."
1254,Portugal,Europe,True,True,default,google.com,Principal iOS Software Engineer,Sky,"Aveiro, Portugal",Empregos Trabajo.org,...,Role Overview\r\n\r\nSky Portugal is a leading...,"1. Platform: iOS \n2. Salary: $120,000 - $180...",iOS,"$120,000 - $180,000 per annum",• 5+ years of experience in native iOS develop...,Not mentioned,"• Design, develop, and deliver high-quality so...",• Above market salary \n • Yearly bonus \n...,3. Requirements:\n• 5+ years of experience in ...,"{\n ""languages_and_runtimes"":[""Swift""],\n ..."
507,Germany,Europe,True,True,default,google.com,iOS Developer - Health Care App (m/w/d) Trier,HIBA GmbH,"Trier, Germany",XING,...,**iOS Developer - Health Care App (m/f/d) Trie...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- Completed degree in computer science or a co...,Not mentioned,- Development and maintenance of iOS applicati...,- Continuous training opportunities to ensure ...,3. Requirements:\n- Completed degree in comput...,"{\n ""languages_and_runtimes"": [""Objective-C""..."


#### Chat GPT's version 

In [40]:
system_prompt_1 = """
You are a senior extraction assistant.

GOAL
Given a natural-language job-vacancy text, return one and only one
valid, minified JSON object that maps every technology term found in
the input to exactly ONE key from ALLOWED_KEYS.

STRICT RULES
1. Output = raw JSON only (no Markdown, no comments, no extra text).
2. Use keys from ALLOWED_KEYS exactly as written. Omit any key whose array would be empty.
3. Each value is an **array of unique strings**, sorted alphabetically and preserving the term’s original spelling/case from the input.
4. Ignore soft skills and vague nouns (e.g. communication, documentation, performance, detail, English, collaboration, problem-solving, university).
5. **IGNORE** overly generic technology words/phrases (e.g. "design patterns", "android ui", "json", "xml", "clean code", "API integration", "unit testing", "Continuous integration", "CI", "CD", "CI/CD", "cryptography", "application testing" ect.).
6. DO NOT hallucinate. Include a term **only** if it appears verbatim in the input text.
7. If a technology term matches **exactly** (case-insensitive) one of the values in ALLOWED_KEYS, then it MUST be assigned to that exact key — even if the name could hypothetically fit other categories.
8. The order of keys in the output JSON MUST strictly follow the order of keys as they appear in the ALLOWED_KEYS section below. Only include keys that have at least one matched term. If a key has no matched terms, it MUST be completely omitted from the output, even if this breaks the visual continuity of the ALLOWED_KEYS order.


ALLOWED_KEYS (with examples)
{
    "languages_and_runtimes": ["Swift", "Kotlin", "Java", "Dart", "Objective-C", "Coroutines", "RxSwift", "Combine", "GCD", "RxJava", "Flows", "JavaScript", "TypeScript", "C", "C++", "Python", "Golang", "PHP"],
    "ui_and_cross_platform_frameworks": ["SwiftUI", "UIKit", "Jetpack Compose", "Flutter", "React Native", "Xamarin", "Ionic", "WatchKit", "Cocoa Touch", "PhoneGap", "Cordova", "Kotlin Multiplatform"],
    "architectural_patterns": ["MVVM", "VIPER", "Clean Architecture", "MVI", "MVC", "MVP", "Redux"],
    "dependency_injection_frameworks": ["Dagger", "Hilt", "Koin", "Swinject"],
    "build_and_dependency_management": ["Gradle", "CocoaPods", "SPM", "Bazel", "Buck", "Xcode", "Android Studio", "CircleCI", "Bamboo", "CocoaPods"],
    "data_and_caching": ["Core Data", "Room", "Realm", "SQLite", "Firestore", "MongoDB", "SAP UltraLite", "MySQL", "NoSQL"],
    "networking_and_api": ["Retrofit", "OkHttp", "URLSession", "Alamofire", "GraphQL", "REST API", "WebSockets", "gRPC", "Protocol Buffers"],
    "backend_or_baas_integration": ["Firebase", "AWS Amplify", "Azure Mobile", "Parse", "AWS Mobile Hub", "AWS Cognito", "AWS S3"],
    "device_and_platform_services_and_third_party_sdks": ["ARKit", "HealthKit", "CoreML", "Core Animation", "Android SDK", "Android NDK", "Push Notifications", "BLE", "NFC", "Camera", "Location", "Sensors", "Stripe SDK", "Facebook SDK", "AdMob", "Google Maps", "ExoPlayer", "Glide", "Stripe", "PayPal SDK", "Binder", "AIDL", "JNI", "CTS", "HAL", "LoRa", "CarPlay", "Android Auto", "CydiaSubstrate", "Frida", "WebViews"],
    "security_and_cryptography": ["Keychain", "TLS pinning", "OAuth2", "Veracode", "Checkmarx", "OWASP"],
    "testing_frameworks": ["XCTest", "JUnit", "Espresso", "Mockito", "Robolectric"],
    "debugging_or_profiling": ["Instruments", "Android Profiler"],
    "version_control": ["Git", "GIT", "git", "SVN", "Mercurial", "Gerrit", "GitFlow", "SourceTree", "Fork", "Bitbucket"],
    "ci_cd_and_release_automation": ["GitHub Actions", "Jenkins", "Bitrise", "fastlane", "CircleCI", "Bamboo", "GitLab CI", "Docker"],
    "monitoring_analytics_and_crash_reporting": ["Crashlytics", "Sentry", "Datadog", "Firebase Analytics", "App Center"],
    "development_methodologies": ["Scrum", "Kanban", "Agile", "SAFe", "TDD", "BDD", "DevOps"],
    "testing_process_and_qa": ["test coverage", "regression testing"],
    "code_quality_and_static_analysis": ["SonarQube", "SwiftLint", "Veracode", "Checkmarx"],
    "documentation_and_knowledge_sharing": ["Swagger", "OpenAPI", "Javadoc", "Confluence", "HIG"],
    "collaboration_pm_and_design_handoff": ["Jira", "Trello", "Figma", "Zeplin", "Rally/AgileCentral"],
    "distribution_and_store_operations": ["TestFlight", "App Store Connect", "Google Play Console"],
    "compliance_and_certifications": ["SOC 2", "GDPR"]
}

EXAMPLE RESPONSE
{
    "languages_and_runtimes":["Swift","Java"],
    "version_control":["Git"]
}
"""


user_prompt_1 = """
Extract every technology, tool, framework, library, service or formal
methodology mentioned in the text below and output the JSON exactly as
specified in the system prompt.

---
INPUT TEXT:
"""


df_extracted_requirements_test = await chatgpt_async(
    input_column_name="Full Requirements", 
    output_column_name="Extracted Technologies GPT",
    input_text_length=None,
    output_text_length=None,
    num_rows=None,  
    df=df_extract_sample.copy(), 
    system_prompt=system_prompt_1,
    user_prompt=user_prompt_1,
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=40,
    concurrency_limit=40,
    cache_file= "./cache/Full_requirements_cache.json"
)

Processing Batches: 100%|██████████| 1/1 [00:00<00:00, 28.79it/s]


In [41]:
df_extracted_requirements_test.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Ground Truth,Extracted Technologies GPT
282,Canada,Northern America,False,False,default,google.com,Développeur iOS senior -Senior iOS Developer,OneSpan,"Montreal, Quebec, Canada","Greenhouse, Indeed, Kovasys IT Recruitment Age...",...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nNot mentioned\n\n4. Nice to ...,{},{}
479,France,Europe,True,True,default,google.com,Senior Android Developer - Video Expert (All G...,Dailymotion,"Paris, France",Ivy Exec,...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Excellent knowledge of Android SDK \n • A...,• Proven experience in developing video-based ...,• Design and build advanced applications for t...,Not mentioned,3. Requirements:\n• Excellent knowledge of And...,"{\n ""languages_and_runtimes"": [""Kotlin"", ""co...","{\n ""languages_and_runtimes"":[""Kotlin"",""Cor..."
1307,Romania,Europe,True,True,local,google.ro,iOS Developer,ASSIST SOFTWARE,"Suceava, Romania",IOS Jobs,...,1. Platform: iOS \n2. Salary: Competitive sal...,iOS,Competitive salary,• Ability to design and implement iOS applicat...,"• Experience with Phonegap, HTML5 \n • Adep...",Not mentioned,Relocation package for 2 months \n Employme...,3. Requirements:\n• Ability to design and impl...,"{\n ""languages_and_runtimes"": [""Objective-C""...","{\n ""languages_and_runtimes"":[""Objective-C""..."
1254,Portugal,Europe,True,True,default,google.com,Principal iOS Software Engineer,Sky,"Aveiro, Portugal",Empregos Trabajo.org,...,"1. Platform: iOS \n2. Salary: $120,000 - $180...",iOS,"$120,000 - $180,000 per annum",• 5+ years of experience in native iOS develop...,Not mentioned,"• Design, develop, and deliver high-quality so...",• Above market salary \n • Yearly bonus \n...,3. Requirements:\n• 5+ years of experience in ...,"{\n ""languages_and_runtimes"":[""Swift""],\n ...","{\n ""languages_and_runtimes"":[""Swift""],\n ..."
507,Germany,Europe,True,True,default,google.com,iOS Developer - Health Care App (m/w/d) Trier,HIBA GmbH,"Trier, Germany",XING,...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- Completed degree in computer science or a co...,Not mentioned,- Development and maintenance of iOS applicati...,- Continuous training opportunities to ensure ...,3. Requirements:\n- Completed degree in comput...,"{\n ""languages_and_runtimes"": [""Objective-C""...","{\n ""languages_and_runtimes"":[""Objective-C""..."


### Token set ratio

In [42]:
def extract_values(cell):
    """
    Превращает JSON-ячейку в плоский список строковых значений.
    Если ячейка пустая / некорректная — возвращает [].
    """
    if pd.isna(cell) or str(cell).strip() == "":
        return []

    try:
        parsed = json.loads(cell)
    except (ValueError, TypeError):
        return []

    values = []
    if isinstance(parsed, dict):
        for v in parsed.values():
            if isinstance(v, list):
                values.extend(v)
            elif v is not None:
                values.append(v)
    elif isinstance(parsed, list):
        values.extend(parsed)
    else:
        values.append(parsed)

    # нормализуем так же, как делает RapidFuzz внутри token_set_ratio
    return [utils.default_process(str(x)) for x in values if str(x).strip()]

def technologies_token_set_ratio(gt_cell, gpt_cell):
    """
    Считает token_set_ratio для двух JSON-ячееĸ,
    используя только значения.
    """
    vals_gt  = extract_values(gt_cell)
    vals_gpt = extract_values(gpt_cell)

    # оба пусты → полное совпадение
    if not vals_gt and not vals_gpt:
        return 100

    s_gt  = " ".join(vals_gt)
    s_gpt = " ".join(vals_gpt)
    return fuzz.token_set_ratio(s_gt, s_gpt)


df_extracted_requirements_test["Token Set Ratio"] = (df_extracted_requirements_test.apply(lambda row: technologies_token_set_ratio(row["Ground Truth"], row["Extracted Technologies GPT"]),axis=1,).round(2))
print("Average Token Set Ratio :", df_extracted_requirements_test["Token Set Ratio"].mean().round(2))


Average Token Set Ratio : 96.73


### Full extraction

In [43]:
df_final_extraction = df_final.copy()

In [44]:
df_final_extraction["Full Requirements"] = (
    "3. Requirements:\n" + df_final_extraction["Requirements"].astype(str) + "\n\n" +
    "4. Nice to have:\n" + df_final_extraction["Nice to have"].astype(str) + "\n\n" +
    "5. Responsibilities:\n" + df_final_extraction["Responsibilities"].astype(str)
)

In [45]:
df_final_extracted = await chatgpt_async(
    input_column_name="Full Requirements", 
    output_column_name="Extracted Technologies GPT",
    input_text_length=None,
    output_text_length=None,
    num_rows=None,  
    df=df_final_extraction.copy(), 
    system_prompt=system_prompt_1,
    user_prompt=user_prompt_1,
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=25,
    concurrency_limit=25,
    cache_file= "./cache/Full_requirements_cache.json"
)

Processing Batches: 100%|██████████| 88/88 [00:03<00:00, 27.88it/s]


In [46]:
df_final_extracted.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Extracted Technologies GPT
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned,3. Requirements:\n• Passionate about mobile pl...,"{\n ""languages_and_runtimes"":[""Kotlin"",""Cor..."
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,**iOS Developer 80–100% w/m/d**\n\n**Job Descr...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera...",3. Requirements:\n- **Motivation over experien...,{}
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,Looking for a iOS Developer. Playing well in a...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nPlaying well in a team and h...,{}
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,**Your Role in the Team** \n- You contribute ...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...,3. Requirements:\n- You are familiar with Cont...,"{\n ""languages_and_runtimes"":[""Swift""],\n ..."
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,We are searching for iOS Software Engineers wi...,1. Platform: iOS \n2. Salary: Depending on qu...,iOS,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...,3. Requirements:\n• Worked on at least one nat...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli..."


## Remove hallucinated words

In [83]:
# ── 0. Настройки ────────────────────────────────────────────────
STANDARDIZE = True            # ⬅ выключи, если нужно оставить исходные формы
USE_REGEX   = False           # ⬅ как и раньше

# ── 1. Загружаем словарь и строим «вариант → канон» ─────────────
with open(".//synonyms.json", encoding="utf8") as f:
    SYNONYMS = json.load(f)

VAR2CANON = {v.lower(): canon      # "m-v-c"  -> "mvc"
             for canon, variants in SYNONYMS.items()
             for v in variants}

# ── 2. Вспомогалки ──────────────────────────────────────────────
def make_ngrams(tokens, n_max=3):
    return {" ".join(tokens[i:i+n])
            for n in range(1, n_max + 1)
            for i in range(len(tokens) - n + 1)}

def is_real(term, text_flat, ngrams,
            *, use_regex=USE_REGEX, synonyms=SYNONYMS):
    term_l = term.lower()
    if term_l not in synonyms:           # нет в списке «проверяемых» → сразу True
        return True

    patterns = synonyms[term_l]
    if any(p.lower() in ngrams for p in patterns):
        return True
    if use_regex and any(re.search(rf"\b{p}\b", text_flat) for p in patterns):
        return True
    return False

# ── 3. Счётчики ────────────────────────────────────────────────
removed_counter       = Counter()   # что удалили как галлюцинацию
canonicalized_counter = Counter()   # сколько раз заменили на канон

def to_canon(term):
    """Если STANDARDIZE=True и вариант есть в словаре → вернём канон."""
    t_low = term.lower()
    if STANDARDIZE and t_low in VAR2CANON:
        canon = VAR2CANON[t_low]
        if canon != t_low:          # реальная замена, а не «mvc» -> «mvc»
            canonicalized_counter[canon] += 1
        return canon
    return t_low                    # либо исходную форму (в lower)

# ── 4. Фильтр ──────────────────────────────────────────────────
def remove_hallucinated(row):
    text_raw   = (row.get("Full Requirements") or "")
    text_lower = text_raw.lower()

    token_pattern = r"\.?[a-z0-9\+\#-]+(?:\.[a-z0-9\+\#-]+)*"
    tokens = re.findall(token_pattern, text_lower)

    ngrams  = make_ngrams(tokens)

    extracted = row.get("Extracted Technologies GPT", "")
    try:
        tech_dict = json.loads(extracted)
    except Exception:
        return extracted            # повреждённый JSON

    for cat in list(tech_dict.keys()):
        cleaned = []
        for term in tech_dict[cat]:
            canon_term = to_canon(term)
            if is_real(canon_term, text_lower, ngrams):
                cleaned.append(canon_term)
            else:
                removed_counter[canon_term] += 1

        if cleaned:
            tech_dict[cat] = list(dict.fromkeys(cleaned))  # убираем дубли
        else:
            del tech_dict[cat]

    return json.dumps(tech_dict, ensure_ascii=False)

# ── 5. Запуск ──────────────────────────────────────────────────
df_final_extracted["Extracted Technologies Clean"] = (
    df_final_extracted.apply(remove_hallucinated, axis=1)
)

# ── 6. Итоговая статистика ─────────────────────────────────────
print("\n== Hallucinations removed ==")
print(pd.DataFrame(removed_counter.items(),
                   columns=["hallucinated_term", "removed_count"])
        .sort_values("removed_count", ascending=False)
        .reset_index(drop=True))

print("\n== The terms are canonized ==")
print(pd.DataFrame(canonicalized_counter.items(),
                   columns=["canonical_term", "replaced_count"])
        .sort_values("replaced_count", ascending=False)
        .reset_index(drop=True))

print(f"\nTotal rows processed: {len(df_final_extracted)}")
print(f"Total hallucinated terms removed: {sum(removed_counter.values())}")
print(f"Total terms canonicalized: {sum(canonicalized_counter.values())}")


== Hallucinations removed ==
      hallucinated_term  removed_count
0   google play console             68
1                   git             64
2           android sdk             64
3     app store connect             59
4                   hig             37
..                  ...            ...
56             webviews              1
57               devops              1
58               okhttp              1
59              graphql              1
60                  ble              1

[61 rows x 2 columns]

== The terms are canonized ==
          canonical_term  replaced_count
0               rest api             169
1      kotlin coroutines             136
2        jetpack compose              41
3                  flows              20
4      google play store              19
5            objective-c              19
6                 gitlab              13
7            android hal              12
8                 golang               6
9                 rxjava              

### Inspection

In [84]:
def flat_terms(tech_json: str) -> set[str]:
    """{"cat": ["A", "B"]}  → {"a", "b"}   (lower-case)"""
    try:
        d = json.loads(tech_json)
    except Exception:
        return set()
    return {t.lower() for lst in d.values() for t in lst}

# исходный → множество   |   очищенный → множество
orig_sets  = df_final_extracted["Extracted Technologies GPT"].apply(flat_terms)
clean_sets = df_final_extracted["Extracted Technologies Clean"].apply(flat_terms)

df_final_extracted["removed_terms"] = orig_sets.subtract(clean_sets)

In [86]:
term = "git"                       # или "google play console", "android sdk", …

mask = df_final_extracted["removed_terms"].apply(lambda s: term.lower() in s)

view_cols = ["Full Requirements",
             "Extracted Technologies GPT",
             "Extracted Technologies Clean"]

df_debug = df_final_extracted.loc[mask, view_cols]

print(f"Rows with removed '{term}': {len(df_debug)}")
df_debug.head(5)                   # покажет первые 3 совпадения

Rows with removed 'git': 64


Unnamed: 0,Full Requirements,Extracted Technologies GPT,Extracted Technologies Clean
60,3. Requirements:\n- Several years of professio...,"{\n ""languages_and_runtimes"":[""C#"",""Java"",""...","{""languages_and_runtimes"": [""c#"", ""java"", ""kot..."
113,3. Requirements:\n• Bachelor’s degree in Compu...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin""],..."
126,3. Requirements:\n• Experience developing and ...,"{\n ""languages_and_runtimes"":[""JavaScript"",...","{""languages_and_runtimes"": [""javascript"", ""swi..."
176,3. Requirements:\n• Minimum of 3+ years of pos...,"{\n ""languages_and_runtimes"":[""C++"",""Go"",""J...","{""languages_and_runtimes"": [""c++"", ""golang"", ""..."
217,3. Requirements:\n• At least 5+ years of Andro...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin""],..."


----

In [50]:
blacklist = {
    # было
    'objective-c','git', 'agile', 'android sdk', 'android ndk', 'hig', 'tdd',
    'android studio', 'c++', 'swift', 'c#', 'dagger', 'java', 'swiftui',

    # ➕ языки и базовые рантаймы
    'kotlin', 'dart', 'python', 'javascript', 'typescript', 'golang', 'php',

    # ➕ DI-фреймворки
    'hilt', 'koin', 'swinject',

    # ➕ инструменты сборки
    'gradle', 'bazel', 'buck', 'xcode',

    # ➕ БД / кеш
    'sqlite', 'realm', 'room', 'firestore', 'mongodb',

    # ➕ сеть / API
    'retrofit', 'okhttp', 'graphql',

    # ➕ BaaS
    'firebase', 'parse',

    # ➕ «глупые» однословные SDK
    'camera', 'sensors', 'location', 'nfc', 'ble',

    # ➕ архитектурные аббревиатуры
    'mvvm', 'mvc', 'mvp', 'mvi', 'viper', 'redux',

    # ➕ методологии
    'scrum', 'kanban', 'bdd', 'devops', 'safe',

    # ➕ CI/CD и релиз-тулзы
    'jenkins', 'docker', 'bitrise', 'fastlane', 'circleci',

    # ➕ тесты
    'xctest', 'junit', 'espresso', 'mockito', 'robolectric',

    # ➕ мониторинг / краш-репортинг
    'crashlytics', 'sentry', 'datadog',

    # ➕ «коллаборация»
    'jira', 'trello', 'figma', 'zeplin'
}

In [51]:
def remove_hallucinated(row):
    text = (row.get("Full Requirements") or "").lower()
    extracted = row.get("Extracted Technologies GPT", "")
    try:
        tech_dict = json.loads(extracted)
    except Exception:
        return extracted
    
    for key in list(tech_dict.keys()):
        filtered = []
        for term in tech_dict[key]:
            term_l = term.lower()
            if term_l in blacklist and term_l not in text:
                removed_counter[term_l] += 1
                continue
            filtered.append(term)
        if filtered:
            tech_dict[key] = filtered
        else:
            del tech_dict[key]
    
    return json.dumps(tech_dict, ensure_ascii=False)


In [52]:
removed_counter = Counter()

df_final_extracted["Extracted Technologies Clean"] = df_final_extracted.apply(remove_hallucinated, axis=1)

summary_df = (
    pd.DataFrame(removed_counter.items(), columns=["hallucinated_term", "removed_count"])
    .sort_values("removed_count", ascending=False)
    .reset_index(drop=True)
)

print(summary_df)

print(f"Total rows processed: {len(df_final_extracted)}")
print(f"Total hallucinated terms removed: {summary_df['removed_count'].sum()}")


   hallucinated_term  removed_count
0        android sdk             65
1                git             51
2                tdd             46
3                hig             44
4        objective-c             35
5            swiftui             26
6        android ndk             21
7     android studio             16
8              agile             15
9               java             11
10            dagger              9
11          circleci              9
12            sqlite              7
13              mvvm              6
14             xcode              5
15             swift              4
16             junit              4
17           sensors              3
18        javascript              2
19       robolectric              2
20               ble              2
21          swinject              1
22               mvc              1
23             scrum              1
24            devops              1
25             figma              1
26            okhttp        

## Cleaning and Normolization

### Extract values

In [53]:
# Функция для извлечения значений из JSON-строки
def extract_values(json_str):
    if pd.isna(json_str) or json_str.strip() == '{}' or json_str.strip() == '':
        return None
    try:
        data = json.loads(json_str)
        values = []
        for val in data.values():
            if isinstance(val, list):
                values.extend(map(str, val))  # добавляем элементы списка как строки
            else:
                values.append(str(val))  # добавляем одиночное значение как строку
        return ', '.join(values) if values else None
    except json.JSONDecodeError:
        return None

In [54]:
df_final_extracted['Technologies Only'] = df_final_extracted['Extracted Technologies Clean'].apply(extract_values).str.lower()

### Cleaning and Normolization

In [55]:
rename_map = {
    "android automotive": "android auto",
    "android jetpack": "jetpack compose",
    "appcenter": "app center",
    "apple sdks": "ios sdk",
    "compose": "jetpack compose",
    "compose ui": "jetpack compose",
    "coroutines": "kotlin coroutines",
    "cocoa": "cocoa touch",
    "flow": "flows",
    "go": "golang",
    "google play": "google play store",
    "hal": "android hal",
    "ios sdks": "ios sdk",
    "jetpack": "jetpack compose",
    "kotlin coroutine": "kotlin coroutines",
    "nodejs": "node.js",
    "objective c": "objective-c",
    "objectivec": "objective-c",
    "play store": "google play store",
    "playstore": "google play store",
    "protobuf": "protocol buffers",
    "rest": "rest api",
    "restful apis": "rest api",
    "restful": "rest api",
    "swift ui": "swiftui",
    "websocket": "websockets",
    "rx": "reactive extensions",
    "coredata": "core data",
    "rooms": "room",
    "stripe" : "stripe sdk",
    "kmp": "kotlin multiplatform"
}

remove_list = [
    "android",
    "android ui",
    "cd",
    "ci",
    "ci/cd",
    "hybrid apps",
    "instruments",
    "ios",
    "sdk",
    "ui tests",
    "unit testing",
    "unit tests",
    "dependency injection",
    "reactive extensions",
    "beta testing",
    "test coverage",
    "a/b testing",
    "unit",
    "react",
    "regression testing",
    "oop"
]


In [56]:
# Функция для нормализации строки
def normalize_tech_string(tech_str):
    if pd.isna(tech_str):
        return None
    try:
        tech_list = tech_str.split(',')
        clean_terms = []
        for term in tech_list:
            term_clean = term.strip().lower()
            term_renamed = rename_map.get(term_clean, term_clean)
            if term_renamed not in remove_list:
                clean_terms.append(term_renamed)
        return ', '.join(clean_terms) if clean_terms else None
    except Exception:
        return None

# Применяем к колонке
df_final_extracted['Technologies Only'] = df_final_extracted['Technologies Only'].apply(normalize_tech_string)

In [57]:
# Удалим пропуски, если есть
tech_series = df_final_extracted['Technologies Only'].dropna()

# Разделим строки по запятой, удалим лишние пробелы, соберём всё в одну серию
all_techs = tech_series.str.split(',').explode().str.strip()

# Посчитаем частоты
tech_counts = all_techs.value_counts().reset_index()



tech_counts = tech_counts[tech_counts["count"] >= 3]

tech_counts

Unnamed: 0,Technologies Only,count
0,swift,937
1,kotlin,853
2,git,658
3,java,600
4,rest api,515
...,...,...
184,material design,3
185,unity engine,3
186,tls pinning,3
187,moya,3


### Cheking

In [58]:
# Загрузка данных
with open('.\\key_values.json', 'r') as f:
    json_data = json.load(f)

# Получение списков технологий
tech_list = tech_counts['Technologies Only'].str.strip().str.lower().unique().tolist()
json_tech_list = []
for category, items in json_data.items():
    json_tech_list.extend([item.strip().lower() for item in items])

tech_set = set(tech_list)
json_set = set(json_tech_list)

# Поиск различий
only_in_tech_counts = tech_set - json_set
only_in_json = json_set - tech_set

# Вывод сводки
print(f"\n\nСводка сравнения:")
print(f"Всего технологий в tech_counts: {len(tech_set)}")
print(f"Всего технологий в JSON: {len(json_set)}")
print(f"Технологий только в tech_counts: {len(only_in_tech_counts)}")
print(f"Технологий только в JSON: {len(only_in_json)}")

if only_in_tech_counts:
    print("\nТехнологии в tech_counts, но отсутствуют в JSON:")
    for tech in sorted(only_in_tech_counts):
        print(f"- {tech}")

if only_in_json:
    print("\nТехнологии в JSON, но отсутствуют в tech_counts:")
    for tech in sorted(only_in_json):
        print(f"- {tech}")




Сводка сравнения:
Всего технологий в tech_counts: 189
Всего технологий в JSON: 189
Технологий только в tech_counts: 0
Технологий только в JSON: 0


### Deploy

In [59]:
with open('.\\key_values.json', 'r', encoding='utf-8') as f:
    kv = json.load(f)                  # categories → [tech1, tech2, …]

# -----------------------------------------------------------
# 2. Готовим «обратный» словарь: tech (lower) → category name
# -----------------------------------------------------------
reverse_map = {
    tech.lower(): category
    for category, tech_list in kv.items()
    for tech in tech_list
}

# -----------------------------------------------------------
# 3. Функция, которая строит JSON-объект для одной ячейки
# -----------------------------------------------------------
def categorize(tech_cell: str) -> dict:
    if pd.isna(tech_cell) or not tech_cell.strip():
        return {}
    
    result = {}
    # «kotlin, retrofit » → ['kotlin', 'retrofit']
    for raw in tech_cell.split(','):
        tech = raw.strip()
        if not tech:
            continue
        cat = reverse_map.get(tech.lower())   # ищем категорию
        if cat:
            # добавляем, избегая дубликатов
            result.setdefault(cat, []).append(tech)
    
    return result


In [60]:
df_final_extracted['Technologies Categorized'] = df_final_extracted['Technologies Only'].apply(categorize)


### Final variant

In [61]:
# загружаем маппинг "lower → правильный регистр"
with open(".\\map.json", 'r', encoding='utf-8') as f:
    proper_case = json.load(f)

def fix_casing(cat_dict: dict[str, list[str]]) -> dict[str, list[str]]:
    for category, tech_list in cat_dict.items():
        cat_dict[category] = [
            proper_case.get(t.lower(), t)        # если нет в маппинге — оставляем как есть
            for t in tech_list
        ]
    return cat_dict

In [62]:
df_final_extracted['Technologies Categorized'] = df_final_extracted['Technologies Categorized'].apply(fix_casing)

## Keys to columns

In [63]:
with open(".\\key_values.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# получаем список ключей
keys_list = list(data.keys())
#keys_list

In [64]:
df_final_extracted.copy()

# -------------------------------------------------------
# 2) Гарантируем, что в колонке dict, а не строка JSON
# -------------------------------------------------------
def to_dict(x):
    """str → dict  |  dict → dict  |  NaN/пусто → {}"""
    if pd.isna(x) or (isinstance(x, str) and not x.strip()):
        return {}
    if isinstance(x, str):
        return json.loads(x)
    return x

df_final_extracted['Tech_dict'] = df_final_extracted['Technologies Categorized'].apply(to_dict)

In [65]:
# -------------------------------------------------------
# 3) Находим все уникальные категории (ключи)
# -------------------------------------------------------
all_categories = set(chain.from_iterable(df_final_extracted['Tech_dict'].map(dict.keys)))

for cat in sorted(all_categories):
    df_final_extracted[cat] = df_final_extracted['Tech_dict'].apply(
        lambda d: ', '.join(d.get(cat, []))          # список → строка
                   if d.get(cat) else ''              # нет технологий → ""
    )

In [66]:
df_final_extracted

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,languages_and_runtimes,monitoring_analytics_and_crash_reporting,native_interop,networking_and_api,security_and_cryptography,testing_frameworks,ui_cross_platform,ui_guidelines,ui_native,version_control_and_branching
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Kotlin,,,Retrofit,,,,,,
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,,,,,,,,,,
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,,,,,,,,,,
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,Swift,,,,,,,,,
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,"Java, Kotlin, Objective-C, Swift",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2173,United States,Northern America,False,False,default,google.com,Android Developer (Kotlin) - W2,(Not Available),"Las Vegas, NV",Teal,...,Kotlin,,,REST API,,"Espresso, JUnit",,,Jetpack Compose,
2174,United States,Northern America,False,False,default,google.com,Android Dev,ManpowerGroup,"Richmond, VA","Jobs, EarnBetter, USNLX Ability Jobs - Nationa...",...,Kotlin,,,,,,,,,
2175,United States,Northern America,False,False,default,google.com,Android Developer,codeforce360,Illinois,"SmartRecruiters Job Search, Glassdoor, Dice, J...",...,"Java, JavaScript",Google Analytics,,,OAuth,,Apache Cordova,,,
2176,United States,Northern America,False,False,default,google.com,Mobile Android Developer - INTL - LATAM OR INDIA,Insight Global,"Pittsburgh, PA",Insight Global,...,,,,,,,,,,


In [67]:
#df_final_extracted.to_csv(".\\df_final.csv", index=False)