# Data Preparation
## Import libraries

In [37]:
from langdetect import detect, detect_langs, DetectorFactory
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import numpy as np
import openai
import json
import sys
import re
import os

from rapidfuzz import fuzz, utils
from collections import Counter
from itertools import chain


# Put the sibling ../src directory at the front of the import path so the
# project-local packages can be imported from this notebook.
src_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)

%load_ext autoreload
%autoreload 2

from jobs_tools import data_cleaning, chat_gpt, tests_helpers


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load environment variables before the clients are constructed.
# NOTE(review): no credentials are passed explicitly below — presumably the OpenAI
# clients read the API key from the environment populated here; confirm.
load_dotenv()
# Synchronous OpenAI client.
client = openai.OpenAI()
# Asynchronous OpenAI client; this is the one handed to the chat_gpt helpers below.
client_async = openai.AsyncOpenAI()

## Initial data cleaning

In [3]:
# Load the raw job postings and de-duplicate them in two passes.
# Each helper prints its own before/after row counts.

# Step 1: exact duplicate rows
jobs_df = data_cleaning.remove_exact_duplicates(
    pd.read_csv('../data/csv/jobs_data.csv')
)
# Step 2: repeated Job IDs within the same country
jobs_df_clean = data_cleaning.remove_job_id_duplicates(jobs_df)

jobs_df_clean.head()

Step 1: Removing exact duplicates
- Initial number of rows: 5834
- Duplicates removed: 0
- Remaining rows: 5834

Step 2: Removing Job ID duplicates within the same country
- Initial row count: 5834
- Duplicates removed: 2971
- Remaining rows: 2863



Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer


## Detect the language of job postings using the `langdetect` library

In [4]:
DetectorFactory.seed = 0  # Fix the langdetect seed so that repeated runs give the same detection results

def detect_language(text):
    """Return the language code that langdetect's ``detect`` assigns to ``text``.

    Best-effort: if ``detect`` raises an ordinary exception, the string
    ``"unknown"`` is returned instead of propagating the error.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately broad (one bad row must not abort the whole column), but
        # unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through
        # so a long-running cell can still be interrupted.
        return "unknown"
    
def detect_language_with_confidence(text):
    """Return langdetect's top candidate for ``text`` as a ``'lang:prob'`` string.

    Always returns a string: ``"unknown"`` is returned both when detection
    raises an ordinary exception and when ``detect_langs`` yields no candidates.
    """
    try:
        lang_probs = detect_langs(text)
        if lang_probs:
            return str(lang_probs[0])  # Format: 'en:0.99'
    except Exception:
        # Best-effort per row; KeyboardInterrupt/SystemExit are not caught here.
        pass
    # Reached on error or on an empty candidate list. The empty case used to fall
    # off the end of the function and return None instead of the sentinel string.
    return "unknown"

# Score every job description with langdetect and persist the annotated frame.
descriptions = jobs_df_clean["Job Description"]
jobs_df_clean["Language langdetect confidence"] = descriptions.apply(
    detect_language_with_confidence
)
jobs_df_clean.to_csv('../data/csv/jobs_data_langdetect.csv', index=False)

# Alternative to re-running detection: reload the saved result.
#jobs_df_clean = pd.read_csv('../data/csv/jobs_data_langdetect.csv').copy()
jobs_df_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language langdetect confidence
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en:0.9999973661975282
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en:0.999996472315817
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en:0.9999944159931299
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en:0.9999966313140446
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en:0.9999964386438764


### Split the column with "en:0.99" into "en" and "0.99"

In [5]:
# Replace the combined "Language langdetect confidence" column with separate
# "Language" and "Confidence" columns (see the output below).
# NOTE(review): jobs_df_clean is rebound to the transformed frame, so re-running this
# cell on its own presumably fails once the combined column is gone — confirm in split_values.
jobs_df_clean = data_cleaning.split_values(jobs_df_clean)
jobs_df_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Confidence
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en,0.999997
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999996
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999994
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999997
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999996


### Handling rows with confidence below 0.99

In [6]:
# Set aside the rows selected by low_confidence_data for separate review.
# NOTE(review): presumably Confidence < 0.99, per the section heading — verify in the helper.
low_confidence_df = data_cleaning.low_confidence_data(jobs_df_clean)
# Remove those rows from the main frame by index label; the reviewed subset is
# concatenated back in the "Merge back" cell.
jobs_df_clean = jobs_df_clean.drop(low_confidence_df.index)

#### ChatGPT detects language

In [7]:
# Ask the model to label the language of each low-confidence job description; the
# answer is stored in the "Language gpt-4o-2024-11-20" column of a copy of the subset.
# NOTE(review): the prompt asks for an "ISO country code" (and contains the typo "ect.")
# although a language code is what is wanted and what the model returns in the output
# below. The text is left unchanged here because editing the prompt may invalidate the
# cache file or change the responses — confirm before rewording.
df_result = await chat_gpt.chatgpt_async(
    input_column_name="Job Description", 
    output_column_name="Language gpt-4o-2024-11-20",
    input_text_length=None,
    output_text_length=1,
    num_rows=None,  
    df=low_confidence_df.copy(), 
    user_prompt="Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.). Text:",
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=10,
    concurrency_limit=10,
    cache_file="../data/cache/language_cache_gpt-4o-2024-11-20.json"
)
df_result.head()

Processing Batches: 100%|██████████| 2/2 [00:00<00:00, 42.26it/s]


Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Confidence,Language gpt-4o-2024-11-20
1614,Poland,Europe,True,True,default,google.com,Senior Android Developer,FREAM S. A.,"Wrocław, Poland","The:Protocol, No Fluff Jobs, Capgemini Careers...",...,,,Contractor,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgQW5kcm9pZCBEZXZlbG...,2025-01-13 12:11:12 UTC,Android developer,nl,0.571425,en
528,Czechia,Europe,True,True,default,google.com,Android Developer (part-time: 2h/week) @ Exper...,Experis Polska,Anywhere,"Jooble, Jobs Trabajo.org",...,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciAocG...,2025-01-13 12:09:27 UTC,Android developer,en,0.571427,cs
434,Cyprus,Europe,False,False,default,google.com,iOS Applications Developer,MetaQuotes,"Limassol, Cyprus","MetaQuotes, Evoplay, Lever, Хабр Карьера - Hab...",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgQXBwbGljYXRpb25zIERldm...,2025-01-13 12:21:15 UTC,iOS developer,it,0.714282,en
780,Germany,Europe,True,True,default,google.com,JUNIOR IOS DEVELOPER,Check24,Germany,Layboard,...,,€4.5K a month,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJKVU5JT1IgSU9TIERFVkVMT1BFUi...,2025-01-13 12:22:03 UTC,iOS developer,en,0.714283,ru
481,Czechia,Europe,True,True,default,google.com,iOS developer - Ostrava,confidential,"Ostrava, Czechia",Jooble,...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgZGV2ZWxvcGVyIC0gT3N0cm...,2025-01-13 12:21:22 UTC,iOS developer,no,0.714283,cs


#### Manual check

In [8]:
# Manual verdict for each low-confidence row: a language code to keep, or "drop" to discard.
# NOTE: the list is positional — it must line up with the current row order of df_result.
# pandas raises ValueError if the number of entries differs from the number of rows.
df_result['Manual check'] = ["drop", "cs", "drop", "ru", "drop", "fr", "en", "pl", "en", "drop", "fr", "en", "pl", "sv", "zh", "fr"]
# Derive the flag from the verdicts instead of keeping a second hand-written list
# that has to be edited in lockstep with the one above.
df_result['Drop'] = df_result['Manual check'].eq("drop")
# .copy() makes the kept rows an independent frame rather than a slice of the
# original, so later column assignments do not act on a view.
df_result = df_result.loc[~df_result['Drop']].copy()

df_result = data_cleaning.clean_data(df_result)
df_result.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Confidence
528,Czechia,Europe,True,True,default,google.com,Android Developer (part-time: 2h/week) @ Exper...,Experis Polska,Anywhere,"Jooble, Jobs Trabajo.org",O pozici / o projektu\r\nPůvodní popisek. Andr...,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciAocG...,2025-01-13 12:09:27 UTC,Android developer,cs,0.571427
780,Germany,Europe,True,True,default,google.com,JUNIOR IOS DEVELOPER,Check24,Germany,Layboard,Требования 1-2 years of experience with iOS de...,,€4.5K a month,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJKVU5JT1IgSU9TIERFVkVMT1BFUi...,2025-01-13 12:22:03 UTC,iOS developer,ru,0.714283
325,Canada,Northern America,False,False,default,google.com,"Développeur senior, Android / Senior Android D...",Cerence,"Montreal, Quebec, Canada","Indeed, Built In, Eluta.ca, Glassdoor, Adzuna,...",A Moving Experience.\r\n\r\n(English version b...,,,Full-time,,eyJqb2JfdGl0bGUiOiJEw6l2ZWxvcHBldXIgc2VuaW9yLC...,2025-01-13 12:09:10 UTC,Android developer,fr,0.714285
2105,Sweden,Europe,True,True,default,google.com,Konsultuppdrag Ios and Android Developers - Of...,Senterprise,ستوكهولم، السويد,Emprego.pt,To one of our clients we are now looking for 1...,,,دوام كامل,,eyJqb2JfdGl0bGUiOiJLb25zdWx0dXBwZHJhZyBJb3MgYW...,2025-01-13 12:11:47 UTC,Android developer,en,0.857138
1591,Poland,Europe,True,True,default,google.com,Android Developer,ALAN Systems,"Silesian Voivodeship, Poland",JobLeads,"Cześć odkrywco kodu!\r\n\r\nCzy jesteś gotowy,...",,PLN 180K–PLN 240K a year,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:11:12 UTC,Android developer,pl,0.857139


#### Merge back

In [9]:
# Re-attach the manually reviewed rows, restore ordering by country, and drop the
# now-unneeded confidence score.
jobs_df_clean = (
    pd.concat([jobs_df_clean, df_result], ignore_index=True)
    .sort_values(['Location'])
    .reset_index(drop=True)
    .drop(columns=['Confidence'])
)

jobs_df_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",Lead Mobile Developer | Hyper Growth Startup |...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJNb2JpbGUgQXBwbGljYXRpb24gRG...,2025-01-13 12:08:51 UTC,Android developer,en
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,Overview:\r\nThe Android Developer – Kotlin po...,,,Contractor,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciDigJ...,2025-01-13 12:08:48 UTC,Android developer,en
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,Was sind deine Aufgaben?\r\n• Du gestaltest at...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",Do you want to push the frontier of digital se...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIFVwMyAoZi...,2025-01-13 12:20:39 UTC,iOS developer,en


## Translate job descriptions into English

Original:
This is a [SRC] to [TGT]
translation, please provide
the [TGT] translation for these
sentences:


My version:
This is a [SRC] to [TGT]
translation, please provide
the [TGT] translation for this
job description:

In [10]:
with open('../data/json/language_map.json', 'r') as f:
    language_mapping = json.load(f)

df_translated = await chat_gpt.translate_non_english_descriptions(
    df=jobs_df_clean.copy(),
    language_col="Language",
    job_desc_col="Job Description",
    translated_col="Job Description English",
    client=client_async,
    gpt_model="gpt-4o-2024-11-20",
    batch_size=10,
    concurrency_limit=10,
    language_mapping=language_mapping,
    cache_file="../data/cache/job_description_english_cache_V2_gpt-4o-2024-11-20.json"
)

Processing Batches: 100%|██████████| 42/42 [00:01<00:00, 40.62it/s]


### Drop too short job descriptions

In [11]:
# Keep only descriptions with at least 14 whitespace-separated words.
# Rows with a missing description fail the comparison and are dropped as well.
word_counts = df_translated['Job Description English'].str.split().str.len()
df_translated = df_translated.loc[word_counts >= 14].reset_index(drop=True)
#df_translated.to_csv('../data/csv/df_translated.csv', index=False)

## Split job descriptions into structured sections

### Comparison of ChatGPT's Extraction vs. Ground Truth

Average Token Set Ratio per section:
- Platform: 98.33
- Salary: 97.35
- Requirements: 98.61
- Nice to have: 93.38
- Responsibilities: 90.88
- Benefits: 91.85
- Overall Average Token Set Ratio: 95.07

Tests: [link](./tests.ipynb#split-job-descriptions-into-structured-sections)

### Full Extraction 

In [12]:
# System message: fixes the assistant's role (strictly structured extraction).
system_prompt = """
You are an AI assistant. Your role is to extract specific information from job descriptions and format them in a strict structure.
"""

# User message: the six numbered sections requested here ("1. Platform" ... "6. Benefits")
# are the headings the next cell parses back out of the response.
# NOTE(review): the prompt text contains a typo ("describs"). It is left untouched because
# it is sent to the model verbatim, and changing it may alter responses or invalidate the cache.
user_prompt = """
I will provide a job description. Please extract and present the information in **this exact order**:

1. Platform: (Android/iOS/Cross-platform)
2. Salary: (If stated; otherwise "Not mentioned")
3. Requirements: (verbatim from the job description or "Not mentioned")
4. Nice to have: (verbatim or "Not mentioned")
5. Responsibilities: (verbatim or "Not mentioned")
6. Benefits: (verbatim or "Not mentioned")

**Guidelines**:
- **DO NOT reword, paraphrase, or summarize** any part of the job description. Copy the sentences exactly as they appear.
- Combine all mandatory or required skill sections (e.g., "Requirements," "Skills," "Key Technologies," "About You") under **Requirements**.
- If the job description specifically says something is "a plus," "beneficial," or otherwise indicates it’s optional, place it under "Nice to have" even if it appears under a "Requirements" heading in the job description.
- If there is an "About you" or "About Role" section (or similar) that describs duties or tasks, include those under "Responsibilities".
- If the information is not in the job description, write "Not mentioned" for that section.
- For multiple platforms (e.g., Android, iOS), list them all in **Platform** and use headings under Requirements (and other sections, if needed) like "General Requirements:", "For Android Developers:", "For iOS Developers:".
- Present your answer **only** in the format above.

---
Here is the job description:
"""

# Run the extraction over the English descriptions (on a copy of df_translated);
# the raw structured answer is stored in "Job Description Extracted".
df_split = await chat_gpt.chatgpt_async(
    input_column_name="Job Description English", 
    output_column_name="Job Description Extracted",
    input_text_length=None,
    output_text_length=None,
    num_rows=None,  
    df=df_translated.copy(), 
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=30,
    concurrency_limit=35,
    cache_file="../data/cache/job_description_extracted_cache_gpt-4o-2024-11-20.json"
)
df_split.head()

Processing Batches: 100%|██████████| 95/95 [00:12<00:00,  7.39it/s]


Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Job Description English,Job Description Extracted
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en,We are a dynamic FinTech company headquartered...,"1. Platform: Android, iOS, Cross-platform \n2..."
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJNb2JpbGUgQXBwbGljYXRpb24gRG...,2025-01-13 12:08:51 UTC,Android developer,en,Lead Mobile Developer | Hyper Growth Startup |...,1. Platform: Cross-platform \n2. Salary: $160...
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,,,Contractor,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciDigJ...,2025-01-13 12:08:48 UTC,Android developer,en,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de,**What are your tasks?** \n- You design attra...,1. Platform: Android/iOS \n2. Salary: Not men...
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIFVwMyAoZi...,2025-01-13 12:20:39 UTC,iOS developer,en,Do you want to push the frontier of digital se...,1. Platform: iOS \n2. Salary: The gross annua...


## Extracting sections into columns

In [13]:
df_sections = df_split.copy()

# Output column -> heading pattern of the numbered section in the model's answer.
# Dict order determines the order in which the new columns are appended.
section_patterns = {
    "Platform": r"1\. Platform",
    "Salary_E": r"2\. Salary",
    "Requirements": r"3\. Requirements",
    "Nice to have": r"4\. Nice to have",
    "Responsibilities": r"5\. Responsibilities",
    "Benefits": r"6\. Benefits",
}

extracted_answers = df_sections["Job Description Extracted"]
for column_name, heading_pattern in section_patterns.items():
    # Bind the pattern as a default argument so each lambda keeps its own value.
    df_sections[column_name] = extracted_answers.apply(
        lambda x, pattern=heading_pattern: tests_helpers.extract_section(x, pattern)
    )

df_sections.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Search Query,Language,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,iOS developer,en,We are a dynamic FinTech company headquartered...,"1. Platform: Android, iOS, Cross-platform \n2...","Android, iOS, Cross-platform",Not mentioned,- Educational Background: Bachelor's degree in...,- Flutter experience is a plus. \n - Progra...,"- Develop, test, and deploy high quality mobil...",- Young & dynamic workplace & culture (with of...
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",...,Android developer,en,Lead Mobile Developer | Hyper Growth Startup |...,1. Platform: Cross-platform \n2. Salary: $160...,Cross-platform,"$160,000 + Super",Not mentioned,Not mentioned,Not mentioned,Not mentioned
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Android developer,en,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,...,iOS developer,de,**What are your tasks?** \n- You design attra...,1. Platform: Android/iOS \n2. Salary: Not men...,Android/iOS,Not mentioned,- A completed IT education (HTL/FH/University)...,- Professional experience in developing native...,- You design attractive modules and software p...,"- We are a stable, owner-managed company with ..."
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",...,iOS developer,en,Do you want to push the frontier of digital se...,1. Platform: iOS \n2. Salary: The gross annua...,iOS,The gross annual salary according to the colle...,• At least 3 years of experience as iOS develo...,Not mentioned,• Develop the iOS app of our fully digital pro...,• Top mobile phone of your choice incl. employ...


## Keep only iOS and Android vacancies 

In [14]:
# Frequency table of the extracted platform labels, most common first.
platform_counts = (
    df_sections["Platform"]
    .value_counts()
    .rename_axis('Platform')
    .reset_index(name='Count')
)
print(platform_counts)

                        Platform  Count
0                            iOS   1094
1                        Android   1080
2                   Android, iOS    156
3   Android, iOS, Cross-platform    109
4                  Not mentioned     77
..                           ...    ...
67   iOS, Android, Windows Phone      1
68        Android/Cross-platform      1
69       Android, iOS, Web-based      1
70               Android, Huawei      1
71           Android, macOS, iOS      1

[72 rows x 2 columns]


In [15]:
# Inspect the vacancies labelled with an Apple-only platform combination.
df_sections[df_sections['Platform'].isin(['iOS/macOS', 'iOS, tvOS'])]

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Search Query,Language,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
324,Canada,Northern America,False,False,default,google.com,iOS Developer (Remote),McAfee,"Waterloo, ON, Canada",Blind,...,iOS developer,en,Role OverviewMcAfee is searching for an interm...,1. Platform: iOS/macOS \n2. Salary: Not menti...,iOS/macOS,Not mentioned,- You can develop iOS applications at an inter...,- Diving deep into lower-level libraries like ...,- Dive deep into anti-censorship technologies ...,Not mentioned
385,Canada,Northern America,False,False,default,google.com,"Software Developer in Test, Creativity Apps",Apple,"Vancouver, BC, Canada",Careers At Apple,...,iOS developer,en,"Summary\r\nPosted: Sep 4, 2024\r\n\r\nRole Num...","1. Platform: iOS/macOS \n2. Salary: $113,400 ...",iOS/macOS,"$113,400 and $215,300",• 5+ years experience in QA/QE \n • Minimum...,• 2+ years of Full-stack Software Developer in...,"In this role, you will be responsible for plan...","• At Apple, base pay is one part of our total ..."
534,Czechia,Europe,True,True,default,google.com,iOS Developer,Sledovanitv.cz,"Brno, Czechia",Indeed.cz,...,iOS developer,cs,We are SledovaniTV.cz - the most technological...,"1. Platform: iOS, tvOS \n2. Salary: Not menti...","iOS, tvOS",Not mentioned,- Advanced knowledge of iOS development (exper...,- Your own project to showcase,- Have the opportunity to dive into our iOS an...,"- The opportunity to work in a young, inspirin..."
1273,Mexico,Northern America,False,False,default,google.com,Senior Ios Developer,Bhuvi It Solutions,"Guadalajara, Jalisco, Mexico","BeBee, Trabajo.org - Vacantes De Empleo, Traba...",...,iOS developer,en,Job Title: iOS Developer\r\n\r\nWe are seeking...,"1. Platform: iOS, tvOS \n2. Salary: $120,000 ...","iOS, tvOS","$120,000 - $180,000 per year.",• 4+ years of professional software developmen...,Not mentioned,"• Produce a reliable, performant, configurable...",• TN Visa Sponsorship. \n • USDPay. \n •...


### Rename rows with 'iOS/macOS' or 'iOS, tvOS' to iOS

In [16]:
# Replace the first two rows with 'iOS, tvOS'
df_sections.loc[df_sections[df_sections['Platform'] == 'iOS, tvOS'].iloc[:2].index, 'Platform'] = 'iOS'
# Replace the first two rows with 'iOS/macOS'
df_sections.loc[df_sections[df_sections['Platform'] == 'iOS/macOS'].iloc[:2].index, 'Platform'] = 'iOS'

### Keep only iOS and Android vacancies

In [17]:
# Keep only single-platform vacancies (exactly 'iOS' or exactly 'Android'),
# ordered by country with a fresh 0..n-1 index. df_sections itself is left untouched.
is_single_platform = df_sections['Platform'].isin(['iOS', 'Android'])
df_filtered = (
    df_sections.loc[is_single_platform]
    .sort_values(by='Location')
    .reset_index(drop=True)
)
df_filtered.head()
#df_filtered.to_csv('../data/csv/df_filtered.csv', index=False)

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Search Query,Language,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Android developer,en,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,iOS developer,de,**iOS Developer 80–100% w/m/d**\n\n**Job Descr...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera..."
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,iOS developer,en,Looking for a iOS Developer. Playing well in a...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,iOS developer,de,**Your Role in the Team** \n- You contribute ...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,iOS developer,en,We are searching for iOS Software Engineers wi...,1. Platform: iOS \n2. Salary: Depending on qu...,iOS,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...


## Extract technologies and tools
### Comparison of ChatGPT's Extraction vs. Ground Truth

Average Token Set Ratio : 96.73

Tests: [link](./tests.ipynb#extract-technologies-and-tools)

### Merge 3 columns into 1

In [18]:
df_extracted = df_filtered.copy()

# Stringify the three free-text sections (astype(str) also turns missing
# values into the literal text "nan").
requirements_text = df_extracted["Requirements"].astype(str)
nice_to_have_text = df_extracted["Nice to have"].astype(str)
responsibilities_text = df_extracted["Responsibilities"].astype(str)

# Glue them into one numbered document per row, sections separated by a blank line.
df_extracted["Full Requirements"] = (
    "3. Requirements:\n" + requirements_text + "\n\n"
    + "4. Nice to have:\n" + nice_to_have_text + "\n\n"
    + "5. Responsibilities:\n" + responsibilities_text
)
df_extracted.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Language,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,en,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned,3. Requirements:\n• Passionate about mobile pl...
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,de,**iOS Developer 80–100% w/m/d**\n\n**Job Descr...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera...",3. Requirements:\n- **Motivation over experien...
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,en,Looking for a iOS Developer. Playing well in a...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nPlaying well in a team and h...
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,de,**Your Role in the Team** \n- You contribute ...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...,3. Requirements:\n- You are familiar with Cont...
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,en,We are searching for iOS Software Engineers wi...,1. Platform: iOS \n2. Salary: Depending on qu...,iOS,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...,3. Requirements:\n• Worked on at least one nat...


### Full extraction 

In [19]:
# System message for the technology-extraction request: it asks the model for a
# single raw JSON object that assigns every technology term found in the input
# text to one category key from ALLOWED_KEYS.
# NOTE(review): the prompt text is not fully self-consistent:
#   - "CocoaPods" is listed twice under "build_and_dependency_management";
#   - "CircleCI"/"Bamboo" and "Veracode"/"Checkmarx" each appear under two
#     different keys, although the GOAL asks for exactly ONE key per term;
#   - EXAMPLE RESPONSE shows ["Swift","Java"], which is not alphabetical
#     (rule 3), and the example is multi-line rather than minified.
# The string is left untouched here because any edit changes what is sent to
# the model (and may affect cached responses — TODO confirm how the cache is keyed).
system_prompt_1 = """
You are a senior extraction assistant.

GOAL
Given a natural-language job-vacancy text, return one and only one
valid, minified JSON object that maps every technology term found in
the input to exactly ONE key from ALLOWED_KEYS.

STRICT RULES
1. Output = raw JSON only (no Markdown, no comments, no extra text).
2. Use keys from ALLOWED_KEYS exactly as written. Omit any key whose array would be empty.
3. Each value is an **array of unique strings**, sorted alphabetically and preserving the term’s original spelling/case from the input.
4. Ignore soft skills and vague nouns (e.g. communication, documentation, performance, detail, English, collaboration, problem-solving, university).
5. **IGNORE** overly generic technology words/phrases (e.g. "design patterns", "android ui", "json", "xml", "clean code", "API integration", "unit testing", "Continuous integration", "CI", "CD", "CI/CD", "cryptography", "application testing" ect.).
6. DO NOT hallucinate. Include a term **only** if it appears verbatim in the input text.
7. If a technology term matches **exactly** (case-insensitive) one of the values in ALLOWED_KEYS, then it MUST be assigned to that exact key — even if the name could hypothetically fit other categories.
8. The order of keys in the output JSON MUST strictly follow the order of keys as they appear in the ALLOWED_KEYS section below. Only include keys that have at least one matched term. If a key has no matched terms, it MUST be completely omitted from the output, even if this breaks the visual continuity of the ALLOWED_KEYS order.


ALLOWED_KEYS (with examples)
{
    "languages_and_runtimes": ["Swift", "Kotlin", "Java", "Dart", "Objective-C", "Coroutines", "RxSwift", "Combine", "GCD", "RxJava", "Flows", "JavaScript", "TypeScript", "C", "C++", "Python", "Golang", "PHP"],
    "ui_and_cross_platform_frameworks": ["SwiftUI", "UIKit", "Jetpack Compose", "Flutter", "React Native", "Xamarin", "Ionic", "WatchKit", "Cocoa Touch", "PhoneGap", "Cordova", "Kotlin Multiplatform"],
    "architectural_patterns": ["MVVM", "VIPER", "Clean Architecture", "MVI", "MVC", "MVP", "Redux"],
    "dependency_injection_frameworks": ["Dagger", "Hilt", "Koin", "Swinject"],
    "build_and_dependency_management": ["Gradle", "CocoaPods", "SPM", "Bazel", "Buck", "Xcode", "Android Studio", "CircleCI", "Bamboo", "CocoaPods"],
    "data_and_caching": ["Core Data", "Room", "Realm", "SQLite", "Firestore", "MongoDB", "SAP UltraLite", "MySQL", "NoSQL"],
    "networking_and_api": ["Retrofit", "OkHttp", "URLSession", "Alamofire", "GraphQL", "REST API", "WebSockets", "gRPC", "Protocol Buffers"],
    "backend_or_baas_integration": ["Firebase", "AWS Amplify", "Azure Mobile", "Parse", "AWS Mobile Hub", "AWS Cognito", "AWS S3"],
    "device_and_platform_services_and_third_party_sdks": ["ARKit", "HealthKit", "CoreML", "Core Animation", "Android SDK", "Android NDK", "Push Notifications", "BLE", "NFC", "Camera", "Location", "Sensors", "Stripe SDK", "Facebook SDK", "AdMob", "Google Maps", "ExoPlayer", "Glide", "Stripe", "PayPal SDK", "Binder", "AIDL", "JNI", "CTS", "HAL", "LoRa", "CarPlay", "Android Auto", "CydiaSubstrate", "Frida", "WebViews"],
    "security_and_cryptography": ["Keychain", "TLS pinning", "OAuth2", "Veracode", "Checkmarx", "OWASP"],
    "testing_frameworks": ["XCTest", "JUnit", "Espresso", "Mockito", "Robolectric"],
    "debugging_or_profiling": ["Instruments", "Android Profiler"],
    "version_control": ["Git", "GIT", "git", "SVN", "Mercurial", "Gerrit", "GitFlow", "SourceTree", "Fork", "Bitbucket"],
    "ci_cd_and_release_automation": ["GitHub Actions", "Jenkins", "Bitrise", "fastlane", "CircleCI", "Bamboo", "GitLab CI", "Docker"],
    "monitoring_analytics_and_crash_reporting": ["Crashlytics", "Sentry", "Datadog", "Firebase Analytics", "App Center"],
    "development_methodologies": ["Scrum", "Kanban", "Agile", "SAFe", "TDD", "BDD", "DevOps"],
    "testing_process_and_qa": ["test coverage", "regression testing"],
    "code_quality_and_static_analysis": ["SonarQube", "SwiftLint", "Veracode", "Checkmarx"],
    "documentation_and_knowledge_sharing": ["Swagger", "OpenAPI", "Javadoc", "Confluence", "HIG"],
    "collaboration_pm_and_design_handoff": ["Jira", "Trello", "Figma", "Zeplin", "Rally/AgileCentral"],
    "distribution_and_store_operations": ["TestFlight", "App Store Connect", "Google Play Console"],
    "compliance_and_certifications": ["SOC 2", "GDPR"]
}

EXAMPLE RESPONSE
{
    "languages_and_runtimes":["Swift","Java"],
    "version_control":["Git"]
}
"""


# User message template: restates the task and ends with an "INPUT TEXT:" marker.
# Presumably the helper appends each row's text after that marker — TODO confirm
# against chat_gpt.chatgpt_async.
user_prompt_1 = """
Extract every technology, tool, framework, library, service or formal
methodology mentioned in the text below and output the JSON exactly as
specified in the system prompt.

---
INPUT TEXT:
"""

df_extracted = await chat_gpt.chatgpt_async(
    input_column_name="Full Requirements", 
    output_column_name="Extracted Technologies GPT",
    input_text_length=None,
    output_text_length=None,
    num_rows=None,  
    df=df_extracted.copy(), 
    system_prompt=system_prompt_1,
    user_prompt=user_prompt_1,
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=25,
    concurrency_limit=25,
    cache_file= "../data/cache/Full_requirements_cache.json"
)
df_extracted.head()

Processing Batches: 100%|██████████| 88/88 [00:03<00:00, 24.59it/s]


Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Job Description English,Job Description Extracted,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Extracted Technologies GPT
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Overview:\r\nThe Android Developer – Kotlin po...,1. Platform: Android \n2. Salary: Not mention...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned,3. Requirements:\n• Passionate about mobile pl...,"{\n ""languages_and_runtimes"":[""Kotlin"",""Cor..."
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,**iOS Developer 80–100% w/m/d**\n\n**Job Descr...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera...",3. Requirements:\n- **Motivation over experien...,{}
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,Looking for a iOS Developer. Playing well in a...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nPlaying well in a team and h...,{}
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,**Your Role in the Team** \n- You contribute ...,1. Platform: iOS \n2. Salary: Not mentioned ...,iOS,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...,3. Requirements:\n- You are familiar with Cont...,"{\n ""languages_and_runtimes"":[""Swift""],\n ..."
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,We are searching for iOS Software Engineers wi...,1. Platform: iOS \n2. Salary: Depending on qu...,iOS,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...,3. Requirements:\n• Worked on at least one nat...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli..."


## Remove hallucinated words

In [20]:
# Row-wise clean-up of the GPT extraction (axis=1 hands the whole row to the helper).
# NOTE(review): presumably the helper drops terms that do not occur in the row's
# own text and records them in the module-level counters read below — confirm
# in data_cleaning.remove_hallucinated.
df_extracted["Extracted Technologies Clean"] = df_extracted.apply(
    data_cleaning.remove_hallucinated, axis=1
)


# Print both counters as tables, most frequent term first.
report_sections = (
    ("\n== Hallucinations removed ==",
     data_cleaning.removed_counter,
     ["hallucinated_term", "removed_count"]),
    ("\n== The terms are canonized ==",
     data_cleaning.canonicalized_counter,
     ["canonical_term", "replaced_count"]),
)
for section_title, term_counter, table_columns in report_sections:
    print(section_title)
    counter_table = pd.DataFrame(term_counter.items(), columns=table_columns)
    counter_table = counter_table.sort_values(table_columns[1], ascending=False)
    print(counter_table.reset_index(drop=True))

# Overall totals
total_removed = sum(data_cleaning.removed_counter.values())
total_canonicalized = sum(data_cleaning.canonicalized_counter.values())
print(f"\nTotal rows processed: {len(df_extracted)}")
print(f"Total hallucinated terms removed: {total_removed}")
print(f"Total terms canonicalized: {total_canonicalized}")


== Hallucinations removed ==
                          hallucinated_term  removed_count
0                                       git             64
1                               android sdk             55
2                                       hig             37
3                                     swift             27
4                            android studio             17
5                                     agile             17
6                                      mvvm             16
7                                    binder             13
8                                      java             11
9                                    dagger              9
10                       clean architecture              7
11                         android profiler              7
12                                   oauth2              7
13                              google maps              6
14                                      tdd              6
15                        

### Inspect exact rows with removed term

In [21]:
# Flatten the raw and the cleaned extraction of every row, then subtract them
# element-wise to get the terms that the clean-up step removed.
# NOTE(review): assumes flat_terms returns a set-like object per row so that
# "-" is a set difference — confirm in data_cleaning.flat_terms.
orig_sets = df_extracted["Extracted Technologies GPT"].apply(data_cleaning.flat_terms)
clean_sets = df_extracted["Extracted Technologies Clean"].apply(data_cleaning.flat_terms)
df_extracted["removed_terms"] = orig_sets.sub(clean_sets)


term = "git"                       # or "swift", "android sdk", …
needle = term.lower()

# Rows in which the chosen term was removed, limited to the columns worth reading.
view_cols = [
    "Full Requirements",
    "Extracted Technologies GPT",
    "Extracted Technologies Clean",
]
mask = df_extracted["removed_terms"].apply(lambda removed: needle in removed)
df_debug = df_extracted.loc[mask, view_cols]

print(f"Rows with removed '{term}': {len(df_debug)}")
df_debug.head(5)                 # first 5 matching rows


Rows with removed 'git': 64


Unnamed: 0,Full Requirements,Extracted Technologies GPT,Extracted Technologies Clean
60,3. Requirements:\n- Several years of professio...,"{\n ""languages_and_runtimes"":[""C#"",""Java"",""...","{""languages_and_runtimes"": [""c#"", ""java"", ""kot..."
113,3. Requirements:\n• Bachelor’s degree in Compu...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin""],..."
126,3. Requirements:\n• Experience developing and ...,"{\n ""languages_and_runtimes"":[""JavaScript"",...","{""languages_and_runtimes"": [""javascript"", ""swi..."
176,3. Requirements:\n• Minimum of 3+ years of pos...,"{\n ""languages_and_runtimes"":[""C++"",""Go"",""J...","{""languages_and_runtimes"": [""c++"", ""golang"", ""..."
217,3. Requirements:\n• At least 5+ years of Andro...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin""],..."


In [22]:
# Drop the temporary 'removed_terms' debugging column again.
# errors='ignore' keeps this cell idempotent: re-running it (or running it
# without the inspection cell) no longer raises KeyError when the column is absent.
df_extracted = df_extracted.drop(columns=['removed_terms'], errors='ignore')

## Cleaning and Normalization

In [41]:
df_clean = df_extracted.copy()

# Cleaned extraction -> lower-cased string (via extract_values) -> normalised
# technology string, written to the column in a single assignment.
df_clean['Technologies Only'] = (
    df_clean['Extracted Technologies Clean']
    .apply(data_cleaning.extract_values)
    .str.lower()
    .apply(data_cleaning.normalize_tech_string)
)
df_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Platform,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Extracted Technologies GPT,Extracted Technologies Clean,Technologies Only
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Android,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned,3. Requirements:\n• Passionate about mobile pl...,"{\n ""languages_and_runtimes"":[""Kotlin"",""Cor...","{""languages_and_runtimes"": [""kotlin"", ""kotlin ...","kotlin, kotlin coroutines, mvvm, hilt, retrofi..."
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,iOS,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera...",3. Requirements:\n- **Motivation over experien...,{},{},
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,iOS,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nPlaying well in a team and h...,{},{},
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,iOS,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...,3. Requirements:\n- You are familiar with Cont...,"{\n ""languages_and_runtimes"":[""Swift""],\n ...","{""languages_and_runtimes"": [""swift""], ""build_a...","swift, gradle, jenkins"
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,iOS,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...,3. Requirements:\n• Worked on at least one nat...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin"", ...","java, kotlin, objective-c, swift, xcode, scrum"


## Filter the data?

In [42]:
# Remove missing values, if any
tech_series = df_clean['Technologies Only'].dropna()
# Split strings by comma, remove extra spaces, combine all into one Series
all_techs = tech_series.str.split(',').explode().str.strip()
# Count frequencies
tech_counts = all_techs.value_counts().reset_index()
# Keep only technologies that appear at least twice
tech_counts = tech_counts[tech_counts["count"] >= 2]

tech_counts.head(10)

Unnamed: 0,Technologies Only,count
0,swift,914
1,kotlin,853
2,git,644
3,java,600
4,rest api,518
5,agile,476
6,mvvm,452
7,objective-c,432
8,swiftui,374
9,android sdk,292


### Check for differences between "tech_counts" and "key_values.json"
"key_values.json" contains all technologies from "tech_counts". <br>
"key_values.json": [Link](../data/json/key_values.json)

In [43]:
# Compare the technologies counted in tech_counts with those listed in key_values.json.
tests_helpers.finding_defferences(
    df=tech_counts,
    column='Technologies Only',
    json_path='../data/json/key_values.json',
)



Comparison summary:
Total technologies in tech_counts: 212
Total technologies in key_values: 212
Technologies only in tech_counts: 0
Technologies only in key_values: 0


### "Technologies Only" -> JSON object

In [44]:
# Turn each row's technology string into a category -> terms mapping.
technologies_by_category = df_clean['Technologies Only'].apply(data_cleaning.categorize)
df_clean['Technologies Categorized'] = technologies_by_category
df_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Extracted Technologies GPT,Extracted Technologies Clean,Technologies Only,Technologies Categorized
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned,3. Requirements:\n• Passionate about mobile pl...,"{\n ""languages_and_runtimes"":[""Kotlin"",""Cor...","{""languages_and_runtimes"": [""kotlin"", ""kotlin ...","kotlin, kotlin coroutines, mvvm, hilt, retrofi...","{'languages': ['kotlin'], 'concurrency_reactiv..."
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera...",3. Requirements:\n- **Motivation over experien...,{},{},,{}
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nPlaying well in a team and h...,{},{},,{}
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...,3. Requirements:\n- You are familiar with Cont...,"{\n ""languages_and_runtimes"":[""Swift""],\n ...","{""languages_and_runtimes"": [""swift""], ""build_a...","swift, gradle, jenkins","{'languages': ['swift'], 'build_dependency': [..."
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...,3. Requirements:\n• Worked on at least one nat...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin"", ...","java, kotlin, objective-c, swift, xcode, scrum","{'languages': ['java', 'kotlin', 'objective-c'..."


### Check for differences between "key_values.json" and "map.json"
"map.json" contains all technologies from "key_values" but in canonical form.<br>
"map.json": [Link](../data/json/map.json)<br>
Examples:
- git -> Git
- aws -> Amazon Web Services (AWS)

In [45]:
# Cross-check that map.json and key_values.json cover the same technologies.
tests_helpers.finding_defferences_1(
    key_values_path=Path("../data/json/key_values.json"),
    map_path=Path("../data/json/map.json"),
)

In map but not in key_values: 0

In key_values but not in map: 0

Everything is distributed correctly (no differences).


### Mapping

In [53]:
df_final = df_clean.copy()

# Rewrite every categorised term in its canonical spelling (e.g. 'kotlin' -> 'Kotlin').
# NOTE(review): DataFrame.copy() does not recursively copy the objects stored in
# cells, so if fix_casing mutates its argument in place, df_clean changes too — confirm.
canonical_categorized = df_final['Technologies Categorized'].apply(data_cleaning.fix_casing)
df_final['Technologies Categorized'] = canonical_categorized
df_final.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Salary_E,Requirements,Nice to have,Responsibilities,Benefits,Full Requirements,Extracted Technologies GPT,Extracted Technologies Clean,Technologies Only,Technologies Categorized
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,Not mentioned,• Passionate about mobile platforms and transl...,Not mentioned,• Ensure that the app meets our quality standa...,Not mentioned,3. Requirements:\n• Passionate about mobile pl...,"{\n ""languages_and_runtimes"":[""Kotlin"",""Cor...","{""languages_and_runtimes"": [""kotlin"", ""kotlin ...","kotlin, kotlin coroutines, mvvm, hilt, retrofi...","{'languages': ['Kotlin'], 'concurrency_reactiv..."
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,Not mentioned,"- **Motivation over experience:** Curiosity, i...",Not mentioned,- You will work with us on exciting projects f...,"- A compact, battle-tested team and flat hiera...",3. Requirements:\n- **Motivation over experien...,{},{},,{}
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,Not mentioned,Playing well in a team and has strong analytic...,Not mentioned,Not mentioned,Not mentioned,3. Requirements:\nPlaying well in a team and h...,{},{},,{}
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,Not mentioned,- You are familiar with Continuous Integration...,Not mentioned,- You contribute to the further development of...,- Home Office \n - Flexible working hours ...,3. Requirements:\n- You are familiar with Cont...,"{\n ""languages_and_runtimes"":[""Swift""],\n ...","{""languages_and_runtimes"": [""swift""], ""build_a...","swift, gradle, jenkins","{'languages': ['Swift'], 'build_dependency': [..."
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,Depending on qualifications and professional e...,• Worked on at least one native Swift applicat...,Not mentioned,• In this position you will be part of one or ...,• You will be part of a company with an inspir...,3. Requirements:\n• Worked on at least one nat...,"{\n ""languages_and_runtimes"":[""Java"",""Kotli...","{""languages_and_runtimes"": [""java"", ""kotlin"", ...","java, kotlin, objective-c, swift, xcode, scrum","{'languages': ['Java', 'Kotlin', 'Objective-C'..."


## To CSV
### Keys to columns

In [54]:
# Expand the categorised technologies into one column per key from key_values.json.
df_final = data_cleaning.keys_to_columns(
    df=df_final,
    key_values_path="../data/json/key_values.json",
)
df_final.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,native_interop,native_ui,networking,platform_android,platform_apple,runtimes,security_compliance,team_collaboration,testing_quality,version_control
0,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,...,,,Retrofit,,,,,,,
1,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,,,,,,,,,,
2,Austria,Europe,True,True,default,google.com,Middle iOS developer,Processica,"Vienna, Austria",JOBITT,...,,,,,,,,,,
3,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","Jooble, Trabajo.org - Stellenangebote, Arbeit",...,,,,,,,,,,
4,Austria,Europe,True,True,default,google.com,iOS Software Engineer,Cybermoth,"Vienna, Austria","Expertini, Talent.com",...,,,,,,,,,,


In [None]:
#df_final.to_csv("../data/csv/df_final.csv", index=False)