# Data preparation

In [59]:
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
import sys

import tiktoken
import openai

from langdetect import detect, detect_langs, DetectorFactory

from tqdm.asyncio import tqdm
from tqdm.asyncio import tqdm as atqdm

import logging
import asyncio
import aiofiles
import time
import json
import re

In [60]:
# Load variables from a local .env file into the process environment
# (presumably where the OpenAI API key is kept — confirm).
load_dotenv()
# Synchronous and asynchronous OpenAI clients, both built with default settings.
client = openai.OpenAI()
client_async = openai.AsyncOpenAI()

# Raw job postings; the path is relative to the notebook's working directory.
csv = pd.read_csv('./jobs_data_clean.csv')

# Work on a copy so the frame read from disk stays available unchanged as `csv`.
df = csv.copy()
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer


## Determine the language of vacancies

In [61]:
def count_tokens(text, model="gpt-4"):
    """Return how many tokens `text` encodes to with the tiktoken encoding for `model`.

    Parameters:
    - text: String to tokenize.
    - model: Model name passed to `tiktoken.encoding_for_model` to pick the encoding.

    Returns:
    - int: Length of the encoded token sequence.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

text = "en"
print("Number of tokens:", count_tokens(text, "gpt-4-turbo"))

Number of tokens: 1


In [62]:
df.loc[0, "Job Description"]

"We are a dynamic FinTech company headquartered in Australia, now expanding our footprint to Malaysia.\r\nSpecializing in innovative payment and remittance solutions.\r\n\r\nWhy Join Us?\r\n\r\nYoung & dynamic workplace & culture (with office recreational amenities provided) Progressive career prospects Well rewarding project contributions Responsibilities:\r\nDevelop, test, and deploy high quality mobile apps for Android/iOS with a focus on performance, scalability, and reliability.\r\nCollaborate with product managers, designers, and developers to deliver tailored FinTech/Payments/Remittance solutions.\r\nImplement responsive design, accessibility, and security best practices.\r\nStay updated on mobile development trends and integrate new tools to improve efficiency.\r\nParticipate in code reviews, debugging, and troubleshooting to maintain code quality.\r\nOptimize mobile development processes for productivity and workflow efficiency.\r\nDesign scalable mobile applications using nat

### `langdetect` library

In [63]:
DetectorFactory.seed = 0  # Fix the seed so repeated runs give the same detection results

def detect_language(text):
    """Return the language code langdetect reports for `text`.

    Returns the string "unknown" when detection raises (for example on text
    langdetect cannot work with, or on a non-string cell).
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still stop a long-running `.apply`.
        return "unknown"

def detect_language_with_confidence(text):
    """Return the first langdetect candidate for `text` as text, e.g. 'en:0.99'.

    Returns:
    - str: String form of the first entry returned by `detect_langs`
      (presumably the most probable candidate — confirm against langdetect docs).
    - "unknown": when detection raises.
    - None: when `detect_langs` returns an empty list.
    """
    try:
        lang_probs = detect_langs(text)
        if lang_probs:
            return str(lang_probs[0])  # Format: 'en:0.99'
    except Exception:
        # Same best-effort fallback as detect_language; interrupts are not swallowed.
        return "unknown"
    return None

# Apply both detectors to the Job Description column
df["Language langdetect"] = df["Job Description"].apply(detect_language)
df["Language langdetect confidence"] = df["Job Description"].apply(detect_language_with_confidence)

In [64]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    """Turn 'Language langdetect confidence' from 'lang:prob' text into a float64 column.

    Example: "en:0.9999973661975282" becomes 0.9999973661975282.

    The column keeps its name and its position in the frame. Cells that do not
    end in a parsable number (such as "unknown" or missing values) become NaN
    instead of raising. The input frame is not modified, and calling the
    function again on its own output leaves the values unchanged.

    Parameters:
    - df: DataFrame containing a 'Language langdetect confidence' column.

    Returns:
    - DataFrame: A new frame with the confidence column converted to float64.
    """
    column = "Language langdetect confidence"
    # astype(str) lets missing and already-numeric cells go through the string
    # accessor; the part after the last ':' is the probability.
    raw_scores = df[column].astype(str).str.split(":").str[-1]
    # errors="coerce": anything that is not a number becomes NaN rather than an exception.
    scores = pd.to_numeric(raw_scores, errors="coerce").astype("float64")
    # assign() replaces the existing column in place, so column order is preserved
    # without relying on a hard-coded insert position.
    return df.assign(**{column: scores})

df = clean_data(df.copy())
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language langdetect,Language langdetect confidence
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en,0.999997
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999996
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999994
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999997
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en,0.999996


### Confidence lower than 0.99

In [65]:
"""
Cell generated by Data Wrangler.
"""
def low_confidence(df):
    """Return the rows whose langdetect confidence is below 0.99, sorted ascending.

    Rows whose confidence is missing (NaN) are not selected, because the
    `< 0.99` comparison is False for them.
    """
    confidence_col = 'Language langdetect confidence'
    # Order first, then filter, so the result is sorted from least to most confident.
    ordered = df.sort_values(by=confidence_col)
    is_uncertain = ordered[confidence_col].lt(0.99)
    return ordered.loc[is_uncertain]

not_enough_confidence = low_confidence(df.copy())
df = df.drop(not_enough_confidence.index)


In [66]:
"""
Cell generated by Data Wrangler.
"""
def clean_data1(df):
    """Drop the confidence column and expose the langdetect result as 'Language'.

    Returns a new DataFrame; the frame passed in is left as it was.
    """
    return (
        df
        .drop(columns=['Language langdetect confidence'])
        .rename(columns={'Language langdetect': 'Language'})
    )

df = clean_data1(df.copy())
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en


### Chat GPT API

In [155]:
# Configure logging
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")

# Suppress OpenAI API HTTP logs
logging.getLogger("httpx").setLevel(logging.WARNING)



async def check_rate_limits(response_headers, rpm):
    """Decide from the x-ratelimit-* response headers whether processing may continue.

    Parameters:
    - response_headers: Mapping with a `.get(name, default)` method holding the
      response headers.
    - rpm: Fallback for the request limit when its header is absent.

    Returns:
    - bool: False when the remaining request count or remaining token count is 0,
      True otherwise.

    A missing "remaining" header falls back to the corresponding limit, and a
    missing token limit falls back to 0 — so when both token headers are absent
    the remaining token count is 0 and the function returns False.
    """

    def header_as_int(name, fallback):
        # Header values and fallbacks are both passed through int().
        return int(response_headers.get(name, fallback))

    request_cap = header_as_int("x-ratelimit-limit-requests", rpm)
    token_cap = header_as_int("x-ratelimit-limit-tokens", 0)

    requests_left = header_as_int("x-ratelimit-remaining-requests", request_cap)
    tokens_left = header_as_int("x-ratelimit-remaining-tokens", token_cap)

    # Nothing left of either budget: report and tell the caller to stop.
    if requests_left == 0:
        print("🔴 Request limit (RPD) reached! Execution stopped.")
        return False

    if tokens_left == 0:
        print("🔴 Token limit (TPD) reached! Execution stopped.")
        return False

    # NOTE(review): zero was already handled above, so only a negative count can
    # reach this wait — confirm whether the stop/wait split is what was intended.
    if not (requests_left >= 1 and tokens_left >= 1):
        print("🟡 API rate limit reached! Waiting 60 seconds...")
        await asyncio.sleep(60)

    return True



async def chatgpt_async(
    input_column_name=None,
    output_column_name=None,
    input_text_length=None,
    output_text_length=5,
    num_rows=None,
    df=None,
    user_prompt=None,
    gpt_model=None,
    client=None,
    batch_size=None,
    cache_file=None,
    concurrency_limit=10,
    max_retries=5,
    retry_delay=0.125
):
    """Send one chat-completion request per row and store the replies in a column.

    For each row the prompt is `user_prompt`, a space, and the text of
    `input_column_name` (optionally cut to its first `input_text_length`
    whitespace-separated words). Rows are processed in batches; within a batch
    at most `concurrency_limit` requests are in flight at once.

    Parameters:
    - input_column_name: Column holding the text to send. Required.
    - output_column_name: Column that receives the reply; created when missing. Required.
    - input_text_length: Maximum number of words sent per row; None/0 sends everything.
    - output_text_length: Passed to the API as `max_tokens`.
    - num_rows: Only the first `num_rows` rows are processed; None/0 means all rows.
    - df: Source DataFrame. Required. It is copied, never modified.
    - user_prompt: Instruction placed before the row text. Required.
    - gpt_model: Model name passed to the API. Required.
    - client: Async client exposing `chat.completions.with_raw_response.create`. Required.
    - batch_size: Rows per batch; None/0 (or a value >= the row count) means one batch.
    - cache_file: JSON file mapping input text to reply. Loaded at the start and
      rewritten after every batch. Its directory is created when missing.
    - concurrency_limit: Maximum number of simultaneous requests.
    - max_retries: Attempts per row before the row is marked as failed.
    - retry_delay: Seconds to wait between attempts when the error text names no delay.

    Returns:
    - DataFrame: Copy of (the first `num_rows` rows of) `df` with `output_column_name`
      filled. A row whose attempts all failed holds "Error: <message>"; a row whose
      input is not a string is skipped with a warning and keeps its existing value.

    Raises:
    - ValueError: When a required argument is missing or the input column does not exist.

    Notes:
    - The cache is keyed on the (truncated) input text only, not on the prompt or
      the model, so a cache file should be reused only for the same task.
    - Every call keeps its own in-memory cache and rewrites the whole file from it,
      so calls that run concurrently with the same `cache_file` overwrite each
      other's entries.
    - Results are written with `df.at[index, ...]`; this assumes index labels are
      unique — verify against the caller.
    """

    # Input Validations
    if df is None:
        raise ValueError("df (dataframe) must be provided.")
    if input_column_name is None:
        raise ValueError("input_column_name must be provided.")
    if input_column_name not in df.columns:
        raise ValueError(f"{input_column_name} column does not exist in the dataframe.")
    if output_column_name is None:
        raise ValueError("output_column_name must be provided.")
    if gpt_model is None:
        raise ValueError("gpt_model must be provided.")
    if user_prompt is None:
        raise ValueError("user_prompt must be provided.")
    if client is None:
        raise ValueError("client must be provided.")

    # Work on a private copy so the caller's frame is never modified
    df = df.head(num_rows).copy() if num_rows else df.copy()

    if output_column_name not in df.columns:
        df[output_column_name] = ""

    # Load Cache if Enabled
    cache = {}
    if cache_file:
        # Create the target directory up front; otherwise the first save after a
        # batch fails and the replies already paid for are lost.
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        if os.path.exists(cache_file):
            async with aiofiles.open(cache_file, "r", encoding="utf-8") as f:
                try:
                    cache = json.loads(await f.read())
                except json.JSONDecodeError:
                    logging.warning("Cache file is corrupted or empty. Proceeding without cache.")

    semaphore = asyncio.Semaphore(concurrency_limit)  # Limit concurrent requests

    def delay_before_retry(error_message):
        """Seconds to wait before the next attempt, read from the error text when it names one."""
        wait_time_match = re.search(r'Please try again in (\d+(?:\.\d+)?)\s*(ms|s)', error_message)

        if wait_time_match:
            wait_time_value = float(wait_time_match.group(1))
            # Convert milliseconds to seconds if needed
            delay = wait_time_value / 1000 if wait_time_match.group(2) == "ms" else wait_time_value
        else:
            delay = retry_delay  # Default fallback

        # Small buffer (5 ms) on top of the requested wait
        return delay + 0.005

    async def process_row(index, row):
        """Fill the output cell of one row from the cache or from the API, with retries."""
        async with semaphore:
            column_text = row[input_column_name]

            # A missing value (NaN) or any other non-string has no .split(); skip the
            # row instead of letting one bad cell abort the whole batch.
            if not isinstance(column_text, str):
                logging.warning(
                    f"Row {index}: '{input_column_name}' is not text "
                    f"({type(column_text).__name__}); row skipped."
                )
                return

            truncated_text = " ".join(column_text.split()[:input_text_length]) if input_text_length else column_text

            # Check if result is cached
            if cache_file and truncated_text in cache:
                df.at[index, output_column_name] = cache[truncated_text]
                return  # Skip API call

            prompt = f"{user_prompt} {truncated_text}"

            for attempt in range(max_retries):
                try:
                    # Raw response is requested so the HTTP headers stay accessible
                    response = await client.chat.completions.with_raw_response.create(
                        model=gpt_model,
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=output_text_length,
                        temperature=0
                    )

                    response_text = response.parse().choices[0].message.content

                    if cache_file:
                        cache[truncated_text] = response_text
                    df.at[index, output_column_name] = response_text
                    return

                except Exception as e:
                    error_message = str(e)

                    if attempt < max_retries - 1:
                        await asyncio.sleep(delay_before_retry(error_message))
                    else:
                        # Out of attempts: record the failure in the output cell
                        # (failed rows are not written to the cache).
                        df.at[index, output_column_name] = f"Error: {error_message}"
                        logging.error(f"Final failure for row {index}: {error_message}")

    # Process in Batches
    total_rows = len(df)

    if batch_size and 0 < batch_size < total_rows:
        # Consecutive slices of exactly batch_size rows (the last one may be shorter)
        batches = [df.iloc[start:start + batch_size] for start in range(0, total_rows, batch_size)]
    else:
        batches = [df]

    with atqdm(total=len(batches), desc="Processing Batches", leave=True) as pbar:
        for batch in batches:
            tasks = [process_row(index, row) for index, row in batch.iterrows()]
            await asyncio.gather(*tasks)

            # Save cache after each batch so an interrupted run keeps its progress
            if cache_file:
                async with aiofiles.open(cache_file, "w", encoding="utf-8") as f:
                    await f.write(json.dumps(cache, ensure_ascii=False, indent=4))

            pbar.update(1)  # Update progress bar

    return df

Prompt structure:
"Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.). Text:{cell text}"

### Dealing with insufficient confidence

In [68]:
# Ask the model for the language of every posting langdetect was not confident about.
# output_text_length is forwarded as max_tokens, so each reply is capped at one token.
# NOTE(review): the prompt says "ISO country code" although the examples are language
# codes, and "ect." is a typo for "etc."; both are part of the text sent to the model,
# so correct them deliberately rather than as a cosmetic edit.
df_result = await chatgpt_async(
    input_column_name="Job Description", 
    output_column_name="Language gpt-4o-2024-11-20",
    input_text_length=None,
    output_text_length=1,
    num_rows=None,  
    df=not_enough_confidence.copy(), 
    user_prompt="Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.). Text:",
    gpt_model="gpt-4o-2024-11-20",
    client=client_async,
    batch_size=10,
    concurrency_limit=10,
    cache_file="./cache/language_cache_gpt-4o-2024-11-20.json"
)

Processing Batches: 100%|██████████| 2/2 [00:00<00:00, 80.21it/s]


In [69]:
# Hand-assigned language per row, in the row order of df_result; "drop" marks a
# posting that is to be discarded.
df_result['Manual check'] = [
    "drop", "cs", "drop", "ru", "drop", "fr", "en", "pl",
    "en", "drop", "fr", "en", "pl", "sv", "zh", "fr",
]
# The drop flag is True exactly where the manual label is "drop".
df_result['Drop'] = df_result['Manual check'].eq("drop")

# Keep only the rows that were not flagged.
df_result_final = df_result.loc[~df_result['Drop']].copy()

In [70]:
"""
Cell generated by Data Wrangler.
"""
def clean_data2(df_result_final):
    """Remove the helper columns and publish the manual label as 'Language'.

    Drops 'Drop', 'Language langdetect', 'Language langdetect confidence' and
    'Language gpt-4o-2024-11-20', then renames 'Manual check' to 'Language'.
    Returns a new DataFrame.
    """
    helper_columns = [
        'Drop',
        'Language langdetect',
        'Language langdetect confidence',
        'Language gpt-4o-2024-11-20',
    ]
    trimmed = df_result_final.drop(columns=helper_columns)
    return trimmed.rename(columns={'Manual check': 'Language'})

df_result_final_clean = clean_data2(df_result_final.copy())
df_result_final_clean.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
528,Czechia,Europe,True,True,default,google.com,Android Developer (part-time: 2h/week) @ Exper...,Experis Polska,Anywhere,"Jooble, Jobs Trabajo.org",O pozici / o projektu\r\nPůvodní popisek. Andr...,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciAocG...,2025-01-13 12:09:27 UTC,Android developer,cs
780,Germany,Europe,True,True,default,google.com,JUNIOR IOS DEVELOPER,Check24,Germany,Layboard,Требования 1-2 years of experience with iOS de...,,€4.5K a month,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJKVU5JT1IgSU9TIERFVkVMT1BFUi...,2025-01-13 12:22:03 UTC,iOS developer,ru
325,Canada,Northern America,False,False,default,google.com,"Développeur senior, Android / Senior Android D...",Cerence,"Montreal, Quebec, Canada","Indeed, Built In, Eluta.ca, Glassdoor, Adzuna,...",A Moving Experience.\r\n\r\n(English version b...,,,Full-time,,eyJqb2JfdGl0bGUiOiJEw6l2ZWxvcHBldXIgc2VuaW9yLC...,2025-01-13 12:09:10 UTC,Android developer,fr
2105,Sweden,Europe,True,True,default,google.com,Konsultuppdrag Ios and Android Developers - Of...,Senterprise,ستوكهولم، السويد,Emprego.pt,To one of our clients we are now looking for 1...,,,دوام كامل,,eyJqb2JfdGl0bGUiOiJLb25zdWx0dXBwZHJhZyBJb3MgYW...,2025-01-13 12:11:47 UTC,Android developer,en
1591,Poland,Europe,True,True,default,google.com,Android Developer,ALAN Systems,"Silesian Voivodeship, Poland",JobLeads,"Cześć odkrywco kodu!\r\n\r\nCzy jesteś gotowy,...",,PLN 180K–PLN 240K a year,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:11:12 UTC,Android developer,pl


### Merging

In [71]:
# Stack the confidently detected rows and the manually labelled rows, order them
# by location, and give the result a fresh 0..n-1 index.
df_language = (
    pd.concat([df, df_result_final_clean], ignore_index=True)
    .sort_values(by='Location')
    .reset_index(drop=True)
)
df_language.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Mobile Application Developer,Pearson Carter,Austria,"Trabajo.org - Stellenangebote, Arbeit",Lead Mobile Developer | Hyper Growth Startup |...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJNb2JpbGUgQXBwbGljYXRpb24gRG...,2025-01-13 12:08:51 UTC,Android developer,en
2,Austria,Europe,True,True,default,google.com,"Android Developer – Kotlin (Austria based, Hyb...",Bitcoin Devs Company,"Vienna, Austria",Jobs3,Overview:\r\nThe Android Developer – Kotlin po...,,,Contractor,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciDigJ...,2025-01-13 12:08:48 UTC,Android developer,en
3,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,Was sind deine Aufgaben?\r\n• Du gestaltest at...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de
4,Austria,Europe,True,True,default,google.com,iOS Developer Up3 (f/m/d),Drei Österreich,"Vienna, Austria","MyAbility.jobs, Drei., Jobted.at",Do you want to push the frontier of digital se...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIFVwMyAoZi...,2025-01-13 12:20:39 UTC,iOS developer,en


## Translate Job Descriptions into English 

Original:
This is a [SRC] to [TGT]
translation, please provide
the [TGT] translation for these
sentences:


My version:
This is a [SRC] to [TGT]
translation, please provide
the [TGT] translation for this
job description:

In [72]:
df_language = df_language.copy()

df_language['Language'].unique()

array(['en', 'de', 'it', 'fr', 'nl', 'es', 'hr', 'ru', 'bg', 'cs', 'pl',
       'uk', 'da', 'fi', 'hu', 'lv', 'zh', 'pt', 'ro', 'sk', 'sl', 'sv'],
      dtype=object)

In [73]:
# ISO language code -> English language name, for every code present in df_language['Language'].
language_mapping = {
    'en': 'English', 'de': 'German', 'it': 'Italian', 'fr': 'French',
    'nl': 'Dutch', 'es': 'Spanish', 'hr': 'Croatian', 'ru': 'Russian',
    'bg': 'Bulgarian', 'cs': 'Czech', 'pl': 'Polish', 'uk': 'Ukrainian',
    'da': 'Danish', 'fi': 'Finnish', 'hu': 'Hungarian', 'lv': 'Latvian',
    'zh': 'Chinese', 'pt': 'Portuguese', 'ro': 'Romanian', 'sk': 'Slovak',
    'sl': 'Slovenian', 'sv': 'Swedish',
}

# Codes that are not keys of the mapping end up as NaN in 'Language Name'.
df_language['Language Name'] = df_language['Language'].map(language_mapping)


In [148]:
# Small trial set for the translation step: the first 14 German postings plus one
# English posting, re-indexed 0..n-1.
test = df_language[df_language['Language Name'] == 'German'].head(14)
test_en = df_language[df_language['Language Name'] == 'English'].head(1)
test_all = pd.concat([test, test_en], ignore_index=True)
test_all.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Language Name
0,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,Was sind deine Aufgaben?\r\n• Du gestaltest at...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de,German
1,Austria,Europe,True,True,default,google.com,(Senior) Mobile Entwickler:in // Remote möglich,Breuninger,Anywhere,"Jobgether, LinkedIn, JobLeads, Talentify, Recr...","This a Full Remote job, the offer is available...",True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiIoU2VuaW9yKSBNb2JpbGUgRW50d2...,2025-01-13 12:20:45 UTC,iOS developer,de,German
2,Austria,Europe,True,True,default,google.com,Android Entwickler needed for takethechance An...,takethechance OG,"Linz, Austria",Founderio.com,Android APP weiter entwickeln. Neue Features u...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIEVudHdpY2tsZXIgbm...,2025-01-13 12:08:50 UTC,Android developer,de,German
3,Austria,Europe,True,True,default,google.com,Android Lead Developer (m/w/d),ITPRO - Consulting & Software GmbH,"Ottensheim, Austria (+2 others)",ITPRO - Consulting & Software GmbH,Als Lead Developer bei uns bist Du in den gesa...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIExlYWQgRGV2ZWxvcG...,2025-01-13 12:08:48 UTC,Android developer,de,German
4,Austria,Europe,True,True,default,google.com,Android Developer,Arkulpa,"Lustenau, Austria","DEVjobs.at, Jooble","Deine Rolle im Team\r\n• Kein Blaming, kein Sh...",,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:08:48 UTC,Android developer,de,German


In [158]:
async def translate_non_english_descriptions(df, language_col, job_desc_col, translated_col, client, gpt_model, batch_size=10, concurrency_limit=10, cache_file="./cache/job_description_english_cache.json"):
    """
    Translates non-English job descriptions into English using chatgpt_async.
    Copies English job descriptions directly without translation.

    Languages are translated one after another. chatgpt_async rewrites the whole
    cache file from its own in-memory copy after every batch, so running several
    calls at once against the same cache_file would make them overwrite each
    other's entries; it would also multiply the number of simultaneous requests
    by the number of languages. Requests within one language still run
    concurrently, up to concurrency_limit.

    Parameters:
    - df: DataFrame containing job descriptions and their languages. The
      translated column is written into this object (it is modified in place).
      Index labels are assumed to be unique — verify against the caller.
    - language_col: Column name for ISO language codes.
    - job_desc_col: Column name for job descriptions.
    - translated_col: Column name where translated descriptions will be stored.
      It is reset to "" before processing.
    - client: OpenAI async client instance.
    - gpt_model: GPT model name.
    - batch_size: Batch size forwarded to chatgpt_async.
    - concurrency_limit: Max concurrent API requests.
    - cache_file: Path to store cache.

    Returns:
    - df: The same DataFrame, with translated_col filled. Rows whose language code
      is not in the mapping, and rows of a language whose translation call
      raised, keep "" (both cases are logged).
    """

    # Language mapping dictionary
    language_mapping = {
        'en': 'English', 'de': 'German', 'it': 'Italian', 'fr': 'French', 'nl': 'Dutch', 'es': 'Spanish',
        'hr': 'Croatian', 'ru': 'Russian', 'bg': 'Bulgarian', 'cs': 'Czech', 'pl': 'Polish', 'uk': 'Ukrainian',
        'da': 'Danish', 'fi': 'Finnish', 'hu': 'Hungarian', 'lv': 'Latvian', 'zh': 'Chinese', 'pt': 'Portuguese',
        'ro': 'Romanian', 'sk': 'Slovak', 'sl': 'Slovenian', 'sv': 'Swedish'
    }

    # Ensure translated column exists (and start from a clean slate)
    df[translated_col] = ""

    # Make rows that will be left untranslated visible instead of silently blank
    unmapped_codes = set(df[language_col].dropna().unique()) - set(language_mapping)
    if unmapped_codes:
        logging.warning(
            f"No language name for code(s) {sorted(map(str, unmapped_codes))}; "
            f"these rows are left untranslated."
        )

    # Process each language separately, one at a time
    for lang_code, lang_name in language_mapping.items():
        lang_rows = df[language_col] == lang_code

        if not lang_rows.any():
            continue  # Skip if no rows for this language

        if lang_name == "English":
            # Copy English job descriptions directly (no translation needed)
            df.loc[lang_rows, translated_col] = df.loc[lang_rows, job_desc_col]
            continue

        # Construct the translation prompt dynamically
        user_prompt = f"This is a {lang_name} to English translation, please provide the English translation for this job description:"

        try:
            translated_df = await chatgpt_async(
                input_column_name=job_desc_col,
                output_column_name=translated_col,
                input_text_length=None,
                output_text_length=None,
                num_rows=None,
                df=df.loc[lang_rows].copy(),
                user_prompt=user_prompt,
                gpt_model=gpt_model,
                client=client,
                batch_size=batch_size,
                concurrency_limit=concurrency_limit,
                cache_file=cache_file
            )
        except Exception as exc:
            # One failing language must not discard the others; report it and move on
            logging.error(f"Translation of {lang_name} rows failed: {exc}")
            continue

        # Write back only the translated column, and only non-missing replies, so
        # every other column of df keeps its values and dtype.
        translations = translated_df[translated_col]
        translations = translations[translations.notna()]
        df.loc[translations.index, translated_col] = translations

    return df


In [157]:
# Example usage
df_translated = await translate_non_english_descriptions(
    df=test_all,#df_language.copy(),
    language_col="Language",
    job_desc_col="Job Description",
    translated_col="Job Description English",
    client=client_async,
    gpt_model="gpt-4o-2024-11-20",
    batch_size=5,
    concurrency_limit=5,
    cache_file="./cache/job_description_english_cache_V2_gpt-4o-2024-11-20.json"
)

Processing Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Processing Batches: 100%|██████████| 3/3 [00:20<00:00,  6.96s/it]


In [152]:
df_translated

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,...,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language,Language Name,Job Description English
0,Austria,Europe,True,True,default,google.com,Android & iOS Developer,ventopay gmbh,Austria,StudySmarter - Talents,...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIFx1MDAyNiBpT1MgRG...,2025-01-13 12:20:43 UTC,iOS developer,de,German,**What are your tasks?** \n- You design attra...
1,Austria,Europe,True,True,default,google.com,(Senior) Mobile Entwickler:in // Remote möglich,Breuninger,Anywhere,"Jobgether, LinkedIn, JobLeads, Talentify, Recr...",...,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiIoU2VuaW9yKSBNb2JpbGUgRW50d2...,2025-01-13 12:20:45 UTC,iOS developer,de,German,**English Translation:**\n\n**This is a Full R...
2,Austria,Europe,True,True,default,google.com,Android Entwickler needed for takethechance An...,takethechance OG,"Linz, Austria",Founderio.com,...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIEVudHdpY2tsZXIgbm...,2025-01-13 12:08:50 UTC,Android developer,de,German,"""Further develop the Android app. Implement ne..."
3,Austria,Europe,True,True,default,google.com,Android Lead Developer (m/w/d),ITPRO - Consulting & Software GmbH,"Ottensheim, Austria (+2 others)",ITPRO - Consulting & Software GmbH,...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIExlYWQgRGV2ZWxvcG...,2025-01-13 12:08:48 UTC,Android developer,de,German,**English Translation:**\n\nAs a Lead Develope...
4,Austria,Europe,True,True,default,google.com,Android Developer,Arkulpa,"Lustenau, Austria","DEVjobs.at, Jooble",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciIsIm...,2025-01-13 12:08:48 UTC,Android developer,de,German,"Your Role in the Team \n• No blaming, no sham..."
5,Austria,Europe,True,True,default,google.com,Senior Frontend Mobile Developer (m/w/d),eurofunk Kappacher GmbH,"Vienna, Austria","Jobijoba.at, Jobtome",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgRnJvbnRlbmQgTW9iaW...,2025-01-13 12:08:51 UTC,Android developer,de,German,**SENIOR FRONTEND MOBILE DEVELOPER (m/f/d)** ...
6,Austria,Europe,True,True,default,google.com,Android Developer (d/m/w),Raiffeisen Software GmbH,"Linz, Austria","Karriere.at, DEVjobs.at, Jooble, BeBee, Linked...",...,,"€3,950 a month",Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIERldmVsb3BlciAoZC...,2025-01-13 12:08:48 UTC,Android developer,de,German,**Android Developer (d/m/w)** \n\n**Programmi...
7,Austria,Europe,True,True,default,google.com,Mobile App Developer (m/w/d) - Software Entwic...,Ratbacher GmbH,"Vienna, Austria","Ratbacher, Ratbacher, Ratbacher GmbH, Karriere...",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJNb2JpbGUgQXBwIERldmVsb3Blci...,2025-01-13 12:20:42 UTC,iOS developer,de,German,- You are involved in detailed specifications ...
8,Austria,Europe,True,True,default,google.com,iOS Developer,Raiffeisen Gruppe,"Linz, Austria","DEVjobs.at, BeBee",...,,"€3,828 a month",Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIiwiY29tcG...,2025-01-13 12:20:42 UTC,iOS developer,de,German,**Your Role in the Team** \n- You contribute ...
9,Austria,Europe,True,True,default,google.com,ios entwickler 80–100% w/m/d,CHANCENLAND VORARLBERG,"Dornbirn, Austria","IT-Career.at, STEMJOBS.AT, IT-JOBS.AT",...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpb3MgZW50d2lja2xlciA4MOKAkz...,2025-01-13 12:20:42 UTC,iOS developer,de,German,**iOS Developer 80–100% w/m/d**\n\n**Job Descr...


In [129]:
# Reload the previously exported translations.
csv = pd.read_csv("./df_translated.csv")

# Keep non-English postings whose translation is not an API error message.
is_foreign = csv['Language Name'] != 'English'
has_api_error = csv["Job Description English"].str.contains("Error: Error code:", na=False)
non_eng = csv.loc[is_foreign & ~has_api_error, ['Job Description', 'Job Description English']]
len(non_eng)

349

In [None]:
non_eng

In [130]:
# Convert to dictionary: original description -> English translation.
# If the same description occurs more than once, the last translation wins.
data_dict = dict(zip(non_eng["Job Description"], non_eng["Job Description English"]))

# Save to JSON file, in the same text -> reply layout chatgpt_async uses for its cache.
# NOTE(review): this writes to the working directory, while the translation call
# above reads "./cache/job_description_english_cache_V2_gpt-4o-2024-11-20.json" —
# confirm which location is intended.
with open("job_description_english_cache_V2_gpt-4o-2024-11-20.json", "w", encoding="utf-8") as f:
    json.dump(data_dict, f, indent=4, ensure_ascii=False)