# Data preparation

In [9]:
from dotenv import load_dotenv
import pandas as pd
import os

import tiktoken
import openai

from langdetect import detect, DetectorFactory

import asyncio
import time
import json
import re

In [27]:
# Load variables from a local .env file into the environment. Both OpenAI
# clients are constructed without arguments, so they presumably read the API
# key from the environment — TODO confirm against the project setup.
load_dotenv()
client = openai.OpenAI()  # synchronous client; used by chatgpt() and the one-off request cell
client_async = openai.AsyncOpenAI()  # asynchronous client; passed to chatgpt_async()

# Job postings as read from disk.
# NOTE(review): the name `csv` shadows the standard-library module of the same name.
csv = pd.read_csv('./jobs_data_clean.csv')

# Work on a copy so the frame read from disk stays unmodified.
df = csv.copy()
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer


## Determine the language of vacancies

In [21]:
def count_tokens(text, model="gpt-4"):
    """Return how many tokens `text` occupies under the tokenizer for `model`.

    Args:
        text: String to tokenize.
        model: OpenAI model name used to select the tiktoken encoding.

    Returns:
        int: Number of tokens produced by encoding `text`.

    Model names that tiktoken cannot map to an encoding fall back to the
    ``cl100k_base`` encoding instead of raising ``KeyError``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for names it does not know
        # (e.g. a newly released or fine-tuned model); cl100k_base is the
        # encoding used by the gpt-4 / gpt-3.5-turbo family.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

# Prompt template whose token cost is measured. "{short_text}" is a literal
# placeholder here (this is a plain string, not an f-string), so the
# placeholder itself is included in the count.
text = "Detect the language of the text and return ONLY the ISO code (e.g., en, fr, de). Text: {short_text}"
print("Количество токенов:", count_tokens(text, "gpt-4-turbo"))

Количество токенов: 28


In [9]:
df.loc[0, "Job Description"]

"We are a dynamic FinTech company headquartered in Australia, now expanding our footprint to Malaysia.\r\nSpecializing in innovative payment and remittance solutions.\r\n\r\nWhy Join Us?\r\n\r\nYoung & dynamic workplace & culture (with office recreational amenities provided) Progressive career prospects Well rewarding project contributions Responsibilities:\r\nDevelop, test, and deploy high quality mobile apps for Android/iOS with a focus on performance, scalability, and reliability.\r\nCollaborate with product managers, designers, and developers to deliver tailored FinTech/Payments/Remittance solutions.\r\nImplement responsive design, accessibility, and security best practices.\r\nStay updated on mobile development trends and integrate new tools to improve efficiency.\r\nParticipate in code reviews, debugging, and troubleshooting to maintain code quality.\r\nOptimize mobile development processes for productivity and workflow efficiency.\r\nDesign scalable mobile applications using nat

### `langdetect` library

In [6]:
DetectorFactory.seed = 0  # Fix the seed so detection results do not change randomly between runs

def detect_language(text):
    """Detect the language of `text` with langdetect.

    Args:
        text: Value to classify. Anything that is not a non-blank string
            (None, NaN, "", whitespace only) is reported as "unknown"
            without calling the detector.

    Returns:
        str: The value returned by ``detect`` for the text, or ``"unknown"``
        when the input is unusable or detection fails.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: a failed detection must not abort the
        # whole column. Narrowed from a bare `except:` so KeyboardInterrupt
        # and SystemExit still propagate.
        return "unknown"

# Apply the detector to every value of the "Job Description" column
df["Language langdetect"] = df["Job Description"].apply(detect_language)

In [7]:
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language langdetect
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en


In [8]:
df.columns

Index(['Location', 'Region', 'EU Member', 'Schengen Agreement',
       'Google Domain Type', 'Google Domain Used', 'Job Title', 'Company Name',
       'Job Location', 'Apply Options', 'Job Description', 'Work from home',
       'Salary', 'Schedule type', 'Qualifications', 'Job ID', 'Search Date',
       'Search Query', 'Language langdetect'],
      dtype='object')

In [9]:
df['Language langdetect'].value_counts()

Language langdetect
en       2445
de         93
it         79
es         68
nl         45
fr         36
cs         23
pt         21
pl         19
hu          8
sv          5
ru          4
ro          4
bg          2
fi          2
sk          2
no          1
uk          1
da          1
lv          1
zh-cn       1
hr          1
sl          1
Name: count, dtype: int64

In [17]:
# First 30 rows as an independent copy; passed as `df=test` to the API helper calls below.
test = df.head(30).copy()

In [19]:

def chatgpt(intput_column_name=None, output_column_name=None, input_text_length=None, num_rows=None, df=None, user_prompt=None, gpt_model=None):
    """Send one chat-completion request per row and store each reply in a column.

    Requests are issued sequentially through the module-level synchronous
    `client`.

    Args:
        intput_column_name: Name of the column holding the text to send.
        output_column_name: Name of the column that receives the replies;
            created (filled with "") when it does not exist yet.
        input_text_length: If truthy, only the first N whitespace-separated
            words of each text are sent; otherwise the full text is sent.
        num_rows: If truthy, only the first `num_rows` rows are processed;
            None or 0 processes the whole frame.
        df: Source DataFrame. It is never modified; a copy is returned.
        user_prompt: Instruction placed before the text in every request.
        gpt_model: Model name passed to the API.

    Returns:
        pandas.DataFrame: Copy of the (possibly truncated) input with
        `output_column_name` filled in. Rows whose input value is not a
        non-blank string (NaN, None, "") are skipped: no request is sent and
        their output cell is left as it was.

    Raises:
        ValueError: If a required argument is missing or the input column
            does not exist.
        Exception: Whatever the API client raises; the error is not caught
            here, so replies gathered before the failure are not returned.
    """
    if df is None:
        raise ValueError("df must be provided.")
    if intput_column_name is None:
        raise ValueError("intput_column_name must be provided.")
    if intput_column_name not in df.columns:
        raise ValueError(f"{intput_column_name} column does not exist in the dataframe.")
    if output_column_name is None:
        raise ValueError("output_column_name must be provided.")
    if gpt_model is None:
        raise ValueError("gpt_model must be provided.")
    if user_prompt is None:
        raise ValueError("user_prompt must be provided.")

    df = df.head(num_rows).copy() if num_rows else df.copy()
    if output_column_name not in df.columns:
        df[output_column_name] = ""

    for index, value in df[intput_column_name].items():
        # Missing descriptions come back from pandas as NaN (a float), which
        # has no .split(); skip them instead of crashing or paying for a request.
        if not isinstance(value, str) or not value.strip():
            continue

        columns_text = " ".join(value.split()[:input_text_length]) if input_text_length else value
        prompt = f"{user_prompt} Text: {columns_text}"

        completion = client.chat.completions.create(
            model=gpt_model,
            messages=[{"role": "user", "content": prompt}]
        )
        # Write the reply into the row it was requested for.
        df.at[index, output_column_name] = completion.choices[0].message.content

    return df


    

In [None]:
# Run the synchronous helper on the 30-row sample, sending the first 50 words
# of each description. The prompt asks for an ISO 639-1 *language* code: the
# previous wording requested a "country" code, which contradicts the examples
# (en, fr, de are language codes) and the purpose of the column.
chatgpt(intput_column_name="Job Description",
        output_column_name="Language gpt-3.5-turbo-0125",
        input_text_length=50,
        num_rows=None,
        df=test,
        user_prompt="Detect the language of the text and return ONLY the ISO 639-1 language code (e.g., en, fr, de).",
        gpt_model="gpt-3.5-turbo-0125"
)

In [28]:
async def chatgpt_async(
    intput_column_name=None,
    output_column_name=None,
    input_text_length=None,
    num_rows=None,
    df=None,
    user_prompt=None,
    gpt_model=None,
    client=None,
    max_concurrency=None
):
    """Send one chat-completion request per row concurrently and store the replies.

    Args:
        intput_column_name: Name of the column holding the text to send.
        output_column_name: Name of the column that receives the replies;
            created (filled with "") when it does not exist yet.
        input_text_length: If truthy, only the first N whitespace-separated
            words of each text are sent; otherwise the full text is sent.
        num_rows: If truthy, only the first `num_rows` rows are processed;
            None or 0 processes the whole frame.
        df: Source DataFrame. It is never modified; a copy is returned.
        user_prompt: Instruction placed before the text in every request.
        gpt_model: Model name passed to the API.
        client: Asynchronous client exposing an awaitable
            ``chat.completions.create(model=..., messages=...)``.
        max_concurrency: Optional upper bound on the number of requests in
            flight at once. None (the default) keeps the previous behaviour
            of starting every request at the same time.

    Returns:
        pandas.DataFrame: Copy of the (possibly truncated) input with
        `output_column_name` filled in. A failed request stores
        ``"Error: <message>"`` in its row. Rows whose input value is not a
        non-blank string (NaN, None, "") are skipped without a request.

    Raises:
        ValueError: If a required argument is missing, the input column does
            not exist, or `max_concurrency` is smaller than 1.
    """
    if df is None:
        raise ValueError("df must be provided.")
    if intput_column_name is None:
        raise ValueError("intput_column_name must be provided.")
    if intput_column_name not in df.columns:
        raise ValueError(f"{intput_column_name} column does not exist in the dataframe.")
    if output_column_name is None:
        raise ValueError("output_column_name must be provided.")
    if gpt_model is None:
        raise ValueError("gpt_model must be provided.")
    if user_prompt is None:
        raise ValueError("user_prompt must be provided.")
    if client is None:
        raise ValueError("client must be provided.")
    if max_concurrency is not None and max_concurrency < 1:
        raise ValueError("max_concurrency must be a positive integer.")

    df = df.head(num_rows).copy() if num_rows else df.copy()

    if output_column_name not in df.columns:
        df[output_column_name] = ""

    # Created inside the coroutine so the semaphore belongs to the running loop.
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency is not None else None

    async def request_completion(prompt):
        """Issue one request, honouring the concurrency limit when one is set."""
        if limiter is None:
            return await client.chat.completions.create(
                model=gpt_model,
                messages=[{"role": "user", "content": prompt}]
            )
        async with limiter:
            return await client.chat.completions.create(
                model=gpt_model,
                messages=[{"role": "user", "content": prompt}]
            )

    async def process_row(index, value):
        """Request a completion for one row and write the reply (or error) back."""
        # Missing descriptions are NaN (a float) and have no .split(). Handling
        # them here keeps one bad cell from raising out of gather() and
        # discarding every other row's result.
        if not isinstance(value, str) or not value.strip():
            return

        columns_text = " ".join(value.split()[:input_text_length]) if input_text_length else value
        prompt = f"{user_prompt} Text: {columns_text}"

        try:
            completion = await request_completion(prompt)
            df.at[index, output_column_name] = completion.choices[0].message.content
        except Exception as e:
            # Record the failure in the row so the remaining rows still complete.
            df.at[index, output_column_name] = f"Error: {str(e)}"

    # One coroutine per row, run concurrently.
    tasks = [process_row(index, value) for index, value in df[intput_column_name].items()]
    await asyncio.gather(*tasks)

    return df

In [30]:
df_result = await chatgpt_async(
    intput_column_name="Job Description", 
    output_column_name="Language gpt-3.5-turbo-0125",
    input_text_length=50,
    num_rows=30, 
    df=test, 
    user_prompt="Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.).",
    gpt_model="gpt-3.5-turbo-0125",
    client=client_async
)
df_result

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language gpt-3.5-turbo-0125
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en
5,Austria,Europe,True,True,default,google.com,Ios Developer,Desygner,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",Are you a talented iOS developer ready to make...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJJb3MgRGV2ZWxvcGVyIiwiY29tcG...,2025-01-13 12:20:43 UTC,iOS developer,en
6,Austria,Europe,True,True,default,google.com,Senior iOs Developer,Thehero,"Vienna, Austria","Jooble, StudySmarter - Talents, ProPursuit, Jo...",We are seeking a highly skilled Senior iOS Dev...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgaU9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
7,Austria,Europe,True,True,default,google.com,Android iOS Developer,ventopay gmbh,"Hagenberg, Austria","DEVjobs.at, Jooble",Deine Rolle im Team\r\n• Design & Umsetzung vo...,,€44.8K a year,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJBbmRyb2lkIGlPUyBEZXZlbG9wZX...,2025-01-13 12:20:43 UTC,iOS developer,de
8,Austria,Europe,True,True,default,google.com,Ios Software Engineer,Render Networks,Austria,"Trabajo.org - Stellenangebote, Arbeit",Render's singular focus is best summarised by ...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJJb3MgU29mdHdhcmUgRW5naW5lZX...,2025-01-13 12:20:42 UTC,iOS developer,en
9,Austria,Europe,True,True,default,google.com,iOS Engineer (m/f/d),Roche,"Vienna, Austria (+1 other)","Roche Careers, Indeed, Jobs - ACP, XING, Joobl...",Diabetes is a pesky monster — and that’s putti...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRW5naW5lZXIgKG0vZi9kKS...,2025-01-13 12:20:42 UTC,iOS developer,en


In [32]:




# One-off synchronous request for the first posting.
text = df.loc[0, "Job Description"]
short_text = " ".join(text.split()[:50])  # keep only the first 50 whitespace-separated words

# NOTE(review): the prompt asks for an ISO "country" code although the task is
# language detection — confirm the wording is intended.
prompt = f"Detect the language of the text and return ONLY the ISO country code. Text: {short_text}"

completion = client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    messages=[
        #{"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": prompt
        }
    ]
)



# gpt-4o-mini
# gpt-3.5-turbo-0125

In [34]:
completion

ChatCompletion(id='chatcmpl-AsWx8YON9AuuTbqs2NCDVmWpX8OZ9', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='EN', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1737559746, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=2, prompt_tokens=85, total_tokens=87, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [36]:
completion.choices[0].message.content

'EN'

In [33]:
print(completion.choices[0].message)

ChatCompletionMessage(content='EN', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [31]:
# Token count of the fixed prompt prefix alone, i.e. without any posting text appended.
new_text = "Detect the language of the text and return ONLY the ISO country code. Text: "
print(count_tokens(new_text, "gpt-4-turbo"))

17


### gpt-3.5-turbo-0125

In [10]:
CACHE_FILE = "./cache/language_cache.json"

# Create the cache directory up front: the file is later opened with mode "w",
# which fails with FileNotFoundError when the parent directory is missing.
os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True)

# Load the cache if a non-empty cache file exists; otherwise start empty.
cache = {}
if os.path.exists(CACHE_FILE) and os.path.getsize(CACHE_FILE) > 0:
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            loaded_cache = json.load(f)
    except json.JSONDecodeError:
        loaded_cache = None

    if isinstance(loaded_cache, dict):
        cache = loaded_cache
    else:
        # Invalid JSON, or valid JSON that is not an object (the cache is used
        # as a text -> language mapping), is treated as a corrupted file.
        print("Ошибка: Файл кеша повреждён. Создаём новый.")

# Concurrency limit: at most 3,500 coroutines may hold the semaphore at once.
# NOTE(review): the original comment relates this to "3,500 requests per
# minute", but a semaphore bounds concurrent holders, not requests per unit of
# time — confirm this matches the intended rate limit.
semaphore = asyncio.Semaphore(3500)

# **Text-cleaning function**
def clean_text(text, max_words=50):
    """Normalize `text` and keep only its first `max_words` words.

    Args:
        text: Value to clean. Anything that is not a string yields "".
        max_words: Maximum number of whitespace-separated words to keep
            (non-negative). Defaults to 50; None keeps every word.

    Returns:
        str: The cleaned text with single spaces between words and no
        leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r"\s+", " ", text)  # collapse line breaks, tabs and repeated spaces
    text = re.sub(r"[^\w\s.,!?-]", "", text)  # drop emoji and special characters; \w keeps non-ASCII letters
    # split() + join() leaves no surrounding or doubled whitespace, so no
    # extra strip() is needed after truncating.
    return " ".join(text.split()[:max_words])

# **Asynchronous language detection with caching (new API)**
async def detect_language_gpt(text):
    """Detect the language of `text` via the chat API, with an on-disk cache.

    The text is reduced with `clean_text` first; the cleaned text is the
    cache key. A new result is added to the module-level `cache` and the whole
    cache is rewritten to `CACHE_FILE`.

    Args:
        text: Job description (or any text) to classify.

    Returns:
        str: The model's reply, stripped and lower-cased; "unknown" when the
        input is not a string or is empty (before or after cleaning); "error"
        when the request fails or is still rate-limited after 5 attempts.
    """
    if not isinstance(text, str) or text.strip() == "":
        return "unknown"

    short_text = clean_text(text)
    if not short_text:
        # Nothing usable is left after cleaning (e.g. only emoji or symbols);
        # do not spend a request on an empty prompt.
        return "unknown"

    # Check the cache first
    if short_text in cache:
        return cache[short_text]

    prompt = f"Detect the language of the text and return ONLY the ISO code (e.g., en, fr, de). Text: {short_text}"

    async with semaphore:
        for attempt in range(5):  # retry up to 5 times on rate-limit errors
            try:
                # Await the asynchronous client. The synchronous
                # openai.chat.completions.create() call would block the event
                # loop, so requests would run strictly one after another.
                response = await client_async.chat.completions.create(
                    model="gpt-3.5-turbo-0125",
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=5,
                    temperature=0
                )

                detected_language = response.choices[0].message.content.strip().lower()

                # Store in the cache and persist it; make sure the target
                # directory exists so the write cannot fail on a fresh checkout.
                cache[short_text] = detected_language
                os.makedirs(os.path.dirname(CACHE_FILE) or ".", exist_ok=True)
                with open(CACHE_FILE, "w", encoding="utf-8") as f:
                    json.dump(cache, f, ensure_ascii=False, indent=4)

                return detected_language

            except openai.RateLimitError:
                print(f"Лимит запросов превышен. Ждём 60 секунд перед повтором... (Попытка {attempt+1}/5)")
                await asyncio.sleep(60)

            except Exception as e:
                print(f"Ошибка: {e}")
                return "error"

    return "error"

# **Process the job postings asynchronously**
async def process_jobs(df):
    """Fill the "Language gpt-3.5-turbo-0125" column for rows that lack a value.

    Rows whose language cell is missing are sent to `detect_language_gpt` in
    batches of 3,500, with a 60-second pause between batches, up to 10,000
    rows per call. `df` is modified in place and then written to
    "./jobs_data_with_gpt_language.csv".

    Args:
        df: DataFrame with a "Job Description" column. The language column is
            created when absent; existing non-missing values are kept.

    Returns:
        None
    """
    column = "Language gpt-3.5-turbo-0125"
    daily_limit = 10_000  # maximum number of requests per call
    batch_size = 3_500  # requests started together before pausing

    # Create the language column if it does not exist yet
    if column not in df.columns:
        df[column] = None
    elif df[column].dtype != object:
        # e.g. an all-NaN float column; make it able to hold strings
        df[column] = df[column].astype(object)

    # Positions (not labels) of rows still to be processed, so the write-back
    # below hits exactly the rows that were requested.
    pending = [pos for pos, is_missing in enumerate(df[column].isna()) if is_missing]
    if len(pending) > daily_limit:
        print("Достигнут дневной лимит API (10,000 запросов). Останавливаем обработку.")
        pending = pending[:daily_limit]

    language_pos = df.columns.get_loc(column)
    description_pos = df.columns.get_loc("Job Description")

    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]

        # The requests of a batch actually run here; pausing only makes sense
        # after they have been issued, not while the coroutines are merely
        # being collected.
        results = await asyncio.gather(
            *(detect_language_gpt(df.iat[pos, description_pos]) for pos in batch)
        )

        # Write the results back into the rows they were requested for
        for pos, language in zip(batch, results):
            df.iat[pos, language_pos] = language

        processed = start + len(batch)
        if processed < len(pending):
            print(f"Обработано {processed} запросов. Ждём 60 секунд перед продолжением.")
            await asyncio.sleep(60)

    # Save the updated DataFrame
    df.to_csv("./jobs_data_with_gpt_language.csv", index=False)

    print("Определение языка завершено! Данные сохранены в 'jobs_data_with_gpt_language.csv'.")

In [None]:
# Run the processing
#await process_jobs(df)