# Data preparation

In [48]:
from dotenv import load_dotenv
import pandas as pd
import os

import tiktoken
import openai

from langdetect import detect, DetectorFactory

from tqdm import tqdm 
import asyncio
import time
import json
import re

In [49]:
# Load variables from a local .env file into the environment before the
# OpenAI clients are constructed (no API key is passed explicitly below;
# presumably it is read from the environment -- confirm).
load_dotenv()
# One synchronous and one asynchronous client; the async one is what gets
# passed to chatgpt_async further down.
client = openai.OpenAI()
client_async = openai.AsyncOpenAI()

# Job-postings table as read from disk.
csv = pd.read_csv('./jobs_data_clean.csv')

# Working copy: later cells add columns to `df`, while `csv` is left as loaded.
df = csv.copy()
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer


## Determine the language of vacancies

In [21]:
def count_tokens(text, model="gpt-4"):
    """Return the number of tokens `text` encodes to for the given model.

    The tokenizer is looked up with `tiktoken.encoding_for_model(model)`;
    the result is the length of the encoded token sequence.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Measure the fixed token overhead of the language-detection prompt.
# `{short_text}` is left as a literal placeholder here (the string is not
# formatted), so the count covers the template itself.
text = "Detect the language of the text and return ONLY the ISO code (e.g., en, fr, de). Text: {short_text}"
# The printed label is Russian for "Number of tokens:".
print("Количество токенов:", count_tokens(text, "gpt-4-turbo"))

Количество токенов: 28


In [9]:
# Display the complete description text of the row labelled 0.
df.loc[0, "Job Description"]

"We are a dynamic FinTech company headquartered in Australia, now expanding our footprint to Malaysia.\r\nSpecializing in innovative payment and remittance solutions.\r\n\r\nWhy Join Us?\r\n\r\nYoung & dynamic workplace & culture (with office recreational amenities provided) Progressive career prospects Well rewarding project contributions Responsibilities:\r\nDevelop, test, and deploy high quality mobile apps for Android/iOS with a focus on performance, scalability, and reliability.\r\nCollaborate with product managers, designers, and developers to deliver tailored FinTech/Payments/Remittance solutions.\r\nImplement responsive design, accessibility, and security best practices.\r\nStay updated on mobile development trends and integrate new tools to improve efficiency.\r\nParticipate in code reviews, debugging, and troubleshooting to maintain code quality.\r\nOptimize mobile development processes for productivity and workflow efficiency.\r\nDesign scalable mobile applications using nat

### `langdetect` library

In [6]:
DetectorFactory.seed = 0  # Pin the seed so detection results do not change randomly between runs

def detect_language(text):
    """Return the language code `langdetect` reports for `text`.

    Args:
        text: The text to classify. Non-string values (e.g. NaN from a
            missing cell) and blank strings are treated as undetectable.

    Returns:
        Whatever `detect(text)` returns, or the string "unknown" when the
        input is not usable text or detection raises an error.
    """
    # Missing or blank cells cannot be classified; answer directly instead of
    # relying on the detector to raise.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback for detector failures. `Exception` (not a bare
        # `except`) so KeyboardInterrupt/SystemExit still stop a long `.apply`.
        return "unknown"

# Apply the detector to the Job Description column and store the result in a new column
df["Language langdetect"] = df["Job Description"].apply(detect_language)

In [7]:
# Preview the first rows, now including the "Language langdetect" column.
df.head()

Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language langdetect
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en


In [8]:
# List the column labels of the working frame.
df.columns

Index(['Location', 'Region', 'EU Member', 'Schengen Agreement',
       'Google Domain Type', 'Google Domain Used', 'Job Title', 'Company Name',
       'Job Location', 'Apply Options', 'Job Description', 'Work from home',
       'Salary', 'Schedule type', 'Qualifications', 'Job ID', 'Search Date',
       'Search Query', 'Language langdetect'],
      dtype='object')

In [9]:
# Number of rows per detected language code.
df['Language langdetect'].value_counts()

Language langdetect
en       2445
de         93
it         79
es         68
nl         45
fr         36
cs         23
pt         21
pl         19
hu          8
sv          5
ru          4
ro          4
bg          2
fi          2
sk          2
no          1
uk          1
da          1
lv          1
zh-cn       1
hr          1
sl          1
Name: count, dtype: int64

In [50]:
# Independent copy of the first 30 rows, used as a small sample for the GPT calls below.
test = df.head(30).copy()

In [55]:
async def chatgpt_async(
    intput_column_name=None,
    output_column_name=None,
    input_text_length=None,
    num_rows=None,
    batch_size=None,
    df=None,
    cache_file=None,
    user_prompt=None,
    gpt_model=None,
    client=None,
):
    """Send one chat-completion request per row and store the replies in a column.

    Rows are processed in batches; all requests of a batch run concurrently
    and the cache file (if any) is rewritten after every batch.

    Args:
        intput_column_name: Name of the column holding the input text
            (the parameter name keeps its original spelling for callers).
        output_column_name: Column that receives the model reply. Created
            with empty strings if it does not exist yet.
        input_text_length: If truthy, only the first N whitespace-separated
            words of the input are sent; otherwise the full text is sent.
        num_rows: If truthy, only the first `num_rows` rows are processed.
        batch_size: Rows per concurrent batch. `None` (or 0) means a single
            batch containing every row.
        df: Source dataframe. It is copied; the caller's frame is not modified.
        cache_file: Optional path of a JSON file mapping input text -> reply.
            Its parent directory is created if missing.
        user_prompt: Instruction placed before the text in the request.
        gpt_model: Model name passed to the API.
        client: Object exposing an awaitable `chat.completions.create(...)`.

    Returns:
        A copy of (the first `num_rows` rows of) `df` with the output column
        filled in. Rows whose request failed, or whose input is not a string,
        contain a message starting with "Error: " and are not cached.

    Raises:
        ValueError: If a required argument is missing, the input column does
            not exist, or `batch_size` is negative.

    Note:
        The cache is keyed by the (truncated) input text only, not by
        `user_prompt` or `gpt_model`. Use a separate `cache_file` per
        prompt/model combination, otherwise replies from another prompt
        are reused.
    """
    # Input validation
    if df is None:
        raise ValueError("df must be provided.")
    if intput_column_name is None:
        raise ValueError("intput_column_name must be provided.")
    if intput_column_name not in df.columns:
        raise ValueError(f"{intput_column_name} column does not exist in the dataframe.")
    if output_column_name is None:
        raise ValueError("output_column_name must be provided.")
    if gpt_model is None:
        raise ValueError("gpt_model must be provided.")
    if user_prompt is None:
        raise ValueError("user_prompt must be provided.")
    # A negative step would make range() yield nothing and silently skip every row.
    if batch_size is not None and batch_size < 0:
        raise ValueError("batch_size must be a positive integer or None.")

    # Work on a private copy so the caller's dataframe is never modified.
    df = df.head(num_rows).copy() if num_rows else df.copy()

    if output_column_name not in df.columns:
        df[output_column_name] = ""

    # Load the cache (if enabled). The directory is created up front so the
    # first write cannot fail after the API calls have already been made.
    cache = {}
    if cache_file:
        cache_dir = os.path.dirname(cache_file)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        if os.path.exists(cache_file):
            with open(cache_file, "r", encoding="utf-8") as f:
                cache = json.load(f)

    def save_cache():
        """Write the cache to a temp file, then swap it in, so an interrupted
        write cannot leave a truncated JSON file behind."""
        tmp_path = f"{cache_file}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, cache_file)

    async def process_row(index, row):
        """Fill the output cell of one row from the cache or from the API."""
        value = row[intput_column_name]

        # Missing cells (NaN) or other non-text values cannot be split or sent;
        # record that on the row instead of aborting the whole batch.
        if not isinstance(value, str):
            df.at[index, output_column_name] = "Error: input value is not a string"
            return

        text = " ".join(value.split()[:input_text_length]) if input_text_length else value

        # Cached reply: skip the API call.
        if cache_file and text in cache:
            df.at[index, output_column_name] = cache[text]
            return

        prompt = f"{user_prompt} Text: {text}"

        try:
            completion = await client.chat.completions.create(
                model=gpt_model,
                messages=[{"role": "user", "content": prompt}]
            )
            response = completion.choices[0].message.content

            if cache_file:
                cache[text] = response
            df.at[index, output_column_name] = response

        except Exception as e:
            # Best effort: keep the other rows going and surface the failure in the cell.
            df.at[index, output_column_name] = f"Error: {str(e)}"

    # Split into batches (a single batch when batch_size is None or 0).
    total_rows = len(df)
    if batch_size:
        batches = [df.iloc[start:start + batch_size] for start in range(0, total_rows, batch_size)]
    else:
        batches = [df]

    for batch in tqdm(batches, desc="Processing Batches"):
        # All rows of a batch are requested concurrently.
        await asyncio.gather(*(process_row(index, row) for index, row in batch.iterrows()))

        # Persist after every batch so finished work survives a later failure.
        if cache_file:
            save_cache()

    return df



In [58]:
df_result = await chatgpt_async(
    intput_column_name="Job Description", 
    output_column_name="Language gpt-3.5-turbo-0125",
    input_text_length=50,
    num_rows=5, 
    batch_size=None,
    df=test, 
    cache_file="./cache/language_cache.json",
    user_prompt="Detect the language of the text and return ONLY the ISO country code (e.g., en, fr, de, ect.).",
    gpt_model="gpt-3.5-turbo-0125",
    client=client_async
)
df_result

Processing Batches: 100%|██████████| 1/1 [00:00<00:00, 501.53it/s]


Unnamed: 0,Location,Region,EU Member,Schengen Agreement,Google Domain Type,Google Domain Used,Job Title,Company Name,Job Location,Apply Options,Job Description,Work from home,Salary,Schedule type,Qualifications,Job ID,Search Date,Search Query,Language gpt-3.5-turbo-0125
0,Austria,Europe,True,True,local,google.at,Sandbox Innovation Sdn. Bhd. | Senior Mobile A...,Tideri Jobbörse,Austria,"Trabajo.org - Stellenangebote, Arbeit",We are a dynamic FinTech company headquartered...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTYW5kYm94IElubm92YXRpb24gU2...,2025-01-13 12:33:21 UTC,iOS developer,en
1,Austria,Europe,True,True,default,google.com,Senior Ios Developer,Pyramid Global Technologies,Austria,"Trabajo.org - Stellenangebote, Arbeit, StudySm...",A minimum of 6+ years of concurrent commercial...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTZW5pb3IgSW9zIERldmVsb3Blci...,2025-01-13 12:20:43 UTC,iOS developer,en
2,Austria,Europe,True,True,default,google.com,iOS Developer - Permanent remote,Bluestorm Recruitment by Dazzle,Austria,Jooble,iOS Developer\r\n\r\nOur client is a leading m...,,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJpT1MgRGV2ZWxvcGVyIC0gUGVybW...,2025-01-13 12:20:43 UTC,iOS developer,en
3,Austria,Europe,True,True,default,google.com,Sr. Ios Developer,Bykon,Austria,"Trabajo.org - Stellenangebote, Arbeit",In ByKon we're looking for an exceptional Sr.\...,,,Full-time,,eyJqb2JfdGl0bGUiOiJTci4gSW9zIERldmVsb3BlciIsIm...,2025-01-13 12:20:43 UTC,iOS developer,en
4,Austria,Europe,True,True,default,google.com,Software Engineer/ iOS,Bitpanda,Anywhere,GrabJobs,Who we are\r\n\r\nWe simplify wealth creation....,True,,Full-time,No degree mentioned,eyJqb2JfdGl0bGUiOiJTb2Z0d2FyZSBFbmdpbmVlci8gaU...,2025-01-13 12:20:43 UTC,iOS developer,en


### gpt-3.5-turbo-0125