In [82]:
import pandas as pd
import numpy as np
# Load the langdetect results; the first CSV column becomes the index.
df = pd.read_csv('language_analysis/df_langdetect.csv', index_col=0)

In [83]:
# Make sure the column for Gemini's language code exists; a CSV that already
# carries it is left untouched.
if 'gemini_lang' not in df.columns:
    df['gemini_lang'] = np.nan

In [84]:
# Show the distinct values currently stored in the gemini_lang column.
df['gemini_lang'].unique()

array(['de', nan, 'fr', 'en', 'ja', 'es', 'it', 'ko', 'gr', 'nl', 'pt',
       'no', 'eu', 'sv', 'ru', 'ca', 'zh'], dtype=object)

In [85]:
# Collect the texts for which langdetect did not return 'fr' and which have
# no Gemini language code yet.
input_prompt_list = df.loc[
    df['merged_langdetect'].ne('fr') & df['gemini_lang'].isna(),
    'merged_text',
].tolist()


In [86]:
# Number of texts selected in input_prompt_list.
len(input_prompt_list)

16217

In [87]:
import google.generativeai as genai
import os
import ast
from dotenv import load_dotenv
import json

# Load environment variables from the .env file (if there is one).
load_dotenv()

# Read the API key from the environment.
api_key = os.environ.get("GEMINI_API_KEY")

if not api_key:
    # Raise instead of print + exit(): in a notebook, exit() asks the kernel to
    # shut down, whereas an exception fails this cell visibly and stops a
    # "Run All" at the point where the problem is.
    raise RuntimeError("Error: GEMINI_API_KEY not found in environment variables or .env file.")

genai.configure(api_key=api_key)

# Select the Gemini model used by gemini_call().
model = genai.GenerativeModel('gemini-2.0-flash')

In [88]:
def gemini_call(prompt):
    '''Ask Gemini for the language of every text in ``prompt``.

    ``prompt`` is interpolated into the request as-is (the caller passes a list
    of texts). The model is asked for a dictionary mapping each item to a
    two-letter language code.

    Returns the raw text of the model's reply on success. If anything raises
    while building the request, calling the model or reading the reply, a dict
    with the keys "error" (the exception message) and "prompt" (the input) is
    returned instead; no exception propagates to the caller.
    '''
    try:
        instruction = f'Without any formatting give me a dictionary with the items of the list as key and a two letter language code as value for the list: {prompt}'
        # The reply text is returned unparsed; parsing is left to the caller.
        return model.generate_content(instruction).text
    except Exception as err:
        return dict(error=str(err), prompt=prompt)

In [None]:
# Maximum number of characters of input text sent to Gemini in one request.
# Rationale as noted by the author: Gemini's output is limited to 8,192 tokens,
# and one token corresponds to roughly 4 characters.
gemini_limit = 25_000

# JSON file in which the Gemini results are accumulated.
file_path = 'language_analysis/gemini_result.json'

In [None]:

def _take_batch(pending, limit):
    """Remove and return the leading items of ``pending`` for one request.

    Items are taken in order while their combined character count stays within
    ``limit``. At least one item is always taken, so a single item that is
    longer than ``limit`` is sent on its own instead of stalling the loop.
    ``pending`` is shortened in place.
    """
    total = 0
    count = 0
    for item in pending:
        size = len(str(item))  # str() so a non-string entry (e.g. NaN) cannot raise
        if count and total + size > limit:
            break
        total += size
        count += 1
    batch = pending[:count]
    del pending[:count]  # one deletion per batch instead of pop(0) per item
    return batch


def _parse_gemini_reply(reply):
    """Return the dict contained in Gemini's text reply, or None.

    Accepts a Python dict literal or a JSON object, optionally wrapped in a
    Markdown code fence (``` / ```python / ```json). Anything that cannot be
    parsed into a dict yields None rather than an exception.
    """
    if not isinstance(reply, str):
        return None
    text = reply.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, if present, the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    # Python literal first (what the original code expected), JSON as fallback.
    for parser in (ast.literal_eval, json.loads):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


# Resume from the results of earlier runs, if there are any.
try:
    with open(file_path, "r") as file:
        data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    data = {}  # Start with an empty dict if the file doesn't exist or is invalid
prompts = []  # not used in this cell; kept in case a later cell refers to it

# Consume input_prompt_list batch by batch; the list is emptied in the process.
while input_prompt_list:
    prompt = _take_batch(input_prompt_list, gemini_limit)
    result = gemini_call(prompt)

    # gemini_call returns a dict only to report a failure; such a dict must
    # not be merged into the results.
    parsed = None if isinstance(result, dict) else _parse_gemini_reply(result)

    if parsed is not None:
        data.update(parsed)  # Merge the new result into the existing data
        # Rewrite the result file after every batch so progress survives an interruption.
        with open(file_path, "w") as file:
            json.dump(data, file, indent=4)
    else:
        # Failed call or unparseable reply: keep the batch and what came back
        # so it can be inspected or retried later.
        error_result = {
            "prompt": prompt,
            "result": result
        }
        # One JSON object per line, so that appended records stay separable.
        with open('language_analysis/gemini_reponse_error.json', "a") as file:
            file.write(json.dumps(error_result) + "\n")
    print(f'{len(input_prompt_list)} items to go')

16044 items to go
15930 items to go
15842 items to go
15745 items to go
15663 items to go
15547 items to go
15441 items to go
15334 items to go
15209 items to go
15095 items to go
15009 items to go
14882 items to go
14771 items to go
14681 items to go
14602 items to go
14501 items to go
14406 items to go
14348 items to go
14235 items to go
14118 items to go
14055 items to go
13975 items to go
13857 items to go
13725 items to go
13607 items to go
13474 items to go
13389 items to go
13289 items to go
13177 items to go
13066 items to go
12967 items to go
12842 items to go
12759 items to go
12617 items to go
12497 items to go
12393 items to go
12271 items to go


In [81]:
# Load the accumulated Gemini results (text -> language code).
with open(file_path, "r") as file:
    data = json.load(file)

# Look up every text in the results; texts without an entry map to NaN.
gemini_lang_from_json = df['merged_text'].map(data)
if 'gemini_lang' in df.columns:
    # Keep labels that are already in the frame when the JSON file has no
    # entry for a text, instead of overwriting them with NaN.
    gemini_lang_from_json = gemini_lang_from_json.fillna(df['gemini_lang'])
df['gemini_lang'] = gemini_lang_from_json

# Persist the updated frame back to the CSV it was loaded from.
df.to_csv('language_analysis/df_langdetect.csv')