In [101]:
import pandas as pd
import numpy as np
df = pd.read_csv('language_analysis/df_langdetect.csv', index_col = 0)

In [102]:
if 'gemini_lang' in df.columns:
    df = df
else:
    df['gemini_lang'] = np.nan

In [103]:
df['gemini_lang'].unique()

array(['de', nan, 'fr', 'en', 'ja', 'es', 'it', 'ko', 'gr', 'nl', 'pt',
       'no', 'eu', 'sv', 'ru', 'ca', 'zh', 'la', 'vo', 'pl', 'is', 'tl',
       'tr', 'da'], dtype=object)

In [104]:
# get the texts or which langdetect did not get fr

input_prompt_list = df[(df['merged_langdetect'] != 'fr') & pd.isna(df['gemini_lang'])]['merged_text'].tolist()


In [105]:
len(input_prompt_list)

8583

In [106]:
import google.generativeai as genai
import os
import ast
from dotenv import load_dotenv
import json

# Load environment variables from .env file
load_dotenv()

# Get the API key from the environment variables
api_key = os.environ.get("GEMINI_API_KEY")

if not api_key:
    print("Error: GEMINI_API_KEY not found in environment variables or .env file.")
    exit()

genai.configure(api_key=api_key)

# Select the Gemini model
model = genai.GenerativeModel('gemini-2.0-flash')

In [107]:
def gemini_call(prompt):
    '''Gemini call to determine in language of text'''
    try:
        # Generate the response
        response = model.generate_content(f'Without any formatting give me a dictionary with the items of the list as key and a two letter language code as value for the list: {prompt}')

        # Print the generated text
        return response.text # return response as dictionary
    except Exception as e:        
        return {
            "error": str(e),
            "prompt": prompt
        }

In [108]:
gemini_limit = 25000 # gemini uses tokens with an output limited of 8,192 tokens as limit which corresponds to about 4 characters

# first make sure that json file for output exist:
file_path = 'language_analysis/gemini_result.json'

In [None]:
def parse_result(result):
    if not isinstance(result, dict):
        try:
            return ast.literal_eval(result)
        except:
            try:
                code = re.search(r"```(?:python)?\s*(.*?)```", result, re.DOTALL | re.IGNORECASE).group(1).strip()
                return ast.literal_eval(code)
            except Exception as e:
                print(f"Failed to parse result: {e}")
                return None
    else:
        return result

try:
    with open(file_path, "r") as file:
        data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    data = {}  # Start with an empty list if file doesn't exist or is invalid
prompts = []
while input_prompt_list:
    prompt = []
    current_length = 0
    while input_prompt_list:
        next_element = input_prompt_list[0]
        next_length = len(next_element)

        if current_length + next_length > gemini_limit:
            break

        prompt.append(input_prompt_list.pop(0))
        current_length += next_length
    result = gemini_call(prompt)
    try:
        parsed = parse_result(result)
        data.update(parsed)
        with open(file_path, "w") as file:
            json.dump(data, file, indent=4)
    except:
        with open('language_analysis/gemini_reponse_error.json', "w") as file:
            error_data = json.load(file)
            error_data.update({"prompt": prompt, "result": result})
            json.dump(error_data, file, indent=4)

print(f'{len(input_prompt_list)} items to go')

Failed to parse result: name 're' is not defined
Failed to parse result: name 're' is not defined
Failed to parse result: name 're' is not defined


In [None]:
with open(file_path, "r") as file:
    data = json.load(file)
df['gemini_lang'] = df['merged_text'].map(data)
df.to_csv('language_analysis/df_langdetect.csv')