In [169]:
import pandas as pd
import numpy as np
# Load the langdetect results; the first CSV column is used as the row index.
df = pd.read_csv(
    'language_analysis/df_langdetect.csv',
    index_col=0,
)

In [170]:
# Make sure the column that stores Gemini's language codes exists;
# an already existing column is left untouched.
if 'gemini_lang' not in df.columns:
    df['gemini_lang'] = np.nan

In [171]:
# Distinct values currently stored in gemini_lang (NaN marks rows without a label).
df['gemini_lang'].unique()

array(['de', nan, 'fr', 'en', 'ja', 'es', 'tr', 'el', 'it', 'nl', 'pt',
       'da', 'sv', 'ko', 'mul', 'ca', 'zh', 'la', 'vo', '??', 'pl', 'ru',
       'tl', 'und', 'cs'], dtype=object)

In [172]:
# Collect the texts for which langdetect did not return 'fr' and that have
# no Gemini label yet.
input_prompt_list = df.loc[
    (df['merged_langdetect'] != 'fr') & df['gemini_lang'].isna(),
    'merged_text',
].tolist()


In [173]:
# Number of texts that still have to be sent to Gemini.
len(input_prompt_list)

2

In [174]:
# NOTE(review): displays every pending text in full; consider showing a slice
# (e.g. the first few items) if the list grows large.
input_prompt_list

["Livre intégralement en italien. Le delocalizzazioni avvantagiano soltanto un'élite finanziaria. Ora se i tassi di cambio sono stabiliti nel rispetto della parità dei poteri d'acquisto le delocalizzazioni perdono ogni interesse. Il ruolo dei responsabili politici economici e monetari è messo in causa dato che essi sono tutti sotto l'influenza di coloro che dalle delocalizzazioni traggono profitto. L'autore dimostra che esistono soluzioni economiche : occorre solo il coraggio politico di applicarle.",
 'Lanceur complet pour Moteur de tondeuse Briggs & stratton Moteur Briggs & stratton - Lanceur complet compatible avec 12h809-1795-b1 124t05 122682 122h82 111607 12q507 12s882 123j09 12d802 123j02 12g882 12z802 12m887 125607 12g807 12s512 127712 128812 12g602 12j612 12s702 12e707 120612 125h82 127782 128m07 12s602 127802 129h07 12f889 120609 205412 12s605 128h02 12j882 12e787 12h612 129h02 12g612 12s612 204437 12e802 125602 120l02 12h672 120t02 125k02 12h887 12k602 12g702 12f887 121686 12

In [163]:
# Character length of each pending text (the batching below is based on these lengths).
# NOTE(review): the recorded output lists three lengths although the list above
# holds two items - this cell was executed earlier; re-run after a kernel restart.
[len(x) for x in input_prompt_list]

[502, 4460, 1120]

In [164]:
import google.generativeai as genai
import os # used for importing the API KEY
from pathlib import Path
import re
import ast
from dotenv import load_dotenv
import json

# Load environment variables from .env file
load_dotenv()

# Get the API key from the environment variables
api_key = os.environ.get("GEMINI_API_KEY")

if not api_key:
    # Raise instead of print + exit(): inside a notebook exit() targets the
    # kernel itself rather than reporting a catchable error, whereas an
    # exception stops this cell (and "Run All") right here with a clear message.
    raise RuntimeError("Error: GEMINI_API_KEY not found in environment variables or .env file.")

genai.configure(api_key=api_key)

# Select the Gemini model
model = genai.GenerativeModel('gemini-2.0-flash')

In [165]:
def gemini_call(prompt):
    '''Ask Gemini for the language of every text in ``prompt``.

    The texts are embedded in a fixed instruction that requests a dictionary
    mapping each list item to a two letter language code.

    Args:
        prompt: the list of texts to classify; it is interpolated into the
            instruction string as-is.

    Returns:
        The raw text of the model response (a string that still has to be
        parsed), or - if anything raises while calling the model or reading
        the response - a dict with the keys ``"error"`` (the exception
        message) and ``"prompt"`` (the input that failed).
    '''
    instruction = 'Without any formatting give me a dictionary with the items of the list as key and a two letter language code as value for the list: '
    try:
        reply = model.generate_content(f'{instruction}{prompt}')
        # Reading .text stays inside the try block so that a failure while
        # accessing the response is reported the same way as a failed call.
        return reply.text
    except Exception as exc:
        failure = {"error": str(exc), "prompt": prompt}
        return failure

In [None]:
# Upper bound on the total number of characters of text sent in one request.
# Per the original note: Gemini caps its output at 8,192 tokens and one token
# corresponds to roughly 4 characters, so overly large batches can fail.
# A high limit needs fewer calls but produces more failures.  One option is to
# start high (e.g. 25000) and then lower it (e.g. 18000) to pick up the
# elements that failed first.  Alternatively, simply run the script several
# times: the slices shift on every run, so another grouping may succeed.
gemini_limit = 25_000

# Path of the JSON file used for the Gemini output.
file_path = "language_analysis/gemini_result.json"

In [167]:
def parse_result(result):
    """Turn a Gemini response into a ``{text: language_code}`` dictionary.

    Args:
        result: either a dict (returned unchanged) or the raw response text.
            The text may be a bare Python/JSON dictionary literal or a
            dictionary wrapped in a Markdown code fence with any language tag
            (```` ```python ````, ```` ```json ````, or none).

    Returns:
        The parsed dict, or ``None`` when no dictionary could be extracted.
        A failure is reported on stdout and never raises.
    """
    if isinstance(result, dict):
        return result
    if not isinstance(result, str):
        print(f"Failed to parse result: expected str or dict, got {type(result).__name__}")
        return None

    # Try the whole response first, then the content of the first code fence.
    candidates = [result.strip()]
    fenced = re.search(r"```[a-zA-Z]*\s*(.*?)```", result, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1).strip())

    last_error = None
    for candidate in candidates:
        # literal_eval handles Python literals (single quotes, None);
        # json.loads covers JSON-only tokens such as null / true / false.
        for loader in (ast.literal_eval, json.loads):
            try:
                parsed = loader(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                last_error = e
                continue
            if isinstance(parsed, dict):
                return parsed
            last_error = TypeError(f"expected a dict, got {type(parsed).__name__}")

    print(f"Failed to parse result: {last_error}")
    return None

# Load the results stored by earlier runs so that this run extends the JSON
# file instead of starting over.
try:
    with open(file_path, "r") as file:
        data = json.load(file)
except (FileNotFoundError, json.JSONDecodeError):
    data = {}  # Start with an empty dict if file doesn't exist or is invalid

# The most recent failed batch (prompt + raw result) is recorded here.
error_path = Path("language_analysis/gemini_reponse_error.json")

# Consume input_prompt_list batch by batch; every batch holds as many texts as
# fit into gemini_limit characters.
while input_prompt_list:
    prompt = []
    current_length = 0
    while input_prompt_list:
        next_length = len(input_prompt_list[0])

        # Always accept the first text of a batch, even if it alone exceeds
        # gemini_limit: otherwise the batch would stay empty, nothing would be
        # consumed and the outer loop would never terminate.
        if prompt and current_length + next_length > gemini_limit:
            break

        prompt.append(input_prompt_list.pop(0))
        current_length += next_length

    result = gemini_call(prompt)
    try:
        # gemini_call reports a failed request as a dict with an "error" key;
        # that dict must not be merged into the language results.
        if isinstance(result, dict) and "error" in result:
            raise RuntimeError(result["error"])

        parsed = parse_result(result)
        if parsed is None:
            raise ValueError("Gemini response could not be parsed")

        data.update(parsed)
        # Persist after every batch so that an interruption loses at most one batch.
        with open(file_path, "w") as file:
            json.dump(data, file, indent=4)
    except Exception:
        # Best effort: record the failing batch and continue with the next one.
        try:
            error_data = json.loads(error_path.read_text())
        except (FileNotFoundError, json.JSONDecodeError):
            error_data = {}

        error_data.update({"prompt": prompt, "result": result})
        error_path.write_text(json.dumps(error_data, indent=4))

    print(f'{len(input_prompt_list)} items to go')

0 items to go


In [168]:
# Reload the accumulated Gemini answers (text -> language code) from disk.
with open(file_path, "r") as file:
    data = json.load(file)

# Look up every text in the Gemini results.  A text that is missing from the
# JSON file keeps the label the column already holds instead of being reset
# to NaN (the JSON file may have been recreated empty, while the CSV still
# carries labels from earlier runs).
df['gemini_lang'] = df['merged_text'].map(data).fillna(df['gemini_lang'])
df.to_csv('language_analysis/df_langdetect.csv')