In [1]:
import os
from json import JSONDecodeError

from openai import OpenAI
import json
import csv
from tqdm import tqdm

In [6]:
# Read the API key from the environment so no credential is stored in the notebook.
# The empty-string fallback keeps the previous behaviour when the variable is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# Fractions of an article's word count to use as the number of rare words.
rare_word_percentages = [0.02, 0.03, 0.04, 0.05] # Follow the 95%/98% rule, but consider 3% and 4% as well
TARGET_AUDIENCE = 'B1-B2 level' # According to the CEFR
# Language the rare-word translations are given in ("Translation Language" in the prompt).
SOURCE_LANGUAGE = 'English'
# Language the articles are translated into and the rare words are written in.
TARGET_LANGUAGE = 'Spanish'
# One of the prompt's frequency levels: low (1 time), medium (2-3 times), high (4+ times).
RARE_WORD_FREQUENCY = 'medium'

folder_path = './articles'

# Maps "<file stem>_<N>_percent" -> record with the article text, its length in
# whitespace-separated words, and how many rare words to request for it.
articles_dict = dict()

# os.listdir() returns entries in arbitrary order; sort so the dict order
# (and anything that selects articles by position) is reproducible.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, file_name)

    # Read each article once; the same text is shared by all percentage variants.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    article_length = len(content.split())
    base_name = os.path.splitext(file_name)[0]

    for rare_word_percentage in rare_word_percentages:
        # round() rather than int(): float products such as 0.29 * 100 fall just
        # below the integer and would be truncated to the wrong percent label.
        article_name = f"{base_name}_{round(rare_word_percentage * 100)}_percent"
        articles_dict[article_name] = {
            'article_text': content,
            'article_length': article_length,
            'number_of_rare_words': int(article_length * rare_word_percentage),
        }

In [7]:
def parse_rare_words(article_data):
    """Decode a model reply as JSON, falling back to line-based parsing.

    Args:
        article_data: Raw reply text to decode.

    Returns:
        The decoded JSON value when ``article_data`` is valid JSON; otherwise
        the dict produced by ``manually_parse_rare_words``.
    """
    try:
        return json.loads(article_data)
    except json.JSONDecodeError:
        # Not valid JSON: salvage whatever "key: value" lines the text contains.
        return manually_parse_rare_words(article_data)


def manually_parse_rare_words(article_data):
    """Build a dict from the "key: value" lines of a text.

    Each line containing a colon is split at its first colon; both sides are
    stripped of surrounding whitespace. Lines without a colon are skipped, and
    a key that appears more than once keeps its last value.

    Args:
        article_data: Text to scan line by line.

    Returns:
        Dict mapping each stripped key to its stripped value (both strings).
    """
    colon_lines = (row.partition(':') for row in article_data.splitlines() if ':' in row)
    return {head.strip(): tail.strip() for head, _, tail in colon_lines}

In [9]:
def generate_rare_words(article_dict):
    """Ask the chat model for rare nouns, with translations, for one article.

    Args:
        article_dict: Per-article record; its 'number_of_rare_words' and
            'article_text' entries are interpolated into the prompt.

    Returns:
        The value stored under 'rare_words' in the parsed reply. The prompt
        asks for a list of {"rare_word": ..., "translation": ...} objects, but
        the reply's shape is not validated here.

    Raises:
        KeyError: If the parsed reply has no 'rare_words' key.
    """
    word_gen_system_prompt = """You are a bilingual language expert. Your task is to analyze the provided article and suggest nouns that would be unfamiliar to intermediate language learners but commonly known to native speakers.

Key requirements:
1. Generate nouns that native speakers would know but intermediate learners typically haven't learned yet
2. Avoid basic vocabulary that intermediate learners would already know
3. Focus on 2-5 syllable words
4. Exclude highly technical, academic, or specialized terminology

Please, provide content in valid JSON"""

    # The example below must itself be valid JSON (no trailing comma) and is an
    # object, not an array, to agree with response_format={"type": "json_object"}.
    word_gen_user_prompt = f"""Analyze the following article and identify {article_dict['number_of_rare_words']} nouns that would be unfamiliar to intermediate learners but known for native speakers.

Target Language: {TARGET_LANGUAGE}
Translation Language: {SOURCE_LANGUAGE}

For each of the {article_dict['number_of_rare_words']} words, provide:
1. The word in {TARGET_LANGUAGE}
2. Its translation in {SOURCE_LANGUAGE}

Format the output as a JSON object with the following structure:
{{
    "rare_words": [
        {{
            "rare_word": "word in {TARGET_LANGUAGE}",
            "translation": "word in {SOURCE_LANGUAGE}"
        }}
    ]
}}

Article:
{article_dict['article_text']}"""

    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": word_gen_system_prompt.strip()},
            {"role": "user", "content": word_gen_user_prompt.strip()}
        ]
    )

    # parse_rare_words already falls back to manually_parse_rare_words on
    # invalid JSON, so no additional JSONDecodeError handling is needed here.
    final_response = parse_rare_words(response.choices[0].message.content)

    return final_response['rare_words']


# TODO: Control word forms, control word length, rare, different sources (no leg up), not Latinate in root (Germanic)
# TODO: Spanish L2, English L1
# word_list_text = generate_rare_words()
# print("Generated Rare Nouns:\n")
# print(word_list_text)
# print("\n" + "-"*80 + "\n")


In [10]:
def repurpose_article(articles_dict, rare_words):
    """Ask the chat model to translate one article and weave rare words into it.

    Args:
        articles_dict: Record for a SINGLE article (despite the plural name,
            which shadows the module-level dict); only its 'article_text'
            entry is read.
        rare_words: Rare words to insert; interpolated into the prompt as-is.

    Returns:
        The model's reply text with surrounding whitespace stripped. The prompt
        asks for inserted rare words to be marked as **word**.
    """
    system_prompt = """You are a multilingual language expert. Your task is to:
1. Translate the provided article into the target language
2. Incorporate specified rare nouns into the translated text
3. Ensure the final text maintains coherence and is appropriate for the target audience

You will:
- First translate the entire article into the target language using a language that is appropriate for the target audience
- Then insert the provided rare words into the  text according to the specified frequency
- Bold all inserted rare words using **word** format
- Maintain the original article's tone and style while incorporating new vocabulary

Parameters:
- Target Language: The language to translate the article into
- Target Audience: Language proficiency level (A1-C2 CEFR scale)
- Word Frequency: How often to insert rare words (low: 1 time, medium: 2-3 times, high: 4+ times)
- Article: The original article to translate
- Rare Words List: The list of rare words to incorporate into the translated text"""

    # TARGET_AUDIENCE already ends in "level", so the sentence must not append it
    # again. The frequency comes from RARE_WORD_FREQUENCY instead of a literal so
    # the config constant actually takes effect.
    user_prompt = f"""Please follow these steps:
1. Translate this article into {TARGET_LANGUAGE} using language appropriate for {TARGET_AUDIENCE} learners
2. Insert the provided rare nouns according to the specified frequency
3. Mark all inserted rare words in **bold**

Parameters:
- Rare Word Frequency: {RARE_WORD_FREQUENCY}
- Target Audience: {TARGET_AUDIENCE}
- Target Language: {TARGET_LANGUAGE}
- Article: {articles_dict['article_text']}
- Rare Words List: {rare_words}"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt.strip()}
        ]
    )

    return response.choices[0].message.content.strip()

In [11]:
def main(article_index=3):
    """Generate rare words and a rewritten translation, then save the results.

    For each selected article this stores the rare words and rewritten text
    back into ``articles_dict``, writes a CSV of the rare words and a text file
    of the rewritten article, and finally dumps all of ``articles_dict`` to
    './articles_with_rare_words.json'.

    Args:
        article_index: Position in ``articles_dict`` of the single article to
            process. Defaults to 3, the index that was previously hard-coded.
            Pass ``None`` to process every loaded article.

    Raises:
        IndexError: If ``article_index`` is out of range for ``articles_dict``.
    """
    all_items = list(articles_dict.items())
    selected_items = all_items if article_index is None else [all_items[article_index]]

    # Create the output folders up front; open(..., 'w') does not create
    # missing parent directories.
    os.makedirs('./rare_words', exist_ok=True)
    os.makedirs('./articles_with_rare_words', exist_ok=True)

    for article_name, article_dict in tqdm(selected_items):
        rare_words = generate_rare_words(article_dict)

        # Only the target-language words are passed on; translations stay in the CSV.
        article_text_with_rare_words = repurpose_article(
            article_dict,
            [rare_word['rare_word'] for rare_word in rare_words],
        )
        # article_text_with_rare_words = article_text_with_rare_words.replace('**', '') # Remove the bold formatting for now

        article_dict['rare_words'] = rare_words
        article_dict['article_text_with_rare_words'] = article_text_with_rare_words

        # Save the rare words to a CSV file
        with open(f'./rare_words/{article_name}_rare_words.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)

            writer.writerow(['rare_word', 'translation'])
            for item in rare_words:
                writer.writerow([item['rare_word'], item['translation']])

        # Save the text with rare words to a text file
        with open(f'./articles_with_rare_words/{article_name}_with_rare_words.txt', 'w', encoding='utf-8') as f:
            f.write(article_text_with_rare_words)

    with open('./articles_with_rare_words.json', 'w', encoding='utf-8') as json_file: # TODO: should we include datetime in the file name?
        json.dump(articles_dict, json_file, ensure_ascii=False, indent=4)
        

# Entry point: call main() only when this code runs as the main module, not when it is imported.
if __name__ == '__main__':
    main()

100%|██████████| 1/1 [00:17<00:00, 17.35s/it]


In [16]:
# Pick an article that main() has actually processed. The loop variable inside
# main() is local, so a global `article_name` left over from the loading cell
# may point at an article without 'rare_words'.
processed_article_names = [name for name, data in articles_dict.items() if 'rare_words' in data]
if not processed_article_names:
    raise RuntimeError("No processed article found in articles_dict; run main() first.")
article_name = processed_article_names[0]

print(articles_dict[article_name]['rare_words'])
print(articles_dict[article_name]['article_text_with_rare_words'])

[{'rare_word': 'camarero', 'translation': 'waiter'}, {'rare_word': 'chofer', 'translation': 'driver'}, {'rare_word': 'parcel', 'translation': 'plot'}, {'rare_word': 'malezas', 'translation': 'weeds'}, {'rare_word': 'complejo', 'translation': 'complex'}, {'rare_word': 'conveniencia', 'translation': 'convenience'}, {'rare_word': 'realidad', 'translation': 'reality'}, {'rare_word': 'industria', 'translation': 'industry'}, {'rare_word': 'noche', 'translation': 'night'}, {'rare_word': 'residente', 'translation': 'resident'}, {'rare_word': 'solicitud', 'translation': 'request'}, {'rare_word': 'amigo', 'translation': 'friend'}, {'rare_word': 'chofer', 'translation': 'driver'}, {'rare_word': 'frente', 'translation': 'forehead'}, {'rare_word': 'crédito', 'translation': 'credit'}, {'rare_word': 'sandwich', 'translation': 'sandwich'}, {'rare_word': 'empresas', 'translation': 'companies'}, {'rare_word': 'niebla', 'translation': 'fog'}, {'rare_word': 'posición', 'translation': 'position'}, {'rare_w

In [18]:
from collections import Counter
import re

# Pick an article that main() has actually processed. The loop variable inside
# main() is local, so a global `article_name` left over from the loading cell
# may point at an article without 'rare_words'.
processed_article_names = [name for name, data in articles_dict.items() if 'rare_words' in data]
if not processed_article_names:
    raise RuntimeError("No processed article found in articles_dict; run main() first.")
article_name = processed_article_names[0]

text = articles_dict[article_name]['article_text_with_rare_words']

# Remove punctuation (including the ** bold markers) and convert to lowercase
cleaned_text = re.sub(r'[^\w\s]', '', text).lower()

rare_words_dict = articles_dict[article_name]['rare_words']
# Extract the distinct rare words; the generated list can contain duplicates
rare_words = {entry['rare_word'] for entry in rare_words_dict}

# Split the text into words
words = cleaned_text.split()

# Count the frequency of each word
word_counts = Counter(words)

# Look each rare word up under the same normalization as the text, so entries
# containing capitals or punctuation can still match. Counter returns 0 for
# words that never occur. Only exact single-token matches are counted, so
# inflected forms and multi-word entries report 0.
rare_word_counts = {
    word: word_counts[re.sub(r'[^\w\s]', '', word).lower()]
    for word in sorted(rare_words)  # sorted for a reproducible print order
}

# Print all rare words and their counts
for word, count in rare_word_counts.items():
    print(f"{word}: {count}")

camarero: 1
empresas: 1
industria: 1
teléfono: 1
malezas: 1
noche: 1
residente: 0
frente: 0
marido: 0
realidad: 1
humedad: 1
chofer: 1
parcel: 1
sandwich: 1
posición: 1
amigo: 1
conveniencia: 1
solicitud: 1
detestadas: 1
complejo: 1
niebla: 1
crédito: 1
