In [59]:
# !pip uninstall httpx httpcore
# !pip install --upgrade httpx httpcoreimport pandas as pd
import os
from openai import OpenAI
import math
import json
import re
import builtins
import csv
from deep_translator import GoogleTranslator

In [ ]:
#frequencydf should be a csv with columns: Language (e.g. 'Dutch'), 'POS' and for best results, 'Translation'
language = 'Dutch'
language_code = 'nl' #For the translation API
frequencydf = pd.read_csv(os.path.join('Frequency Lists', language, 'Clean.csv'))
frequencydf

In [60]:
with open('config.json') as config_file:
    config = json.load(config_file)
    open_api_key = config['open_api_key']
client = OpenAI(api_key=open_api_key)

SPLIT THE VOCABULARY INTO GROUPS OF STUDYSETS - default is 100 per set -> 1000, and 250 per set after.

In [69]:
def split_into_studysets(frequencydf, language, initial_splitsize=100, new_splitsize=250):
    os.makedirs(os.path.join(language, 'Vocabulary', 'Split_Sets'), exist_ok=True)
    columns = [language]
    if 'POS' in frequencydf.columns:
        columns.append('POS')
    if 'Translation' in frequencydf.columns:
        columns.append('Translation')
    if 'English' in frequencydf.columns:
        frequencydf.rename(columns={'English': 'Translation'}, inplace=True)
        columns.append('Translation')
    if 'lemma' in frequencydf.columns:
        columns.append('show')
        columns.append('lemma')
    print('columns:', columns)
    
    total_rows = len(frequencydf)
    i = 0
    splitsize = initial_splitsize
    
    while i * splitsize < total_rows:
        if i * splitsize >= 1000:
            splitsize = new_splitsize
            start_index = 1000 + (i - 10) * new_splitsize  # Adjust start index for new splitsize
        else:
            start_index = i * splitsize
        
        end_index = min(start_index + splitsize, total_rows)
        subdf = frequencydf[start_index:end_index][columns]
        subdf.to_csv(language + '/Vocabulary/Split_Sets/' + str(start_index) + '-' + str(end_index) + '.csv')
        
        i += 1
    
    remainder = total_rows % splitsize
    print('remainder=', remainder)
    if remainder > 0:
        remaindersubdf = frequencydf[-remainder:][columns]
        remaindersubdf.to_csv(language + '/Vocabulary/Split_Sets/' + 'Remainder.csv')

    return subdf

# Example usage
df = split_into_studysets(frequencydf, language=language)

columns: ['Dutch', 'POS', 'Translation']
remainder= 248


OPTIONAL: If you only want to do a subset of files for time purposes

In [1]:
selected_files = ['100-200.csv', '0-100.csv', '200-300.csv', '300-400.csv', '700-800.csv', '900-1000.csv', '500-600.csv', '400-500.csv', '600-700.csv', '800-900.csv']
selected_files += ['1000-1250.csv', '1250-1500.csv','1500-1750.csv',
'1750-2000.csv']
# selected_files = ['2000-2250.csv','2250-2500.csv','2500-2750.csv',2750-3000.csv']
# selected_files for only doing a subset to save time

In [70]:
def move_unselected_files(language, selected_files):
    # Create directory for spare sets
    spare_split_sets_dir = os.path.join(language, 'Vocabulary', 'Spare_Split_Sets')
    os.makedirs(spare_split_sets_dir, exist_ok=True)

    split_sets_dir = os.path.join(language, 'Vocabulary', 'Split_Sets')
    all_files = os.listdir(split_sets_dir)
    selected_files_set = set(selected_files)

    # Move files not in selected_files to the spare sets directory
    for file_name in all_files:
        if file_name not in selected_files_set:
            os.rename(
                os.path.join(split_sets_dir, file_name),
                os.path.join(spare_split_sets_dir, file_name)
            )
  # example of selected files
move_unselected_files(language, selected_files)

In [64]:
def get_sets_to_do(post_directory):
    os.makedirs(os.path.join(language, 'Vocabulary', post_directory), exist_ok = True)
    completed_sets = os.listdir(os.path.join(language, 'Vocabulary', post_directory))
    allsets = os.listdir(os.path.join(language, 'Vocabulary', 'split_sets'))
    incomplete_sets = [i for i in allsets if not i in completed_sets]
    if '.DS_Store' in donesets:
        incomplete_sets.remove('.DS_Store')
    # print('Done:', completed_sets)
    # print('To Do:', incomplete_sets)
    return incomplete_sets

GPT4 - Add example sentences for each word

In [71]:
CHATGPT_todosets = get_sets_to_do('ChatGPT_Sets')

Done: ['.DS_Store', '800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']
To Do: ['1000-1250.csv', '1500-1750.csv', '1250-1500.csv', '1750-2000.csv']


In [72]:
def generate_sentence_in_target_language(variable_word, POS, words_to_include, model = 'gpt-4o-2024-05-13', tenses = ['present', 'future', 'past']):
    """
    Generate a sentence in Target Language using a specific word and part of speech, with words from a limited vocabulary, along with its English translation.

    Parameters:
    variable_word (str): The word to include in the sentence.
    POS (str): The part of speech the word should operate as.
    words_to_include (tuple): A list of words to try to include, and how many
    model (str): The model to use for generating the sentence (default is 'gpt-4o-2024-05-13').
    
    # vocabulary (list): A list of words to use in the sentence.
    # tenses (list)

    Returns:
    str: A sentence in Russian and its English translation, separated by a newline.
    """
    
    words_to_include = list(words_to_include[0].sample(words_to_include[1], replace=False))
    # print(variable_word, POS)
    # print('words_to_include:', words_to_include)
    prompt = (
              f"Create a simple sentence in f{language} using basic vocabulary and containing the word '{variable_word}' "
              f"operating as a {POS} part of speech. Also, provide the English translation of the sentence separated by a newline."
              f"Try to include the following words in the sentence: {words_to_include}"
              # f"Use only the following tenses: {tenses}. "
              )

    try:
        # Make the API request
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=50,
            temperature=0.2,
        )

        # Extract and return the output
        output_message = response.choices[0].message.content.strip()
        return output_message

    except Exception as e:
        return f"An error occurred: {str(e)}"

    # Extract and return the output
    output_message = response.choices[0].message.content.strip()
    
    return output_message

In [73]:
#TEST THE API
for index, row in frequencydf.iloc[104:108].iterrows():
    sentence = generate_sentence_in_target_language(row[language], row.POS, (frequencydf.iloc[100:200][language], 2))
    print(sentence)

hoop Noun
['haal', 'eigen']
Ik haal mijn eigen hoop uit het leven.
I draw my own hope from life.
wou Verb
['dagen', 'slecht']
Ik wou dat de dagen niet zo slecht waren.
I wished that the days were not so bad.
vermoord Verb
['kind', 'vroeg']
De man vermoordde het kind vroeg in de ochtend.
The man murdered the child early in the morning.
elke Determiner
['voel', 'kans']
Elke kans voel ik.  
Every opportunity I feel.


In [74]:
def Apply_LLM_to_studyset(s_set_df, set_name, n = 2):
    words_to_include = (s_set_df[language], n)
    s_set_df['ChatGPT_Sentence'] = s_set_df.apply(lambda row: generate_sentence_in_target_language(row[language], row['POS'], words_to_include), axis = 1)
    s_set_df.to_csv(os.path.join(language, 'Vocabulary','ChatGPT_Sets', set_name))
    return s_set_df

In [75]:
def do_all_sets(post_directory, func, language = language):
    """ post-directory: directory where sets are stored
    func: function to be applied to study_sets"""
    os.makedirs(os.path.join(language, 'Vocabulary', post_directory), exist_ok = True)
    for s_set in get_sets_to_do(post_directory):
        if s_set[0] != '.': #IGNORE .DSStore
            s_set_df = pd.read_csv(
                os.path.join(language, 'Vocabulary', 'split_sets', s_set)
            )
            print(s_set)
            func(s_set_df, s_set)
    return

In [77]:
do_all_sets('ChatGPT_Sets', Apply_LLM_to_studyset)

Done: ['1000-1250.csv', '.DS_Store', '800-900.csv', '1500-1750.csv', '100-200.csv', '600-700.csv', '1250-1500.csv', '700-800.csv', '1750-2000.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']
To Do: []


ADD TRANSLATION INFORMATION TO SETS IF NECESSARY:

In [78]:
def add_set_translation_information(s_set_df, set_name, code = language_code):
    LLM_Set = pd.read_csv(os.path.join(language, 'Vocabulary', 'ChatGPT_Sets', set_name))
    
    # Check whether translation is already in the data
    if 'Translation' in LLM_Set.columns:
        if 'POS' in LLM_Set.columns:
            s_set_df = LLM_Set[[language, 'Translation']]
        
    else:
        s_set_df['Translation'] = s_set_df.apply(
            lambda row: GoogleTranslator(source= code, target='en').translate(row[language]), axis=1
        )

    s_set_df.to_csv(os.path.join(language, 'Vocabulary/Translated Sets', set_name), index=False)
    return s_set_df

In [79]:
translation_todosets = get_sets_to_do('Translated Sets')

Done: ['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']
To Do: ['1000-1250.csv', '1500-1750.csv', '1250-1500.csv', '1750-2000.csv']


In [80]:
for set in translation_todosets:
    print(set)
    do_all_sets('Translated Sets', add_set_translation_information)

1000-1250.csv
Done: ['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']
To Do: ['1000-1250.csv', '1500-1750.csv', '1250-1500.csv', '1750-2000.csv']
     Unnamed: 0          Dutch      POS Translation
0          1000           eind     Noun         end
1          1001          deden     Verb         did
2          1002          mijne  Pronoun        mine
3          1003           gooi     Noun       throw
4          1004           gast     Noun       guest
..          ...            ...      ...         ...
245        1245        procent     Noun    per cent
246        1246  neergeschoten     Verb        shot
247        1247          spoor     Noun       track
248        1248      kilometer     Noun  kilometers
249        1249        bezorgd     Verb     Worried

[250 rows x 4 columns]
1000-1250.csv
     Unnamed: 0       Dutch        POS   Translation
0          1500       halve  Adjective 

Make into Quizlet

In [89]:
def make_quizlet_ready_set(set_name):
    #For the quizlet sets, the rows are separated by 3 blank lines 
    df = pd.read_csv(os.path.join(language, 'Vocabulary', 'ChatGPT_Sets', set_name))
    transdf = pd.read_csv(os.path.join(language, 'Vocabulary', 'Translated Sets', set_name))

    df['ChatGPT_Sentence'] = df['ChatGPT_Sentence'].apply(lambda x: re.sub(r'\n+', '\n', x))
    # For French - Removing text within parentheses that start with 'see'
    df['ChatGPT_Sentence'] = df['ChatGPT_Sentence'].str.replace(r'\(see[^)]*\)', '', regex=True)
    df['ChatGPT_Sentence'] = df['ChatGPT_Sentence'].str.strip()
    
    df['quizlet'] = df['POS'] + ' : ' + '*' + transdf['Translation'] + '*' + '\n' + df['ChatGPT_Sentence']
    df = df[[language, 'quizlet']]
    
    os.makedirs(os.path.join(language, 'Vocabulary', 'Quizlet Sets'), exist_ok=True)
    output_path = os.path.join(language, 'Vocabulary', 'Quizlet Sets', set_name.replace('.csv', '.txt'))

    # Saving the data to a tab-separated text file with 3 lines between each row
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for index, row in df.iterrows():
            outfile.write(f"{row[language]}\t{row['quizlet']}\n\n\n")

    print(f"Text file saved to {output_path}")
    return df

In [90]:
def make_English2Language_quizlet_set(set_name):
    df = pd.read_csv(os.path.join(language, 'Vocabulary', 'ChatGPT_Sets', set_name))
    transdf = pd.read_csv(os.path.join(language, 'Vocabulary', 'Translated Sets', set_name))

    df['ChatGPT_Sentence'] = df['ChatGPT_Sentence'].apply(lambda x: re.sub(r'\n+', '\n', x))

    # Removing text within parentheses that start with 'see'
    df['ChatGPT_Sentence'] = df['ChatGPT_Sentence'].str.replace(r'\(see[^)]*\)', '', regex=True).str.strip()
    
    df['quizlet_front'] = df['POS'] + ' : ' + '*' + transdf['Translation'] + '*'
    df['quizlet_back'] =  '*' + df[language] + '*' + '\n' + df['ChatGPT_Sentence']

    df = df[['quizlet_front', 'quizlet_back']]
    os.makedirs(os.path.join(language, 'Vocabulary', 'Reverse Quizlet Sets'), exist_ok=True)
    output_path = os.path.join(language, 'Vocabulary', 'Reverse Quizlet Sets', set_name.replace('.csv', '.txt'))

    # Saving the data to a tab-separated text file with 3 lines between each row
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for index, row in df.iterrows():
            outfile.write(f"{row['quizlet_front']}\t{row['quizlet_back']}\n\n\n")

    print(f"Text file saved to {output_path}")
    return df

In [92]:
# builtin_set = builtins.set
LLM_sets = builtin_set(os.listdir(os.path.join(language, 'Vocabulary', 'ChatGPT_Sets')))
Translated_Sets = builtin_set(os.listdir(os.path.join(language, 'Vocabulary', 'Translated Sets')))
quizlet_todosets = reverse_quizlet_todosets = LLM_sets.intersection(Translated_Sets)

for s_set in quizlet_todosets:
    make_quizlet_ready_set(s_set)
    make_English2Language_quizlet_set(s_set)


{'0-100.csv',
 '100-200.csv',
 '1000-1250.csv',
 '1250-1500.csv',
 '1500-1750.csv',
 '1750-2000.csv',
 '200-300.csv',
 '300-400.csv',
 '400-500.csv',
 '500-600.csv',
 '600-700.csv',
 '700-800.csv',
 '800-900.csv',
 '900-1000.csv'}