In [ ]:
# !pip uninstall httpx httpcore
# !pip install --upgrade httpx httpcore

In [1]:
import pandas as pd
import os
from openai import OpenAI
import math
import json
import re

In [2]:
with open('config.json') as config_file:
    config = json.load(config_file)
    open_api_key = config['open_api_key']
language = 'Russian'
client = OpenAI(api_key=open_api_key)

In [5]:
def get_sets_to_do(pre_directory, post_directory):
    donesets = os.listdir(os.path.join(language, 'Vocabulary', post_directory))
    donesets.remove('.DS_Store')
    allsets = os.listdir(os.path.join(language, 'Vocabulary', pre_directory))
    notdonesets = [i for i in allsets if not i in donesets]
    todosets = notdonesets#['900-1000.csv']
    todosets.remove('.DS_Store')
    print(todosets)
    return todosets

Clean

In [6]:
def clean(language = 'Russian'):
    df = pd.read_csv(language +'/Vocabulary/Russian1000', sep = '\t')
    df['Russian'] = df['Лемма ']
    df['POS'] = df['Часть речи ']
    df = df[[language, 'POS']]
    pos_mapping = {
    's': 'noun',
    'advpro': 'adverb pronoun',
    'a': 'adjective',
    'adv': 'adverb',
    'v': 'verb',
    'part': 'particle',
    'conj': 'conjunction',
    'spro': 'subject pronoun',
    'apro': 'adjective pronoun',
    'num': 'numeral'
}
    df['POS'] = df['POS'].replace(pos_mapping)
    df.to_csv(language +'/Vocabulary/Clean')
    print(df)
    return df
df = clean()

           Russian              POS
1                и      conjunction
2                в               pr
3               не         particle
4               на               pr
5                я  subject pronoun
...            ...              ...
996          слава             noun
997          кухня             noun
998    определение             noun
999   пользоваться             verb
1000       быстрый        adjective

[1000 rows x 2 columns]


VOCABULARY SPLIT

In [7]:
def split_into_studysets(df, language, splitsize = 100):
    frequencydf = df
    # frequencydf['Frequency'] = frequencydf['Frequency'].astype(int)
    # frequencydf['cumsum'] = frequencydf['Frequency'].cumsum()/(0.01 * frequencydf['Frequency'].sum())
    for i in range(math.floor(len(frequencydf)/splitsize)):
        subdf = frequencydf[splitsize*i: splitsize*(i+1)][[language, 'POS']]
        subdf.to_csv(language +'/Vocabulary/Split_Sets/' + str(splitsize*i) + '-' + str(splitsize*(i+1)) + '.csv')
    print('remainder=', -(len(frequencydf)%splitsize))
    remainder = -(len(frequencydf)%splitsize)
    if remainder > 0:
        remaindersubdf = frequencydf[remainder:][[language, 'POS']]
        if len(remaindersubdf) != splitsize:
            remaindersubdf.to_csv(language + '/Vocabulary/Split_Sets/' + 'Remainder.csv')

    return subdf
df = split_into_studysets(df, language = 'Russian')

remainder= 0


CHATGPT

In [9]:
todosets = get_sets_to_do('Split_Sets','ChatGPT_Sets')

['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']


In [10]:
def generate_sentence_in_russian(variable_word, POS, words_to_include, model = 'gpt-4o-2024-05-13', tenses = ['present', 'future', 'past']):
    """
    Generate a sentence in Russian using a specific word and part of speech, with words from a limited vocabulary, along with its English translation.

    Parameters:
    variable_word (str): The word to include in the sentence.
    POS (str): The part of speech the word should operate as.
    words_to_include (tuple): A list of words to try to include, and how many
    model (str): The model to use for generating the sentence (default is 'gpt-4o-2024-05-13').
    
    # vocabulary (list): A list of words to use in the sentence.
    # tenses (list)

    Returns:
    str: A sentence in Russian and its English translation, separated by a newline.
    """
    
    print(variable_word)
    words_to_include = list(words_to_include[0].sample(words_to_include[1], replace=False))
    # vocabulary = vocabulary
    # vocabulary_str = ', '.join(vocabulary)
    prompt = (
              # f"Use only the following Russian words: {vocabulary_str}. "
        
              f"Create a simple sentence in Russian using basic vocabulary and containing the word '{variable_word}' "
              f"operating as a {POS} part of speech. Also, provide the English translation of the sentence separated by a newline."
              f"Try to include the following words in the sentence: {words_to_include}"
              # f"Use only the following tenses: {tenses}. "
              )

    try:
        # Make the API request
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=50,
            temperature=0.2,
        )

        # Extract and return the output
        output_message = response.choices[0].message.content.strip()
        return output_message

    except Exception as e:
        return f"An error occurred: {str(e)}"

    # Extract and return the output
    output_message = response.choices[0].message.content.strip()
    return output_message
R1000 = pd.read_csv('Russian/Vocabulary/Clean')

In [13]:
# ', '.join(R1000.iloc[:100].Russian)

In [14]:
# for index, row in R1000.iloc[:3].iterrows():
#     print(row.Russian, row.POS)
#     
#     sentence = generate_sentence_in_russian(row.Russian, row.POS, (R1000.iloc[300:310].Russian, 2))
#     print(sentence)

In [15]:
def chatonset(s_set_df, set_name, n = 2):
    words_to_include = (s_set_df.Russian, n)
    s_set_df['ChatGPT_Sentence'] = s_set_df.apply(lambda row: generate_sentence_in_russian(row['Russian'], row['POS'], words_to_include), axis = 1)
    s_set_df.to_csv(os.path.join('Russian/Vocabulary/ChatGPT_Sets', set_name))
    return s_set_df

In [132]:
def do_all_sets(pre_directory, post_directory, func, language = 'Russian'):
    allsets = os.listdir(os.path.join(language, 'Vocabulary', pre_directory))
    todosets = get_sets_to_do(pre_directory, post_directory)
    # print(allsets)
    # print(todosets)
    for s_set in todosets:
        if s_set[0] != '.':
        # if s_set == '500-600.csv':
            # print(s_set)
            s_set_df = pd.read_csv(
                os.path.join(language, 'Vocabulary', pre_directory, s_set)
            )
            # print(s_set_df)
            func(s_set_df, s_set)
            
    return

In [18]:
do_all_sets('Split_Sets', 'ChatGPT_Sets', chatonset)

['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']
['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '500-600.csv', '200-300.csv']
    Unnamed: 0     Russian             POS
0          801       текст            noun
1          802        сюда  adverb pronoun
2          803      темный       adjective
3          804      защита            noun
4          805  предлагать            verb
..         ...         ...             ...
95         896  финансовый       adjective
96         897    открытый       adjective
97         898   почему-то  adverb pronoun
98         899     значить            verb
99         900  возникнуть            verb

[100 rows x 3 columns]
текст
сюда
темный
защита
предлагать
руководство
вовсе
площадь
сознание
гражданский
убить
возраст
молчать
согласиться
участник
участок
рано
пункт
несмотря
сильно

ADD TRANSLATION INFORMATION:

In [19]:
from deep_translator import GoogleTranslator

In [20]:
def add_set_translation_information(s_set_df, set_name):
    # Apply the translation
    s_set_df['Translation'] = s_set_df.apply(
        lambda row: GoogleTranslator(source='ru', target='en').translate(row['Russian']), axis=1
    )

    # Save to CSV
    s_set_df.to_csv(os.path.join('Russian/Vocabulary/Translated Sets', set_name), index=False)
    return s_set_df

In [24]:
translation_todosets = get_sets_to_do('ChatGPT_Sets', 'Translated Sets')

['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '200-300.csv']


In [26]:
for set in translation_todosets:
    do_all_sets('ChatGPT_Sets', 'Translated Sets', add_set_translation_information)

['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '200-300.csv']
['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '0-100.csv', '300-400.csv', '900-1000.csv', '400-500.csv', '200-300.csv']
    Unnamed: 0.1  Unnamed: 0     Russian             POS  \
0              0         801       текст            noun   
1              1         802        сюда  adverb pronoun   
2              2         803      темный       adjective   
3              3         804      защита            noun   
4              4         805  предлагать            verb   
..           ...         ...         ...             ...   
95            95         896  финансовый       adjective   
96            96         897    открытый       adjective   
97            97         898   почему-то  adverb pronoun   
98            98         899     значить            verb   
99            99         900  возникнуть            verb   

         

ConnectionError: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=ru&q=%D1%80%D1%83%D0%BA%D0%B0 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x11033fa10>: Failed to resolve 'translate.google.com' ([Errno 8] nodename nor servname provided, or not known)"))

Make into Quizlet

In [ ]:
# def QuizletCombine(language):
#     dfs = os.listdir(language + '/Vocabulary/Quizlet Sets')
#     dfs.remove('.DS_Store')
#     if 'All.csv' in dfs:
#         dfs.remove('All.csv')
#     df = pd.concat([pd.read_csv(language + '/Vocabulary/Quizlet Sets/' + set) for set in dfs])
#     # df['QuizletMore'] = df['Quizlet'] + '\n' + df['MoreInfo']
#     df = df[[language, 'Quizlet']]
#     return df
# all = QuizletCombine(language)
# all['Quizlet'] = all['Quizlet'] + '\n\n\n'
# all.to_csv('Dutch/Vocabulary/Quizlet Sets/All.csv')


In [162]:
def make_quizlet_set(df, set_name):
    df['ChatGPT_Sentence'] = df['ChatGPT_Sentence'].apply(lambda x: re.sub(r'\n+', '\n', x))
 
    # Create the 'quizlet' column with POS : Translation and exactly two newlines between Translation and ChatGPT_Sentence
    df['quizlet'] = df['POS'] + ' : ' + '*' + df['Translation'] + '*' + '\n\n' + df['ChatGPT_Sentence']


    df = df[['Russian', 'quizlet']]
    output_path = os.path.join('Russian/Vocabulary/Quizlet Sets', set_name)
    df.to_csv(output_path, index=False, quoting=csv.QUOTE_MINIMAL)
    # print(f"CSV file saved to {output_path}")
    return df

In [163]:
import re
quizlet_todosets = get_sets_to_do('Translated Sets', 'Quizlet Sets')
for set in quizlet_todosets:
    do_all_sets('Translated Sets', 'Quizlet Sets', make_quizlet_set)
    
import csv
def makequizlet_step2(input_file):
    output_file = input_file 
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        rows = list(reader)
    
    # Write the output CSV file with extra newlines
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
        for row in rows:
            writer.writerow(row)
            writer.writerow([])  # Add a blank row
            writer.writerow([])  # Add a blank row
            writer.writerow([])
    
    print(f"Modified CSV file saved as {output_file}")
    
# for file in os.listdir('Russian/Vocabulary/Quizlet Sets'):
#     if file[0] != '.':
#         print(file)
#         makequizlet_step2('Russian/Vocabulary/Quizlet Sets/' + file)


['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '500-600.csv']
['800-900.csv', '100-200.csv', '600-700.csv', '700-800.csv', '500-600.csv']
[]
[]
[]
[]


In [158]:
# def remove_quotes_from_csv(file_path):
#     # Read the CSV file and remove quotes
#     with open(file_path, 'r', newline='') as infile:
#         reader = csv.reader(infile)
#         rows = [[cell.replace('"', '') for cell in row] for row in reader]
#     
#     # Write the cleaned data back to the CSV file
#     with open(file_path, 'w', newline='') as outfile:
#         writer = csv.writer(outfile)
#         writer.writerows(rows)

In [164]:
for file in os.listdir('Russian/Vocabulary/Quizlet Sets'):
    if file[0] != '.':
        print(file)
        makequizlet_step2('Russian/Vocabulary/Quizlet Sets/' + file)

800-900.csv
Modified CSV file saved as Russian/Vocabulary/Quizlet Sets/800-900.csv
100-200.csv
Modified CSV file saved as Russian/Vocabulary/Quizlet Sets/100-200.csv
600-700.csv
Modified CSV file saved as Russian/Vocabulary/Quizlet Sets/600-700.csv
700-800.csv
Modified CSV file saved as Russian/Vocabulary/Quizlet Sets/700-800.csv
500-600.csv
Modified CSV file saved as Russian/Vocabulary/Quizlet Sets/500-600.csv
