# Sentences Generator with DeepSeek

**Description:**
In this project, we use the DeepSeek API together with Khmer words from the `seanghay/khmer-dictionary-44k` dictionary dataset on [**Hugging Face**](https://huggingface.co/datasets/seanghay/khmer-dictionary-44k) to generate Khmer sentences for the provided words. For each word, we supply the word itself, its part of speech, and its definition, so that every generated sentence conveys the correct meaning, especially for the target word. After this step, we will replace the target words with their homophone pairs and use these sentences to train our model to recognize Khmer homophones.

In [1]:
# pip install openai

In [2]:
# pip install datasets

In [2]:
import json
import os
import time

import requests  # for API calls (if DeepSeek has an HTTP API)
from datasets import load_dataset
from openai import OpenAI

In [None]:
# Configuration
# The API key is read from the DEEPSEEK_API_KEY environment variable so the
# real secret never has to be written into the notebook. The placeholder is
# kept as a fallback for anyone who still prefers to paste the key here.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "your-api-key")
MODEL_NAME = "deepseek-chat"

# Index of the first dictionary entry to process (use 0 to start from the
# beginning of the dictionary).
# NOTE: the main processing loop further down recomputes begin_range,
# end_range and OUTPUT_FILE for every batch it handles, so the values set
# here only matter for code that runs before that loop.
begin_range = 600

number_of_words_per_file = 100  # dictionary entries stored in one output file
end_range = begin_range + number_of_words_per_file  # exclusive upper bound
# JSON is used for the output because every record is a nested structure
OUTPUT_FILE = f"output/data_{begin_range}_{end_range-1}_1.json"
SENTENCES_PER_WORD = 60  # number of sentences to request per word

# Initialize the client: the OpenAI SDK is pointed at the DeepSeek endpoint
# through base_url
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

# Load the Khmer dictionary dataset
khmer_dictionary = load_dataset("seanghay/khmer-dictionary-44k")

# Build the Khmer prompt for one dictionary entry
def generate_sentence(entry, number_of_sentence):
    '''
    Assemble the prompt that asks the model for sentences using one word.

    :param entry: mapping with the string fields 'word', 'pos' and
        'definition' (one record of the Khmer dictionary)
    :param number_of_sentence: how many sentences the prompt should request;
        it is converted with str() before being inserted
    :return: a single prompt string made of fixed Khmer text with the count,
        the word, its part of speech and its definition inserted, in that
        order

    Rough English gloss of the fixed text (translation, for orientation
    only): "Create <n> Khmer sentences with the word <word>, whose part of
    speech is <pos> and which means <definition>. The word should appear at
    different positions, and the sentences should differ in content, context
    and register. Do not skip lines, number the sentences, or use bold."
    '''
    # Trailing instructions that follow the definition in every prompt
    closing_instructions = " ពាក្យនេះស្ថិតនៅទីតាំងផ្សេងៗគ្នានៅក្នុងប្រយោគ ដែលប្រយោគនិមួយៗមានភាពចម្រុះគ្នា មានបរិបទផ្សេងគ្នា កម្រិតភាសាផ្សេងគ្នា និងមានភាពខុសប្លែគ្នា។ ពេលចាប់ផ្តើមប្រយោគនិមួៗកុំរំលងបន្ទាត់ កុំដាក់លេខរៀង ហើយកុំប្រើអក្សរដិតក្នុងប្រយោគ។"

    # Fixed fragments interleaved with the entry's fields, in output order.
    # join() requires every piece to be a str, so a non-string field still
    # raises TypeError as plain concatenation would.
    prompt_parts = [
        "បង្កើតប្រយោគភាសាខ្មែរចំនួន",
        str(number_of_sentence),
        "ប្រយោគជាមួយនឹងពាក្យ ",
        entry['word'],
        " មានថ្នាក់ពាក្យជា ",
        entry['pos'],
        " មានន័យថា ",
        entry['definition'],
        closing_instructions,
    ]
    return "".join(prompt_parts)

# Send one prompt to the DeepSeek chat endpoint and split the reply into lines
def get_sentences_from_deepseek(prompt):
    '''
    Ask the chat model for sentences and return them as a list of strings.

    :param prompt: the user message sent to the model
    :return: the non-blank lines of the model's reply, each stripped of
        surrounding whitespace; an empty list if anything goes wrong (the
        error is printed, not raised)
    '''
    conversation = [
        {"role": "system", "content": "You are a helpful Khmer language assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=conversation,
            temperature=0.7,
            max_tokens=5000,
            stream=False,
        )
        reply_text = completion.choices[0].message.content

        # One sentence is expected per line; drop lines that are blank
        # once surrounding whitespace is removed
        sentences = []
        for raw_line in reply_text.split("\n"):
            cleaned = raw_line.strip()
            if cleaned:
                sentences.append(cleaned)
        return sentences

    except Exception as err:
        # Best effort: report the failure and let the caller carry on with
        # an empty result for this word
        print(f"Error: {str(err)}")
        return []

# Main processing loop
# Each batch j covers the dictionary rows j*100 .. j*100 + 99 and is written
# to its own JSON file. The batch indices are hard-coded in range() below, so
# begin_range, end_range and OUTPUT_FILE are recomputed here for every batch.
number_of_words_per_file = 100  # loop-invariant, so set once

for j in range(8, 10):
    begin_range = j * number_of_words_per_file
    end_range = begin_range + number_of_words_per_file
    OUTPUT_FILE = f"output/data_{begin_range}_{end_range-1}_1.json" # using json format for better structure

    # Create the output directory before any API call is made; otherwise a
    # missing folder is only discovered when saving, after the whole batch
    # has already been generated, and all results would be lost.
    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

    results = []
    # NOTE(review): assumes the "train" split has at least end_range rows;
    # indexing past the end would raise before the batch is saved.
    for i in range(begin_range, end_range): # process every entry of this batch
        entry = khmer_dictionary["train"][i]

        # Build the prompt and ask the model for sentences
        prompt = generate_sentence(entry, SENTENCES_PER_WORD)
        sentences = get_sentences_from_deepseek(prompt)

        # Keep the inputs next to the output so every record is
        # self-describing; sentences is an empty list if the API call failed
        result = {
            "word": entry["word"],
            "pos": entry["pos"],
            "definition": entry["definition"],
            "prompt": prompt,
            "generated_sentences": sentences,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }
        results.append(result)

        # print progress (i+1 is the 1-based absolute dictionary position)
        print(f"Processed {i+1}/{end_range}: {entry['word']} - Got {len(sentences)} sentences")
        time.sleep(1) # rate limiting

    # save all results of this batch
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"Saved results to {OUTPUT_FILE}")