In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_json)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/multilingual-idioms-indian/malayalam.json
/kaggle/input/multilingual-idioms-indian/hindi.json
/kaggle/input/multilingual-idioms-indian/punjabi.json
/kaggle/input/multilingual-idioms-indian/gujrati.json
/kaggle/input/multilingual-idioms-indian/marathi.json
/kaggle/input/multilingual-idioms-indian/kannada.json
/kaggle/input/multilingual-idioms-indian/telugu.json
/kaggle/input/multilingual-idioms-indian/balanced_combined_idioms.json
/kaggle/input/multilingual-idioms-indian/english.json
/kaggle/input/multilingual-idioms-indian/urdu.json
/kaggle/input/gemma2/keras/gemma2_2b_en/1/config.json
/kaggle/input/gemma2/keras/gemma2_2b_en/1/tokenizer.json
/kaggle/input/gemma2/keras/gemma2_2b_en/1/metadata.json
/kaggle/input/gemma2/keras/gemma2_2b_en/1/model.weights.h5
/kaggle/input/gemma2/keras/gemma2_2b_en/1/assets/tokenizer/vocabulary.spm


In [2]:
# Read the combined multilingual idiom file into a DataFrame and display it.
combined_idioms_path = '/kaggle/input/multilingual-idioms-indian/balanced_combined_idioms.json'
data = pd.read_json(combined_idioms_path)
data

Unnamed: 0,idiom,literal_meaning,figurative_meaning,example,language
0,Make two ends meet,To connect both ends.,Struggling to manage finances.,"After losing his job, he found it hard to make...",English
1,Break a leg,To fracture a leg.,"Wishing someone success, especially before a p...",'Break a leg' before your performance tonight;...,English
2,Through thick and thin,In both thick and thin conditions.,Loyalty and support regardless of circumstances.,They have been friends through thick and thin ...,English
3,Let the cat out of the bag,To release a cat from a bag.,Revealing a secret carelessly.,She let the cat out of the bag about the surpr...,English
4,Elephant in the room,A large elephant present in the room.,A major problem that people avoid discussing.,'We need to address the elephant in the room r...,English
...,...,...,...,...,...
295,بندر کیا جانے ادرک,What does a monkey know of ginger?,Used to indicate that someone does not underst...,'وہ اس کتاب کی اہمیت کو نہیں سمجھتا، بندر کیا ...,Urdu
296,چپ رہنے میں ہی عافیت ہے,'There is safety in silence.','Sometimes it is better to remain silent than ...,'بہت سی باتوں پر چپ رہنے میں ہی عافیت ہے۔',Urdu
297,بندہ باندھیے,Tied up person.,Someone who is dependent on others for help or...,'وہ ہمیشہ دوسرے لوگوں پر منحصر رہتا ہے، واقعی ...,Urdu
298,چپ رہنے میں ہی عافیت ہے,'There is safety in silence.','Sometimes it is better to remain silent than ...,'بہت سی باتوں پر چپ رہنے میں ہی عافیت ہے۔',Urdu


In [3]:
!pip install -q -U keras-nlp
!pip install -q -U keras>=3

In [4]:
import os

# Backend selection and XLA memory fraction are set through environment
# variables in this cell, ahead of the cell that imports keras.
os.environ.update({
    "KERAS_BACKEND": "jax",
    "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
})

In [5]:
import keras
import keras_nlp

In [6]:
from datasets import load_dataset

# Load the combined idiom file with the generic JSON loader; later cells read
# the records from the resulting 'train' split.
idioms_json_path = '/kaggle/input/multilingual-idioms-indian/balanced_combined_idioms.json'
ds = load_dataset("json", data_files=idioms_json_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
# Build the fine-tuning corpus: one instruction/response string per idiom
# record, asking for an idiom that fits a figurative meaning and answering with
# the idiom, its English rendering, an example and a short cultural note.

# Fields the template interpolates; a record lacking any of them is skipped.
required_keys = ("idiom", "literal_meaning", "figurative_meaning", "example", "language")

# The template does not depend on the record, so it is built once, outside the
# loop. The dataset's `literal_meaning` column supplies the English translation.
template = (
    "Instruction:\n"
    "Find a suitable idiom for this situation: {figurative_meaning}\n\n"
    "Response:\n"
    "Original Idiom ({language}): {idiom}\n"
    "English Translation: {literal_meaning}\n"
    "Example Usage: {example}\n"
    "Cultural Context: This idiom comes from {language} and is commonly used when {figurative_meaning}.\n"
)

train_data = ds['train']
data = []
for example in train_data:
    # Require a non-empty value, not merely the key: a field that is present
    # but None would otherwise be formatted into the training text as "None".
    if not all(example.get(key) for key in required_keys):
        continue

    data.append(template.format(**example))


In [None]:
from datasets import DatasetDict
# Collect the formatted training strings built from the dataset's 'train' split.
data = []
train_data = ds["train"]

# Debug aid: show which fields the records actually expose.
print("First example keys:", list(train_data[0].keys()))

# Fields a record must contain before it is turned into a training string.
required_fields = ["idiom", "literal_meaning", "figurative_meaning", "example"]

# Instruction/response layout shared by every training example.
template = (
    "Instruction:\n Which idiom would be suitable for {figurative_meaning}\n\n"
    "Response:\n"
    "Idiom: {idiom}\n\n"
    "Example Usage: {example}"
)

for idx, row in enumerate(train_data):
    # Report and skip records that lack any required field.
    absent = [name for name in required_fields if name not in row]
    if absent:
        print(f"Example {idx} is missing fields: {absent}")
        continue

    try:
        data.append(template.format(**row))
    except KeyError as err:
        # A placeholder in the template had no matching field in this record.
        print(f"KeyError in example {idx}: {str(err)}")
        print(f"Available keys: {list(row.keys())}")

# Keep at most the first 1000 formatted examples.
data = data[:1000]

# Preview the first few formatted examples, numbered from 1.
for number, text in enumerate(data[:5], start=1):
    print(f"Example {number}:\n{text}\n")

In [9]:
# Instantiate the Gemma causal language model from the `gemma2_2b_en` preset
# and print its summary.
preset_name = "gemma2_2b_en"
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(preset_name)
gemma_lm.summary()

In [10]:
# Enable LoRA for the model and set the LoRA rank to 4.
gemma_lm.backbone.enable_lora(rank=4)
gemma_lm.summary()

In [11]:
# Limit the input sequence length to 256 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 256
# Use AdamW (a common optimizer for transformer models).
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
gemma_lm.fit(data, epochs=1, batch_size=1)

[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 473ms/step - loss: 0.9032 - sparse_categorical_accuracy: 0.5776


<keras.src.callbacks.history.History at 0x7e4d2772c9d0>

In [14]:
# Prompt layout used for this inference test. No cell in the notebook defined
# `inference_template`, so the cell failed with NameError on a fresh kernel;
# the text below reproduces the prompt echoed in this cell's saved output.
inference_template = (
    "Instruction:\n"
    "Find a suitable idiom for this situation: {}\n\n"
    "Please provide:\n"
    "1. An appropriate idiom (from any language)\n"
    "2. Its English translation if not in English\n"
    "3. A clear example of usage\n"
    "4. Brief cultural context\n\n"
    "Response:\n"
)

# Test inference
test_situation = "someone who is very nervous before an important event"
prompt = inference_template.format(test_situation)

# Top-k sampling (k=10) to encourage varied output; the fixed seed is passed
# to the sampler so the draw can be repeated.
sampler = keras_nlp.samplers.TopKSampler(k=10, seed=2)
gemma_lm.compile(sampler=sampler)
print(gemma_lm.generate(prompt, max_length=512))


Instruction:
Find a suitable idiom for this situation: someone who is very nervous before an important event

Please provide:
1. An appropriate idiom (from any language)
2. Its English translation if not in English
3. A clear example of usage
4. Brief cultural context

Response:
Original Idiom: കാശി പെട്ടി വളയ്ക്കു.
English Translation: To go into convulsions.
Cultural Context: Malayalam.
Example Usage: 'ഒരു കാര്യത്തിന് ആവശ്യമായ കാശി പെട്ടി വളയ്ക്കുന്നവർ ഇല്ല എങ്കിൽ മതിയായി.'
Brief Cultural Context: The idiom comes from Malayalam and is used when someone is very nervous before an important event.



In [None]:
# Figurative meaning to find an idiom for. No trailing whitespace, so nothing
# stray is inserted between the meaning and the blank line that follows it.
test_meaning = "Someone who is carefree"


# Same instruction/response layout as the training strings built from
# "Instruction:\n Which idiom would be suitable for {figurative_meaning}",
# with a single space between "Which" and "idiom" so the prompt matches the
# text the model was fine-tuned on.
prompt = (
    "Instruction:\n Which idiom would be suitable for {}\n\n"
    "Response:\n"
).format(test_meaning)


# Top-k sampling (k=10) with a fixed seed passed to the sampler.
sampler = keras_nlp.samplers.TopKSampler(k=10, seed=2)
gemma_lm.compile(sampler=sampler)
print(gemma_lm.generate(prompt, max_length=512))