In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q -U keras-nlp datasets
!pip install -q -U keras


In [None]:
import os

# Keras chooses its backend when it is first imported, so the environment
# must be configured BEFORE `keras` / `keras_nlp` are imported below.
os.environ["KERAS_BACKEND"] = "jax"
# Avoid memory fragmentation on the JAX backend.
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"

import keras_nlp
import keras



In [None]:

# Backend-related environment settings.
# KERAS_BACKEND selects the backend and only has an effect when it is set
# before Keras is imported; XLA_PYTHON_CLIENT_MEM_FRACTION is set to avoid
# memory fragmentation on the JAX backend.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

In [None]:
# Optional half-precision switch (left disabled).
#keras.config.set_floatx("bfloat16")

# ---- Training configuration -------------------------------------------
# Preset name handed to the model and tokenizer `from_preset` calls.
model_id = "gemma2_instruct_2b_en"

# Formatted examples are kept only when their token count is below this
# value; text_gen also passes it as max_length when generating.
token_limit = 256

# Number of epochs passed to fit().
train_epoch = 10

# Further knobs; they are not referenced by the cells visible in this notebook.
num_data_limit = 100
lora_name = "swahili"
lora_rank = 4
lr_value = 1e-4

In [None]:
# Build the Gemma causal language model from the preset named by `model_id`
# and display its summary.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(model_id)
gemma_lm.summary()

In [None]:
# Baseline check before the fine-tuning cell runs: ask the model for a
# Swahili-to-English translation (max_length=256) and print what it generates.
print(gemma_lm.generate("translate: mama amezaa mtoto, to english?",max_length=256))

In [None]:
import pandas as pd
# Read the translation CSV, label its columns 'swahili' and 'english',
# and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/swahili-dataset-for-gemma/final1_swahili_english.csv'
).set_axis(['swahili', 'english'], axis=1)
df.head()

In [None]:
import pandas as pd
import json

# Source CSV and the JSON file that will be written to the working directory.
csv_file_path = '/kaggle/input/swahili-dataset-for-gemma/final1_swahili_english.csv'
json_file_path = 'swahili_english_translations.json'

# Read the CSV and turn it into a list holding one {column: value} dict per row.
data = pd.read_csv(csv_file_path)
json_data = data.to_dict(orient='records')

# Write the records as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as they are instead of escaping them.
with open(json_file_path, mode='w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, indent=4, ensure_ascii=False)

print(f"CSV file has been converted to JSON and saved as {json_file_path}.")

In [None]:
from datasets import load_dataset

# Load the translations JSON (the file written by the CSV-to-JSON cell)
# through the `datasets` "json" loader.
dataset = load_dataset("json", data_files="swahili_english_translations.json")

# Preview one record: index 1 of the 'train' split, i.e. the second record.
print(dataset['train'][1])

In [None]:
LoRA_rank = 2 # LoRA rank used below; you can modify this
# NOTE(review): the configuration cell also defines `lora_rank = 4`, which is
# not used here — confirm which value is intended.
# Enable LoRA on the model backbone with the chosen rank (e.g. 2, 4, ...).
gemma_lm.backbone.enable_lora(rank=LoRA_rank)
# Display the model summary again now that LoRA is enabled.
gemma_lm.summary()

In [None]:
# Limit the input sequence length (to control memory usage). The limit is
# taken from `token_limit` (256 in the configuration cell) instead of a second
# hard-coded 256, so it cannot drift away from the threshold used to filter
# the training examples.
gemma_lm.preprocessor.sequence_length = token_limit

# Use AdamW (a common optimizer for transformer models).
# NOTE(review): the configuration cell defines `lr_value = 1e-4`, but the
# learning rate used here is 5e-5 — confirm which one is intended.
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Compile with sparse categorical cross-entropy on logits and track
# sparse categorical accuracy as a weighted metric.
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
# Tokenizer for the same preset as the model; a later cell uses it to count
# the tokens of each formatted training example.
tokenizer = keras_nlp.models.GemmaTokenizer.from_preset(model_id)

from datasets import load_dataset

# Load the translations JSON into `dataset`.
# NOTE(review): this repeats the import and the load_dataset call made in an
# earlier cell.
dataset = load_dataset("json", data_files="swahili_english_translations.json")



In [None]:
# Restrict the dataset output to the "kiswahili" and "english" columns and
# request "np" formatted values. No tokenization happens in this cell.
# NOTE(review): assumes the JSON records carry a "kiswahili" column; an earlier
# cell renames the first column to 'swahili' only on the separate `df` frame —
# verify against the CSV header.

data = dataset.with_format(
    "np", columns=["kiswahili", "english"], output_all_columns=False
)

In [None]:
# Print the formatted dataset object for inspection.
print(data)

In [None]:
def _as_chat_example(row):
    """Render one dataset row as a Gemma chat turn pair.

    The row's 'kiswahili' text becomes the user turn and its 'english' text
    becomes the model turn.
    """
    return f"<start_of_turn>user\n{row['kiswahili']}<end_of_turn>\n<start_of_turn>model\n{row['english']}<end_of_turn>"


# Format every row of the 'train' split and keep only the examples whose
# token count is below `token_limit`; longer ones are skipped.
train = [
    example
    for example in map(_as_chat_example, data['train'])
    if len(tokenizer(example)) < token_limit
]

In [None]:
# Split the filtered examples: the first `train_size` are used for training,
# everything after them is held out.
# The held-out slice starts exactly at `train_size`; starting it one position
# later would silently drop the example at that index from both sets.
train_size = 3000
new_train = train[:train_size]
vallid = train[train_size:]

In [None]:
# Sanity-check the split: number of training examples, the first formatted
# training example, and the number of held-out examples.
print(len(new_train))
print(new_train[0])
print(len(vallid))
# Disabled spot-check of a single held-out example.
#print(vallid[2001])


In [None]:
# Fine-tune on the training examples, one example per batch, for
# `train_epoch` epochs; `history` keeps the values recorded during training.
history = gemma_lm.fit(new_train, epochs=train_epoch, batch_size=1)

import matplotlib.pyplot as plt

# Plot the recorded training loss (one point per epoch) with a title and
# axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.set(title="Training loss", xlabel="Epoch", ylabel="Loss")
plt.show()

In [None]:
# Repeat the same translation prompt used before training so the output of
# the fine-tuned model can be compared with the earlier baseline.
print(gemma_lm.generate("translate: mama amezaa mtoto, to english?",max_length=256))

In [None]:
def text_gen(prompt):
    """Generate a reply to `prompt` with `gemma_lm` and print it.

    The prompt is wrapped as the user turn of the chat template and followed
    by an opened model turn; generation is capped at `token_limit`. The
    result is printed under a "Gemma output:" header; nothing is returned.
    """
    chat_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    completion = gemma_lm.generate(chat_prompt, max_length=token_limit)
    print("\nGemma output:", completion, sep="\n")

In [None]:
import pickle

# Serialize the whole `gemma_lm` object with pickle into 'model.pkl' in the
# current working directory.
# NOTE(review): unpickling executes code embedded in the file, so 'model.pkl'
# must only ever be loaded from a trusted source. Keras also offers native
# saving APIs — consider whether one of those is a better fit here.
with open('model.pkl', 'wb') as file:
    pickle.dump(gemma_lm, file)

print("Model saved as model.pkl")
