In [None]:
# SETUP & INSTALLATION
!pip install datasets transformers sentence-transformers faiss-cpu torch

import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import faiss
import pickle
import os

In [None]:
# LOAD & PREPARE DATA
print("Loading Datasets...")

# Load Medicine Data
url = "https://raw.githubusercontent.com/MinSiThu/Burmese-Microbiology-1K/main/data/Microbiology.csv"
medicine_df = pd.read_csv(url)
# Coerce every cell to a plain Python str, in place (no rows are dropped, so
# question i still pairs with answer i). pandas yields float NaN for empty
# CSV fields; left as-is, such values would reach model.encode() and the
# pickled answer list later in the notebook.
med_questions = [str(question) for question in medicine_df['Instruction'].tolist()]
med_answers = [str(answer) for answer in medicine_df['Output'].tolist()]

# Load Agriculture Data
dataset = load_dataset("chuuhtetnaing/myanmar-instruction-tuning-dataset")
def is_agriculture(example):
    """Tell whether an example's 'inputs' text contains an agriculture keyword.

    Used as the predicate passed to ``Dataset.filter``.

    Args:
        example: mapping with an 'inputs' entry (a missing key still raises
            ``KeyError``, so a wrong dataset schema is not silently hidden).

    Returns:
        bool: True if any keyword occurs as a substring of ``example['inputs']``;
        False otherwise, including when the value is None or not a string.
    """
    # Matching is plain substring search, so any text containing one of these
    # sequences is kept, whatever the surrounding words are.
    keywords = ("လယ်သမား", "စိုက်ပျိုးရေး", "လယ်ယာ", "စပါး", "ပင်ပေါက်", "သစ်တော", "ရေမြေ", "သီးနှံ", "သတ်မှတ်ချက်", "သစ်ပင်")
    text = example['inputs']
    if not isinstance(text, str):
        # `keyword in None` would raise TypeError and abort the whole filter;
        # a row without text simply cannot match.
        return False
    return any(keyword in text for keyword in keywords)

agriculture_dataset = dataset['train'].filter(is_agriculture)
# Materialise both columns as plain lists of str. Depending on the installed
# `datasets` version, indexing a Dataset by column name may hand back a lazy
# column object rather than a list; these names are later encoded and
# pickled, so they must be simple, self-contained Python lists.
agri_questions = [str(question) for question in agriculture_dataset['inputs']]
agri_answers = [str(answer) for answer in agriculture_dataset['targets']]

# Combine for Training
# Every (question, answer) pair becomes an InputExample for the model.
# Medicine pairs come first, then agriculture pairs.
qa_sources = (
    (med_questions, med_answers),
    (agri_questions, agri_answers),
)
train_examples = [
    InputExample(texts=[str(q), str(a)])
    for source_questions, source_answers in qa_sources
    for q, a in zip(source_questions, source_answers)
]

print(f"Total Training Pairs: {len(train_examples)}")

In [None]:
# FINE-TUNE THE MODEL
print("Loading Pre-trained Model...")
# Base checkpoint that gets fine-tuned on the Q&A pairs
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)

# Loss wraps the model; batches of 32 pairs are drawn in shuffled order
train_loss = losses.MultipleNegativesRankingLoss(model)
train_dataloader = DataLoader(train_examples, batch_size=32, shuffle=True)

# Run training with the settings collected in one place
print("Starting Fine-Tuning (This updates the neural network)...")
fit_settings = dict(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    warmup_steps=100,
    show_progress_bar=True,
)
model.fit(**fit_settings)

# Persist the fine-tuned weights next to the notebook
output_path = "./fine_tuned_burmese_model"
model.save(output_path)
print(f"Fine-tuned model saved to: {output_path}")

In [None]:
# RE-GENERATE FAISS INDEXES (Using NEW Model)
# The indexes below are built with the notebook-level `model`, i.e. the one
# that was fine-tuned and saved in the training cell.
print("Regenerating FAISS Indexes with the new brain...")

# Function to save index and answers
def create_index(questions, answers, name, encoder=None):
    """Embed questions into a FAISS L2 index and pickle the matching answers.

    Writes two files into the current working directory:
    ``{name}_faiss.index`` and ``{name}_answers.pkl``. Vector i of the index
    is the embedding of question i, and element i of the pickled list is its
    answer (presumably looked up by the FAISS result position downstream).

    Args:
        questions: iterable of question texts; each item is converted with str().
        answers: iterable of answers, same length and order as ``questions``;
            each item is converted with str().
        name: file-name prefix for the two output files.
        encoder: optional object with a SentenceTransformer-style
            ``encode(texts, convert_to_numpy=..., show_progress_bar=...)``
            method. Defaults to the notebook-level ``model``.

    Raises:
        ValueError: if ``questions`` is empty or its length differs from
            ``answers``.
    """
    # Plain lists of str: safe to encode, and the pickle does not depend on
    # pandas/datasets objects or contain non-string cells such as NaN.
    question_texts = [str(question) for question in questions]
    answer_texts = [str(answer) for answer in answers]

    if not question_texts:
        raise ValueError(f"No questions supplied for index '{name}'.")
    if len(question_texts) != len(answer_texts):
        # A mismatch would silently map search hits to the wrong answers.
        raise ValueError(
            f"Index '{name}': {len(question_texts)} questions but "
            f"{len(answer_texts)} answers."
        )

    if encoder is None:
        encoder = model  # fine-tuned model defined at notebook level

    embeddings = encoder.encode(question_texts, convert_to_numpy=True, show_progress_bar=True)

    # NOTE(review): the index ranks by L2 distance on un-normalised vectors,
    # while training used MultipleNegativesRankingLoss - confirm the query
    # side uses the same metric before changing either.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Save Index
    faiss.write_index(index, f"{name}_faiss.index")

    # Save Answers
    with open(f"{name}_answers.pkl", "wb") as f:
        pickle.dump(answer_texts, f)
    print(f"Saved {name} index and answers.")

# One index per domain: (questions, answers, file-name prefix).
# Medicine is built first, then agriculture.
index_jobs = [
    (med_questions, med_answers, "medicine"),
    (agri_questions, agri_answers, "agriculture"),
]
for job in index_jobs:
    create_index(*job)

In [None]:
# ZIP FILES FOR DOWNLOAD
# Bundle the saved model directory, both FAISS indexes and both answer pickles
# into one archive in the working directory.
# NOTE(review): the last cell of this notebook rewrites medicine_answers.pkl
# and agriculture_answers.pkl; when cells run top to bottom this archive is
# created before that rewrite and so holds the earlier pickles.
!zip -r my_project_data.zip fine_tuned_burmese_model medicine_faiss.index medicine_answers.pkl agriculture_faiss.index agriculture_answers.pkl
# NOTE(review): printed unconditionally - the outcome of the zip command is not checked.
print("All files zipped! Download 'my_project_data.zip' from the files tab.")

In [None]:
# Mount Google Drive and copy the project archive into it
import os
import shutil

from google.colab import drive

drive.mount('/content/drive')

# Target folder inside Drive; created (and reported) only when it is absent
destination_folder = "/content/drive/My Drive/Burmese_AI_Model"
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Created folder: {destination_folder}")

# The archive keeps its file name inside the Drive folder
source_file = "my_project_data.zip"
destination_path = "/".join((destination_folder, source_file))

# Report the missing-archive case first, otherwise perform the copy
if not os.path.exists(source_file):
    print("Error: Could not find 'my_project_data.zip'.")
else:
    shutil.copy(source_file, destination_path)
    print(f"Success! File saved to Google Drive at: {destination_path}")

In [None]:
!pip install datasets pandas

import pandas as pd
from datasets import load_dataset
import pickle

print("Repairing Medicine Data...")
# Repair Medicine: re-download the CSV and rewrite the answers pickle
url = "https://raw.githubusercontent.com/MinSiThu/Burmese-Microbiology-1K/main/data/Microbiology.csv"
medicine_df = pd.read_csv(url)
# Stringify every 'Output' cell so the pickle holds a simple list of str
med_answers = list(map(str, medicine_df['Output'].tolist()))

with open("medicine_answers.pkl", "wb") as medicine_file:
    pickle.dump(med_answers, medicine_file)

print("Repairing Agriculture Data...")
# Repair Agriculture: reload the instruction-tuning dataset
dataset = load_dataset("chuuhtetnaing/myanmar-instruction-tuning-dataset")

def is_agriculture(example):
    """Tell whether an example's 'inputs' text contains an agriculture keyword.

    Used as the predicate passed to ``Dataset.filter``.

    Args:
        example: mapping with an 'inputs' entry (a missing key still raises
            ``KeyError``, so a wrong dataset schema is not silently hidden).

    Returns:
        bool: True if any keyword occurs as a substring of ``example['inputs']``;
        False otherwise, including when the value is None or not a string.
    """
    # Matching is plain substring search, so any text containing one of these
    # sequences is kept, whatever the surrounding words are.
    keywords = ("လယ်သမား", "စိုက်ပျိုးရေး", "လယ်ယာ", "စပါး", "ပင်ပေါက်", "သစ်တော", "ရေမြေ", "သီးနှံ", "သတ်မှတ်ချက်", "သစ်ပင်")
    text = example['inputs']
    if not isinstance(text, str):
        # `keyword in None` would raise TypeError and abort the whole filter;
        # a row without text simply cannot match.
        return False
    return any(keyword in text for keyword in keywords)

# Re-apply the keyword filter to the training split
agriculture_dataset = dataset['train'].filter(is_agriculture)

# Stringify every target so the pickle holds a simple list of str
agri_answers = list(map(str, agriculture_dataset['targets']))

with open("agriculture_answers.pkl", "wb") as agriculture_file:
    pickle.dump(agri_answers, agriculture_file)

print("SUCCESS! Clean files created.")