In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Read the TibetanQA spreadsheet from the notebook's working directory
df = pd.read_excel(io='TibetanQA.xlsx')

# Show the column labels and the (rows, columns) shape of the loaded frame
column_labels, frame_shape = df.columns, df.shape
print(column_labels, frame_shape)

  from .autonotebook import tqdm as notebook_tqdm


Index(['问题', '答案', '文本', 'Unnamed: 3', '标题'], dtype='object') (2007, 5)


In [2]:
# Rename the original spreadsheet headers to English names with an explicit
# mapping. A positional `df.columns = [...]` assignment would silently
# mislabel data if the sheet's column order ever changed; the mapping cannot.
# Columns not listed here (e.g. 'Unnamed: 3') keep their names.
COLUMN_NAMES = {
    '问题': 'question',
    '答案': 'answer',
    '文本': 'text',
    '标题': 'title',
}
df = df.rename(columns=COLUMN_NAMES)

# Fail early, with a readable message, if an expected column is absent.
# Re-running this cell is safe: already-renamed columns pass the check.
missing_columns = set(COLUMN_NAMES.values()) - set(df.columns)
if missing_columns:
    raise KeyError(f"TibetanQA.xlsx is missing expected columns: {sorted(missing_columns)}")

In [3]:
# Clean and prepare data
# Uses the 'question' and 'text' columns: rows missing either one are dropped,
# then both are joined with a single space into the 'qa_pair' column.
df = (
    df
    .dropna(subset=['question', 'text'])
    .assign(qa_pair=lambda frame: frame['question'] + ' ' + frame['text'])
)

In [4]:
# Sentence encoder used below to embed the questions
model = SentenceTransformer(model_name_or_path='sentence-transformers/LaBSE')

# Smoke test: encode one Tibetan sentence and report the length of its vector
test_tibetan = "འདི་ནི་བོད་ཡིག་གི་དཔེ་ཞིག་ཡིན།"
test_embedding = model.encode(test_tibetan)
embedding_dim = len(test_embedding)
print(f"Embedding dimension: {embedding_dim}")

Embedding dimension: 768


In [5]:
import chromadb
from chromadb.config import Settings
from tqdm import tqdm

# Initialize ChromaDB (on-disk store in ./tibetan_qa_db)
client = chromadb.PersistentClient(path="./tibetan_qa_db")
# get_or_create_collection instead of create_collection: the store persists
# between kernel sessions, so create_collection raises on every re-run once
# the collection exists.
collection = client.get_or_create_collection("tibetan_qa")

# Columns to index.
# NOTE(review): the value stored as "answer" comes from the 'text' column, not
# from the 'answer' column — confirm this is intended.
questions = df['question'].tolist()
answers = df['text'].tolist()
qa_pairs = df['qa_pair'].tolist()

# Generate embeddings (batch processing)
batch_size = 100
for i in tqdm(range(0, len(questions), batch_size)):
    batch_questions = questions[i:i+batch_size]
    batch_answers = answers[i:i+batch_size]
    batch_qa = qa_pairs[i:i+batch_size]

    # Only the question text is embedded; the combined question+text string is
    # stored as the document and the separate fields as metadata.
    embeddings = model.encode(batch_questions)

    # upsert (not add) keeps this cell idempotent: ids are derived from the row
    # position, so a re-run overwrites the same entries instead of colliding.
    collection.upsert(
        embeddings=embeddings.tolist(),
        documents=batch_qa,
        metadatas=[{"question": q, "answer": a} for q, a in zip(batch_questions, batch_answers)],
        ids=[f"qa_{j}" for j in range(i, i+len(batch_questions))]
    )

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
  0%|          | 0/21 [00:00<?, ?it/s]Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
100%|██████████| 21/21 [02:23<00:00,  6.84s/it]


In [33]:
import requests
import json

class ApertusSwissLLM:
    """Small HTTP client that posts chat messages to ``<base_url>/chat/completions``.

    Failures reported by the server are returned as strings starting with
    ``"Error:"`` rather than raised; transport-level exceptions from
    ``requests`` (connection errors, timeouts) still propagate.
    """

    def __init__(self, api_key=None, base_url="https://chat.publicai.co", timeout=60):
        """Store connection settings.

        Args:
            api_key: Bearer token; when falsy, no Authorization header is sent.
            base_url: Service root; a trailing slash is tolerated.
            timeout: Seconds passed to ``requests.post`` so a stalled server
                cannot block the caller indefinitely.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

    def generate_response(self, prompt, context="", max_tokens=500):
        """Send ``prompt`` (with ``context`` as the system message) and return the reply text.

        Args:
            prompt: User message content.
            context: Text placed after ``"Context: "`` in the system message.
            max_tokens: Forwarded as the request's ``max_tokens`` field.

        Returns:
            The content of the first choice's message, or an ``"Error: ..."``
            string when the status is not 200 or the body has an unexpected shape.
        """
        headers = {"Content-Type": "application/json"}
        # Do not send the literal text "Bearer None" when no key was configured.
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        payload = {
            "model": "swiss-llm",  # Adjust model name as needed
            "messages": [
                {"role": "system", "content": f"Context: {context}"},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": max_tokens,
            "temperature": 0.7
        }

        # Strip a trailing slash so the URL never contains "//chat/completions".
        url = f"{self.base_url.rstrip('/')}/chat/completions"
        response = requests.post(url, headers=headers, json=payload,
                                 timeout=self.timeout)

        if response.status_code != 200:
            return f"Error: {response.status_code} - {response.text}"

        # A 200 status does not guarantee the expected JSON structure (e.g. an
        # HTML page); report that in the same style as HTTP errors.
        try:
            return response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError):
            return f"Error: unexpected response format - {response.text}"

# Initialize LLM
# The API key is read from the PUBLICAI_API_KEY environment variable so that no
# secret is stored in the notebook.
# NOTE(review): a literal key was hard-coded here before; treat it as leaked
# and rotate it.
import os

llm = ApertusSwissLLM(api_key=os.environ.get("PUBLICAI_API_KEY"))

In [34]:
class TibetanRAGSystem:
    """Retrieval-augmented answering: fetch similar stored Q&A entries, then query the LLM."""

    def __init__(self, collection, embedding_model, llm):
        """Keep references to the three collaborators.

        Args:
            collection: Vector store exposing ``query(query_embeddings=..., n_results=...)``.
            embedding_model: Encoder exposing ``encode(list_of_texts)`` whose
                result has a ``tolist()`` method.
            llm: Client exposing ``generate_response(prompt, context=...)``.
        """
        self.collection = collection
        self.embedding_model = embedding_model
        self.llm = llm

    def retrieve_relevant_qa(self, query, n_results=5):
        """Embed ``query`` and return the collection's raw query result for the top ``n_results``."""
        query_embedding = self.embedding_model.encode([query])
        return self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=n_results
        )

    def generate_answer(self, query, n_results=5):
        """Answer ``query`` using retrieved Q&A entries as context.

        Args:
            query: The user's question.
            n_results: How many entries to retrieve (default matches the
                previous fixed value of 5).

        Returns:
            ``(response, relevant_qa)``: the LLM's reply and the raw retrieval
            result, whose ``'metadatas'[0]`` entries carry ``'question'`` and
            ``'answer'`` keys.
        """
        # Retrieve relevant Q&A pairs
        relevant_qa = self.retrieve_relevant_qa(query, n_results=n_results)

        # One "Q: ... / A: ..." paragraph per retrieved entry
        context = "".join(
            f"Q: {metadata['question']}\nA: {metadata['answer']}\n\n"
            for metadata in relevant_qa['metadatas'][0]
        )

        # The prompt is assembled from flush-left lines: an indented
        # triple-quoted string would leak source indentation into the prompt
        # and leave only the first context line indented.
        system_prompt = (
            "You are a helpful assistant that answers questions in Tibetan based on the provided context.\n"
            "Use the following question-answer pairs as reference to answer the user's question.\n"
            "If you cannot find relevant information in the context, politely say so in Tibetan.\n"
            "\n"
            "Context:\n"
            f"{context}"
        )

        # Generate response
        response = self.llm.generate_response(query, context=system_prompt)
        return response, relevant_qa

# Wire the vector collection, the sentence encoder and the LLM client together
rag_system = TibetanRAGSystem(
    collection=collection,
    embedding_model=model,
    llm=llm,
)

In [35]:
%%writefile tibetan_rag_streamlit.py
import streamlit as st

def main(rag_system=None):
    """Render the Tibetan RAG chat page.

    Args:
        rag_system: Object exposing ``generate_answer(prompt)`` that returns a
            ``(response, sources)`` pair, where ``sources['metadatas'][0]`` is
            a list of dicts with ``'question'`` and ``'answer'`` keys. When
            omitted, a module-level ``rag_system`` is used if one exists.
    """
    st.title("བོད་ཡིག་ RAG བཤད་པ་པོ། (Tibetan RAG Chatbot)")
    st.write("Ask questions in Tibetan and get answers from the TibetanQA dataset!")

    # Nothing in this file defines `rag_system`, so referring to it directly
    # raises NameError on the first question. Resolve it explicitly and stop
    # with a readable message when it is unavailable.
    if rag_system is None:
        rag_system = globals().get("rag_system")
    if rag_system is None:
        st.error(
            "RAG system is not initialised. Build the ChromaDB collection, embedding "
            "model and LLM client in this script and pass the resulting object to main()."
        )
        st.stop()
        return

    # Initialize session state
    if 'messages' not in st.session_state:
        st.session_state.messages = []

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input
    if prompt := st.chat_input("འདིར་དྲི་བ་བྲིས་རོགས། (Please write your question here)"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                response, sources = rag_system.generate_answer(prompt)
                st.markdown(response)

                # Show the retrieved question/answer entries behind the reply
                with st.expander("Sources"):
                    for metadata in sources['metadatas'][0]:
                        st.write(f"**Q:** {metadata['question']}")
                        st.write(f"**A:** {metadata['answer']}")
                        st.write("---")

        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})

# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()

Writing tibetan_rag_streamlit.py


In [24]:


# Cell 2: Run it
import os
import subprocess
import sys

# Launch the script produced by the %%writefile cell; the previous target,
# tibetan_rag_app.py, is not written anywhere in this notebook.
APP_FILE = "tibetan_rag_streamlit.py"
APP_PORT = 8502

if not os.path.isfile(APP_FILE):
    raise FileNotFoundError(f"{APP_FILE} not found - run the %%writefile cell first")

# Popen with an argument list replaces os.system("... &"): backgrounding with
# "&" depends on the shell, and Popen returns a handle that can later be
# stopped with streamlit_proc.terminate(). Running "-m streamlit" through
# sys.executable uses the same interpreter as this kernel.
streamlit_proc = subprocess.Popen(
    [sys.executable, "-m", "streamlit", "run", APP_FILE, "--server.port", str(APP_PORT)]
)
print(f"App running at: http://localhost:{APP_PORT}")

App running at: http://localhost:8502


In [18]:
import subprocess
# Run `netstat -an` and show only the output lines that mention both
# "LISTEN" and ":850"
result = subprocess.run(["netstat", "-an"], capture_output=True, text=True)
print("Listening ports:")
matching_entries = [
    entry
    for entry in result.stdout.split('\n')
    if 'LISTEN' in entry and ':850' in entry
]
for entry in matching_entries:
    print(entry)

Listening ports:


In [10]:
import subprocess
# Start `streamlit run app.py` as a child process without waiting for it; the
# Popen handle is the last expression, so the cell displays it.
# NOTE(review): app.py is not created by any visible cell — confirm the intended file.
streamlit_cmd = ["streamlit", "run", "app.py"]
subprocess.Popen(streamlit_cmd)

<Popen: returncode: None args: ['streamlit', 'run', 'app.py']>

In [12]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Downloading comm-0.2.3-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading comm-0.2.3-py3-none-any.whl (7.3 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.2 MB ? eta -:--:--
   --------- ------------------------------ 0.5/2.2 MB 1.7 MB/s eta 0:00:01
   ------------------- -------------------- 1.0/2.2 MB 1.9 MB/s eta 0:00:01
   -------------------------


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Python\Python38\python.exe -m pip install --upgrade pip


In [13]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

class TibetanRAGNotebook:
    """ipywidgets chat front-end that forwards questions to ``rag_system.generate_answer``."""

    def __init__(self, rag_system):
        # Completed (question, reply) exchanges, oldest first
        self.messages = []
        self.rag_system = rag_system

    def create_interface(self):
        """Build the widgets, hook up the Send button and display everything."""
        # Title and subtitle
        for markup in (
            "<h1>བོད་ཡིག་ RAG བཤད་པ་པོ། (Tibetan RAG Chatbot)</h1>",
            "<p>Ask questions in Tibetan and get answers from the TibetanQA dataset!</p>",
        ):
            display(HTML(markup))

        # Input row and the area the conversation is printed into
        self.text_input = widgets.Text(
            placeholder="འདིར་དྲི་བ་བྲིས་རོགས། (Please write your question here)",
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='70%'),
        )
        self.send_button = widgets.Button(description="Send", button_style='primary')
        self.output = widgets.Output()

        self.send_button.on_click(self._handle_send)

        display(widgets.HBox([self.text_input, self.send_button]))
        display(self.output)

    def _handle_send(self, _button):
        """Click handler: process a non-blank question, then clear the input box."""
        question = self.text_input.value
        if not question.strip():
            return
        self.process_query(question)
        self.text_input.value = ""

    def _print_history(self):
        """Print every stored exchange followed by a 50-dash separator."""
        for exchange in self.messages:
            print(f"🧑 You: {exchange['user']}")
            print(f"🤖 Assistant: {exchange['assistant']}")
            print("-" * 50)

    def process_query(self, query):
        """Run ``query`` through the RAG system and print the updated conversation.

        On success the exchange is appended to ``self.messages``; on any
        exception the error text is printed as the assistant's reply and the
        exchange is not stored.
        """
        with self.output:
            # Immediate feedback while the answer is generated
            print(f"🧑 You: {query}")
            print("🤖 Assistant: Thinking...")

            try:
                response, sources = self.rag_system.generate_answer(query)

                # Replace the "Thinking..." placeholder with the full conversation
                clear_output(wait=True)
                self._print_history()
                print(f"🧑 You: {query}")
                print(f"🤖 Assistant: {response}")

                # List at most the first three retrieved entries
                print("\n📚 Sources:")
                shown_count = len(sources['documents'][0][:3])
                for position in range(shown_count):
                    entry = sources['metadatas'][0][position]
                    print(f"Q: {entry['question']}")
                    print(f"A: {entry['answer']}")
                    print("-" * 30)

                self.messages.append({"user": query, "assistant": response})

            except Exception as err:
                clear_output(wait=True)
                self._print_history()
                print(f"🧑 You: {query}")
                print(f"🤖 Assistant: Error - {err}")

# Create the widget front-end for the existing RAG pipeline and render it here
notebook_ui = TibetanRAGNotebook(rag_system=rag_system)
notebook_ui.create_interface()

ModuleNotFoundError: No module named 'ipywidgets'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class ApertusSwissLLM:
    """Text generator that runs a causal language model loaded with ``transformers``.

    NOTE(review): this reuses the name of the HTTP-based ``ApertusSwissLLM``
    defined earlier in the notebook; whichever cell ran last wins.
    """

    def __init__(self, model_path="./apertus-swiss-model"):
        """Load the tokenizer and model from ``model_path``.

        Args:
            model_path: Value passed to both ``from_pretrained`` calls.
        """
        # Both objects must live on the instance: generate_response reads
        # self.tokenizer and self.model. Binding them to locals left every
        # instance unusable.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
        )

    def generate_response(self, prompt, context="", max_tokens=500):
        """Generate a reply to ``prompt`` given ``context``.

        Args:
            prompt: User text inserted after ``"User: "``.
            context: Text inserted after ``"Context: "``.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The decoded continuation only (prompt tokens removed), stripped of
            surrounding whitespace.
        """
        full_prompt = f"Context: {context}\n\nUser: {prompt}\nAssistant:"

        inputs = self.tokenizer.encode(full_prompt, return_tensors="pt")

        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # The output sequence starts with the prompt tokens; keep only what follows
        response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
        return response.strip()

# Initialize LLM from the model identifier below
# NOTE(review): the recorded output of this cell is an "untagged enum
# ModelWrapper" exception; possibly an outdated tokenizers/transformers
# install — confirm before relying on this cell.
model_name = "swiss-ai/Apertus-8B-Instruct-2509"
llm = ApertusSwissLLM(model_path=model_name)

Exception: data did not match any variant of untagged enum ModelWrapper at line 1217953 column 3

In [32]:
from chromadb import PersistentClient
from pathlib import Path

# Anchor the database folder to this file's directory when __file__ is defined,
# otherwise to the current working directory.
if '__file__' in globals():
    BASE_DIR = Path(__file__).parent
else:
    BASE_DIR = Path.cwd()
DB_DIR = BASE_DIR.joinpath('tibetan_qa_db').resolve()

print(DB_DIR)
# Open the on-disk store at the resolved absolute path
client = PersistentClient(path=str(DB_DIR))

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


C:\Users\MM\OneDrive\Desktop\HackthonBern\Bhopa_alpha\tibetan_qa_db


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "swiss-ai/Apertus-8B-Instruct-2509"
# Use the GPU when CUDA is available and fall back to the CPU otherwise; a
# fixed "cuda" makes .to(device) fail on machines without a usable GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the tokenizer and the model
# NOTE(review): this rebinds `model`, the name earlier cells use for the
# SentenceTransformer encoder; re-running those cells afterwards would pick up
# this object instead — consider a distinct name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
).to(device)

Exception: data did not match any variant of untagged enum ModelWrapper at line 1217953 column 3