# Preparations:
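Initialize the environment and install the dependencies (the activation command below is for Windows; on Linux/macOS use `source .venv/bin/activate` instead).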
```bash
    python -m venv .venv
    .venv\Scripts\activate
    pip install chromadb sentence-transformers transformers gradio
    pip install ipykernel
    python -m ipykernel install --user --name=.venv --display-name "Python (.venv)"
```

If you run into problems, try:
```bash
    pip install --upgrade jupyter ipywidgets
```


# Code:

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
import gradio as gr

In [None]:
# 1. Initialise ChromaDB: open the persistent client rooted at ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Fetch the FAQ collection, creating it if it does not exist yet.
# The index is configured for cosine distance; Chroma's other documented
# spaces are "l2" and "ip" (inner product).
collection = chroma_client.get_or_create_collection(
    "faq_collection",
    metadata={"hnsw:space": "cosine"},
)

In [None]:
# 2. Embedding model (all-MiniLM-L6-v2), wrapped as a Chroma embedding function
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# 3. LLM used to generate answers (Flan-T5 Base).
# Both objects come from the same checkpoint: tokenizer first, then the model.
tokenizer_flant5, model_flant5 = (
    loader.from_pretrained("google/flan-t5-base")
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

In [None]:
# 4. LLM used to paraphrase answers (T5 Small).
# Both objects come from the same checkpoint: tokenizer first, then the model.
tokenizer_t5, model_t5 = (
    loader.from_pretrained("google-t5/t5-small")
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

In [None]:
# 5. Small dataset (FAQ chatbot).
# Each entry is written as a (question, answer) pair and expanded into the
# {"question": ..., "answer": ...} records used by the rest of the notebook.
dataset = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("What is AI?", "AI (Artificial Intelligence) refers to computer systems that can perform tasks requiring human-like intelligence such as learning and problem-solving."),
        ("What is Machine Learning?", "Machine Learning (ML) is a subset of AI where systems learn patterns from data without explicit programming."),
        ("How does ML relate to AI?", "ML is the primary method used to achieve AI capabilities through data-driven learning."),
        ("What are the types of Machine Learning?", "Three main types: Supervised Learning, Unsupervised Learning, and Reinforcement Learning."),
        ("What is Supervised Learning?", "ML approach using labeled data to train models, like predicting house prices from historical data."),
        ("What is Unsupervised Learning?", "ML technique finding patterns in unlabeled data, commonly used for customer segmentation."),
        ("What is Deep Learning?", "A ML specialization using artificial neural networks to process complex data like images and speech."),
        ("What are neural networks?", "Computing systems inspired by biological brains, using interconnected layers to process information."),
        ("What is Natural Language Processing (NLP)?", "AI technology enabling computers to understand, interpret, and generate human language."),
        ("What is Computer Vision?", "AI field focused on enabling computers to interpret visual information from the world."),

        ("What are common AI applications?", "Virtual assistants, recommendation systems, fraud detection, and facial recognition."),
        ("How does AI impact daily life?", "Through smartphone features, online recommendations, spam filters, and smart home devices."),
        ("What data does AI need?", "AI systems require relevant, high-quality data - both structured (tables) and unstructured (images/text)."),
        ("Why is data important for AI?", "Data trains AI models - more diverse data typically leads to better performance."),
        ("What is an AI algorithm?", "Step-by-step instructions that help AI systems process data and make decisions."),
        ("What programming languages are used in AI?", "Python (most popular), R, Java, and Julia, often using libraries like TensorFlow and PyTorch."),
        ("What is Reinforcement Learning?", "ML method where systems learn through trial-and-error using reward feedback."),
        ("What is the Turing Test?", "A measure of machine intelligence where a human can't distinguish between AI and human responses."),
        ("What are AI ethics?", "Principles ensuring AI development respects privacy, fairness, transparency, and human values."),
        ("What is AI bias?", "Unfair outcomes caused by biased training data or flawed algorithms in AI systems."),

        ("What is Generative AI?", "AI that creates new content like text, images, or music (e.g., ChatGPT, DALL-E)."),
        ("How do chatbots work?", "Using NLP to understand queries and ML models to generate appropriate responses."),
        ("What is predictive analytics?", "AI technique analyzing historical data to forecast future outcomes."),
        ("What is pattern recognition?", "Core AI capability identifying regularities in data for classification/prediction."),
        ("What is automation in AI?", "Using AI to perform repetitive tasks without human intervention."),
        ("What is AI training?", "Process of feeding data to ML models to help them learn patterns."),
        ("What is AI inference?", "Using trained models to make predictions on new data."),
        ("What is overfitting?", "When AI models perform well on training data but poorly on new data."),
        ("What is underfitting?", "When AI models fail to learn patterns from training data properly."),
        ("What is transfer learning?", "Reusing pre-trained AI models for new tasks to save time/resources."),

        ("What are AI chips?", "Special processors (like GPUs) optimized for AI computations."),
        ("What is edge AI?", "Running AI algorithms directly on devices instead of cloud servers."),
        ("What is weak AI vs strong AI?", "Weak AI (narrow AI) handles specific tasks, while Strong AI (AGI) would match human intelligence."),
        ("What is explainable AI?", "AI systems that can explain their decisions in understandable terms."),
        ("How is AI used in healthcare?", "For medical imaging analysis, drug discovery, and personalized treatment plans."),
        ("How is AI used in finance?", "For fraud detection, algorithmic trading, and credit scoring."),
        ("What are AI risks?", "Job displacement, privacy concerns, security vulnerabilities, and ethical challenges."),
        ("What is robotic process automation?", "Using AI bots to automate repetitive digital tasks like data entry."),
        ("What is computer vision used for?", "Facial recognition, medical imaging, autonomous vehicles, and quality control."),
        ("What is the AI development process?", "1. Problem definition 2. Data collection 3. Model training 4. Testing 5. Deployment"),

        ("What skills are needed for AI?", "Programming, statistics, data analysis, and domain knowledge."),
        ("What is feature engineering?", "Process of selecting/transforming raw data into useful inputs for AI models."),
        ("What is a validation set?", "Data used during training to tune model parameters and prevent overfitting."),
        ("What is a test set?", "Unseen data used to evaluate final model performance before deployment."),
        ("What is precision vs recall?", "Precision measures accuracy of positive predictions, recall measures coverage of actual positives."),
        ("What is big data in AI?", "Large, complex datasets requiring special processing that AI systems can analyze."),
        ("What is the AI lifecycle?", "Continuous process: Data collection → Model training → Deployment → Monitoring → Updating"),
        ("What is MLOps?", "Practices combining ML development with IT operations for reliable AI deployment."),
        ("What is synthetic data?", "Artificially generated data used when real data is scarce or sensitive."),
        ("How to start learning AI?", "Begin with Python programming, basic statistics, and online ML courses, then practice with projects."),
    )
]

In [None]:
# 6. Remove previously stored records from the collection.
# Every id currently in the collection is deleted, not just the ids
# 0..len(dataset)-1, so records left over from an earlier, longer dataset do
# not linger in the persistent store and show up in later queries.
stale_ids = collection.get()["ids"]
if stale_ids:  # nothing to delete on a fresh collection
    collection.delete(ids=stale_ids)

In [None]:
# 7. Store the dataset in ChromaDB.
# The *questions* are embedded while the *answers* are stored as documents, so
# a user query is matched against known questions and yields the paired answer.
# All rows are embedded in one call and written in one call; `upsert` is used
# so that re-running this cell refreshes ids that already exist.
collection.upsert(
    ids=[str(index) for index in range(len(dataset))],
    documents=[entry["answer"] for entry in dataset],
    embeddings=embedding_func([entry["question"] for entry in dataset]),
)

In [None]:
# Inspect what is stored; fetch once and reuse the result for both printouts.
stored = collection.get()
print("Stored Data:", stored)
print('\n')
print("Stored IDs:", stored["ids"])

In [None]:
# 8. Retrieval and answer-generation functions
def retrieve_answer(query, max_distance=0.6):
    """Return the stored document nearest to ``query``, if it is close enough.

    The query is embedded with ``embedding_func`` and looked up in
    ``collection``. Only the nearest hit is considered.

    Args:
        query: The user's question text.
        max_distance: Largest distance at which the nearest hit is still
            accepted. Defaults to 0.6, the cut-off that used to be hard-coded.

    Returns:
        The nearest stored document, or ``None`` when the query produced no
        hits or the nearest hit is farther away than ``max_distance``.
    """
    query_embedding = embedding_func([query])[0]
    results = collection.query(query_embeddings=[query_embedding], n_results=5)

    # Query results are nested one list per query embedding. With no hits the
    # inner list is empty, so unwrap defensively before indexing into it.
    documents = (results.get("documents") or [[]])[0]
    distances = (results.get("distances") or [[]])[0]
    if not documents or not distances:
        return None

    nearest_distance = distances[0]
    print(f"\nDistances: {nearest_distance}")  # Debugging

    if nearest_distance > max_distance:  # Threshold
        return None
    return documents[0]
    
def rephrase(responses):
    """Merge candidate answers into one text and paraphrase it with T5 Small.

    Args:
        responses: Iterable of candidate answer strings.

    Returns:
        The decoded paraphrase produced by ``model_t5``, or an empty string
        when ``responses`` contains no non-blank text. Generation uses
        sampling (``do_sample=True``), so the output can differ between calls.
    """
    # Make every candidate end with exactly one terminal punctuation mark.
    # Blindly joining with ". " produced ".." whenever a candidate already
    # ended with a full stop.
    stripped = (text.strip() for text in responses)
    sentences = [
        text if text.endswith((".", "!", "?")) else text + "."
        for text in stripped
        if text
    ]
    if not sentences:
        # Nothing to paraphrase; skip generation instead of prompting with ".".
        return ""

    response = " ".join(sentences)
    prompt = f"Rewrite the following text in a fluent and natural way: {response}"
    print(f"\nPrompt (rephrase): {prompt}")  # Debugging

    input_ids = tokenizer_t5.encode(prompt, return_tensors="pt")

    # Generate output
    outputs = model_t5.generate(
        input_ids,

        # Output length (in tokens)
        min_length=10,
        max_length=150,

        # Sampling settings (take effect because do_sample=True)
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.9,

        # Penalties against repeated words/phrases
        repetition_penalty=1.0,
        no_repeat_ngram_size=2,
    )

    # Decode the result
    rephrased_responses = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
    return rephrased_responses

def chat(query):
    """Answer a user question from the FAQ collection.

    The nearest stored answer is retrieved with ``retrieve_answer`` and used
    as context for Flan-T5, which generates three beam-search candidates.
    The candidates are then merged and paraphrased by ``rephrase``.

    Args:
        query: The user's question text.

    Returns:
        The paraphrased answer, or a fixed apology string when the query is
        empty/blank or no sufficiently close FAQ entry is found.
    """
    fallback = "Sorry, i can't answer that question."

    # An empty or whitespace-only submission carries no question; answer with
    # the fallback instead of embedding it and matching an arbitrary entry.
    if not query or not query.strip():
        return fallback

    context = retrieve_answer(query)
    if context is None:
        return fallback

    prompt = f"Answer the question: {query} based on the given context: {context}"
    print(f"\nPrompt (chat): {prompt}")  # Debugging

    input_ids = tokenizer_flant5.encode(prompt, return_tensors="pt")

    # Generate output
    outputs = model_flant5.generate(
        input_ids,

        # Output length (in tokens)
        min_length=10,
        max_length=100,

        # Beam search
        num_beams=5,
        early_stopping=True,

        # Penalties against repeated words/phrases
        repetition_penalty=1.5,
        no_repeat_ngram_size=2,

        # Output-length penalty
        length_penalty=1.2,

        # Number of sequences returned
        num_return_sequences=3,
    )

    # Decode every returned sequence
    responses = [
        tokenizer_flant5.decode(output, skip_special_tokens=True)
        for output in outputs
    ]
    print(f"\nResponses (chat): {responses}")  # Debugging

    return rephrase(responses)

# **Penjelasan Parameter**
| **Parameter** | **Fungsi** | **Nilai Default** |
|--------------|-----------|------------------|
| `max_length` | Maksimal panjang output dalam token. | `20` |
| `min_length` | Minimal panjang output dalam token. | `0` |
| `do_sample` | Aktifkan sampling untuk variasi respons. | `False` |
| `temperature` | Mengontrol randomness (semakin tinggi, semakin acak). | `1.0` |
| `top_k` | Memilih hanya **k** token dengan probabilitas tertinggi. | `50` |
| `top_p` | Hanya memilih token hingga probabilitas kumulatifnya mencapai **p** (0.0-1.0). | `1.0` |
| `repetition_penalty` | Penalti untuk pengulangan kata yang sama. | `1.0` |
| `length_penalty` | Mengontrol panjang output (hanya berpengaruh pada beam search, `num_beams > 1`). Nilai lebih kecil = lebih pendek. | `1.0` |
| `num_beams` | Menentukan jumlah kemungkinan output yang dipertimbangkan. | `1` |
| `early_stopping` | Berhenti lebih cepat jika model merasa sudah selesai. | `False` |
| `num_return_sequences` | Jumlah output berbeda yang dihasilkan. | `1` |

---

# **Kapan Gunakan Parameter Tertentu?**
| **Skenario** | **Gunakan** |
|-------------|------------|
| **Mau output lebih panjang?** | `max_length` besar, `length_penalty > 1.0` |
| **Mau output lebih pendek?** | `max_length` kecil, `length_penalty < 1.0` |
| **Mau variasi jawaban?** | `do_sample=True`, `temperature > 0.7`, `num_return_sequences > 1` |
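| **Mau jawaban lebih akurat & deterministik?** | `do_sample=False` (greedy atau beam search dengan `num_beams > 1`); `temperature`, `top_k`, dan `top_p` diabaikan saat sampling nonaktif, dan `temperature=0` bukan nilai yang valid |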
| **Mau menghindari pengulangan kata?** | `repetition_penalty > 1.0` |
| **Mau jawaban lebih kreatif?** | `temperature > 1.0`, `top_k > 50`, `top_p > 0.9` |


In [None]:
# 9. Build the chatbot UI with Gradio: one text input wired to `chat`,
# one text output, then start the app.
demo = gr.Interface(
    title="FAQ Chatbot",
    fn=chat,
    inputs="text",
    outputs="text",
)
demo.launch()