<a href="https://colab.research.google.com/github/Ruqyai/ruqyai.github.io/blob/main/_notebooks/RAG_for_Arabic_Wikipedia_Using_Gemma2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a RAG Application Using Gemma 2

In [None]:
# Install necessary packages

!pip install langchain langchainhub langchain_community langchain-huggingface faiss-gpu transformers accelerate datasets bitsandbytes langchain-text-splitters sentence-transformers huggingface_hub chromadb gradio > /dev/null 2>&1


In [None]:
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice

import gradio as gr
import torch
from huggingface_hub import login
from langchain import hub
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# from dotenv import load_dotenv
# Load environment variables if needed
# load_dotenv()


In [None]:
from google.colab import userdata

# Login to Hugging Face Hub
# The token is read from the Colab secret named 'gemma2', so it is never written into
# the notebook itself.
# NOTE(review): presumably required because the Gemma weights are gated — confirm.
hf_token = userdata.get('gemma2')
login(token=hf_token)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load dataset
dataset_name = "wikimedia/wikipedia"
page_content_column = "text"
name = "20231101.ar"
# Number of articles used for this demonstration.
NUM_DEMO_DOCUMENTS = 20

loader = HuggingFaceDatasetLoader(dataset_name, page_content_column, name)

# Stream documents from the loader and stop after the first NUM_DEMO_DOCUMENTS.
# `loader.load()` would first build a Document for every row of the split
# (over a million articles) just to throw almost all of them away.
documents = list(islice(loader.lazy_load(), NUM_DEMO_DOCUMENTS))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/131k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/408M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/172M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/131M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1219201 [00:00<?, ? examples/s]

In [None]:
# Preview a few articles (metadata plus the first 200 characters of text) rather than
# dumping every full article into the cell output.
[(doc.metadata, doc.page_content[:200]) for doc in documents[:3]]

[Document(metadata={'id': '7', 'url': 'https://ar.wikipedia.org/wiki/%D9%85%D8%A7%D8%A1', 'title': 'ماء'}, page_content='"\\u0627\\u0644\\u0645\\u0627\\u0621 \\u0645\\u0627\\u062f\\u0629\\u064c \\u0634\\u0641\\u0627\\u0641\\u0629\\u064c \\u0639\\u062f\\u064a\\u0645\\u0629 \\u0627\\u0644\\u0644\\u0648\\u0646 \\u0648\\u0627\\u0644\\u0631\\u0627\\u0626\\u062d\\u0629\\u060c \\u0648\\u0647\\u0648 \\u0627\\u0644\\u0645\\u0643\\u0648\\u0651\\u0646 \\u0627\\u0644\\u0623\\u0633\\u0627\\u0633\\u064a \\u0644\\u0644\\u062c\\u062f\\u0627\\u0648\\u0644 \\u0648\\u0627\\u0644\\u0628\\u062d\\u064a\\u0631\\u0627\\u062a \\u0648\\u0627\\u0644\\u0628\\u062d\\u0627\\u0631 \\u0648\\u0627\\u0644\\u0645\\u062d\\u064a\\u0637\\u0627\\u062a \\u0648\\u0643\\u0630\\u0644\\u0643 \\u0644\\u0644\\u0633\\u0648\\u0627\\u0626\\u0644 \\u0641\\u064a \\u062c\\u0645\\u064a\\u0639 \\u0627\\u0644\\u0643\\u0627\\u0626\\u0646\\u0627\\u062a \\u0627\\u0644\\u062d\\u064a\\u0651\\u0629\\u060c \\u0648\\u0647\\u0648 \\u0623\\u0643\\u0

In [None]:
# Splitter used to cut articles into small chunks: at most 400 units per chunk
# (measured with `len`), no overlap between consecutive chunks, literal separators.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

def process_document(document):
    """Split one loaded article into chunk-sized ``Document`` objects.

    The loaded ``page_content`` arrives as a JSON string literal: wrapped in double
    quotes, with non-ASCII characters written as ``\\uXXXX`` escapes. It is decoded
    once, *before* splitting, so that:

    * the splitter's chunk size counts real characters, not six-character escapes;
    * a chunk boundary can never fall in the middle of an escape sequence;
    * the surrounding JSON quotes do not leak into the first and last chunk.

    Args:
        document: A ``Document`` whose ``page_content`` is the (possibly
            JSON-encoded) article text and whose ``metadata`` describes the article.

    Returns:
        A list of ``Document`` chunks in reading order. Each chunk carries its own
        copy of the source metadata.
    """
    raw_content = document.page_content
    try:
        decoded_content = json.loads(raw_content)
    except (TypeError, ValueError):
        # Not a JSON literal (e.g. already plain text): use it unchanged.
        decoded_content = raw_content
    if not isinstance(decoded_content, str):
        # Text such as "123" parses as JSON but not to a string; keep the raw text.
        decoded_content = raw_content

    return [
        # Copy the metadata so editing one chunk's metadata cannot affect its siblings.
        Document(page_content=chunk, metadata=dict(document.metadata))
        for chunk in text_splitter.split_text(decoded_content)
    ]

# Chunk every article in a thread pool. `executor.map` yields results in the same
# order as `documents`, so the chunk order (and therefore `split_documents[0]` and the
# index built from it) is the same on every run; collecting futures with
# `as_completed` would order them by whichever thread happened to finish first.
with ThreadPoolExecutor() as executor:
    split_documents = [
        chunk
        for document_chunks in executor.map(process_document, documents)
        for chunk in document_chunks
    ]

split_documents[:2]

[Document(metadata={'id': '1185', 'url': 'https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D8%B3%D9%86%D8%A7%D9%81%D8%B1', 'title': 'السنافر'}, page_content='"السنافر (جمع سنفور)  ،  هي شخصيات خيالية صغيرة الحجم، زرقاء اللون، وتعيش في'),
 Document(metadata={'id': '1185', 'url': 'https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D8%B3%D9%86%D8%A7%D9%81%D8%B1', 'title': 'السنافر'}, page_content='الغابة، ابتكرها الرسام البلجيكي بيير كوليفورد (Pierre Culliford) المعروف باسم بييو . أُلفت')]

In [None]:
# Initialize embeddings model
# NOTE(review): this checkpoint appears to be trained mainly on English text — confirm it
# is adequate for Arabic retrieval, or evaluate a multilingual sentence-transformers model.
model_path = "sentence-transformers/all-MiniLM-L12-v2"
# Use the GPU when one is present, otherwise fall back to the CPU so the cell still
# runs on a runtime without CUDA instead of raising.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_kwargs = {'device': embedding_device}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(model_name=model_path, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)



  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Smoke-test the embedding model: embed the first chunk and show the first three
# components of the resulting vector.
first_chunk = split_documents[0]
text = first_chunk.page_content
query_result = embeddings.embed_query(text)
query_result[0:3]

[0.053765103220939636, 0.03836403414607048, -0.08215793967247009]

In [None]:

# Build the FAISS index from the chunks and their embeddings.
vector_db = FAISS.from_documents(split_documents, embeddings)

# Persist the index relative to the current working directory so the notebook does
# not depend on a Kaggle-specific absolute path when it runs on Colab or locally.
FAISS_INDEX_DIR = "faiss_index"
vector_db.save_local(FAISS_INDEX_DIR)

In [None]:
# Query the index directly (Arabic: "What is water") and inspect the top hit's text.
question = "ما هو الماء "
search_docs = vector_db.similarity_search(question)
best_match = search_docs[0]
best_match.page_content

'الماء هو السائل المستخدم لنقل الحرارة بدخوله في دورة مغلقة بين المرجل'

In [None]:
# Initialize LLM model for text generation
base_model = "google/gemma-2-9b-it"
# Upper bound on generated tokens per answer.
# NOTE(review): 20 tokens cuts off anything longer than a short sentence — raise it if
# answers come back truncated.
MAX_NEW_TOKENS = 20

tokenizer = AutoTokenizer.from_pretrained(base_model)
# `trust_remote_code` is intentionally not enabled: it lets code shipped in the model
# repository run locally, and the Gemma 2 architecture is implemented in transformers.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=MAX_NEW_TOKENS)
llm = HuggingFacePipeline(pipeline=pipe)


tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
# Configure retrieval
# Wrap the FAISS index as a retriever using its default settings (no arguments are
# passed), so it can be plugged into the QA chain.
retriever = vector_db.as_retriever()


In [None]:
# QA chain
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved ``Document`` objects.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


rag_prompt = hub.pull("rlm/rag-prompt")
# The retriever returns a list of Document objects. Piping it through `format_docs`
# puts only the chunk text into the prompt's context slot, instead of the list's
# Python repr (which would also carry metadata and escaped text).
qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)



In [None]:
# Pull the answer text out of the raw chain output
def extract_answer(qa_chain_output):
    """Return the text following ``Answer:`` on the first line that starts with it.

    Args:
        qa_chain_output: Full text produced by the QA chain.

    Returns:
        The remainder of the first line beginning with ``'Answer:'``, with
        surrounding whitespace stripped, or ``None`` when no line starts with
        that marker.
    """
    marker = 'Answer:'
    answer_rows = (row for row in qa_chain_output.split('\n') if row.startswith(marker))
    first_row = next(answer_rows, None)
    if first_row is None:
        return None
    # The row starts with the marker, so everything after it is the answer.
    return first_row[len(marker):].strip()

In [None]:
# Execute the chain or further questions as needed
# Ask a sample question (Arabic: "Where is Carthage located?"), then reduce the raw
# chain output to the text on its 'Answer:' line with extract_answer.
question = "أين تقع قرطاج؟"
result = qa_chain.invoke(question)
extract_answer(result)

'تقع قرطاج في شمال أفريقيا.'

In [None]:
# Callback used by the Gradio interface
def chatbot_response(question):
    """Answer a user question with the QA chain and return only the answer text.

    Args:
        question: The question typed into the interface.

    Returns:
        The text that follows the first ``Answer:`` marker in the chain output,
        stripped of surrounding whitespace. If the output contains no such marker,
        the whole output is returned (stripped) instead of raising ``IndexError``.
    """
    # Invoke QA chain with the question
    result = qa_chain.invoke(question)
    # Split on the marker without a trailing space so "Answer:\n..." is handled too.
    segments = result.split("Answer:")
    if len(segments) < 2:
        return result.strip()
    return segments[1].strip()

# Launch Gradio interface
# Simple text-in / text-out UI around `chatbot_response`; `live=False` means the
# function is not re-run on every input change.
chatbot = gr.Interface(
    fn=chatbot_response,
    inputs="text",
    outputs="text",
    live=False,
    title="Gemma 2 Chatbot",
    description="Ask me anything"
)

# debug=True keeps this cell running so errors and logs stay visible in the notebook;
# interrupt the cell to shut the server down.
chatbot.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://7b9f1dba1d3053f75a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7b9f1dba1d3053f75a.gradio.live


