In [1]:
!pip install python-docx pandas openpyxl python-pptx
!pip install markitdown
!pip install chromadb
!pip install optimum
!pip install bitsandbytes



In [2]:
import transformers
from torch import cuda, bfloat16
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [3]:
from huggingface_hub import login
login('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')

In [4]:
from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id='google/gemma-2-2b-it'
time_1 = time()
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=False,
    config=model_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2-time_1, 3)} sec.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prepare model, tokenizer: 62.528 sec.


In [5]:
import torch
time_1 = time()
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
        max_length=1500  # Tingkatkan batas panjang input
        )
time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")

Device set to use cpu


Prepare pipeline: 0.127 sec.


In [6]:
def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    # adapted from https://huggingface.co/blog/llama2#using-transformers
    time_1 = time()
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=200,)
    time_2 = time()
    print(f"Test inference: {round(time_2-time_1, 3)} sec.")
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [7]:
test_model(tokenizer,
           query_pipeline,
           "kerajaan majapahit adalah")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=200) and `max_length`(=1500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


Test inference: 209.031 sec.
Result: kerajaan majapahit adalah kerajaan yang maju dari segi sosial dan politik, tetapi juga memiliki budaya yang kaya. Kerajaan Majapahit dikenal karena keunggulannya dalam bidang pertanian, maritim, dan perdagangan.

Berikut adalah beberapa poin penting tentang Kerajaan Majapahit:

**Sosial:**

* **Struktur sosial yang kompleks:** Majapatih memiliki sistem sosial yang kompleks dengan berbagai lapisan, termasuk bangsawan, pedagang, dan petani.
* **Keberlanjutan:** Sistem sosial ini berlanjut hingga era kolonialisme, mencerminkan kesatuan masyarakat.
* **Perbedaan sosial yang jelas:**  Meskipun memiliki sistem sosial yang kompleks, kerajaan Majapahit juga memiliki perbedaan sosial yang jelas, terutama dalam hal kekuasaan dan status sosial.

**Politik:**

* **Kekuasaan centralized:** Kerajaan Majapahit memiliki struktur politik yang terorganisir dengan sistem pusat yang kuat.
* **Pemimpin yang kuat:**  Para raja Majapahit, khususnya Hayam Wuruk, dikenal de

In [6]:
!pip install langchain-community



In [7]:
from langchain.llms import HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=query_pipeline) # Adjust max_new_tokens as needed

  llm = HuggingFacePipeline(pipeline=query_pipeline) # Adjust max_new_tokens as needed


## Load documents

In [8]:
from markitdown import MarkItDown
from openai import OpenAI
import os

input_dir = '/content/drive/MyDrive/data_sample'
client = OpenAI(api_key="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")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')

# Gunakan list untuk menyimpan file .md yang telah dikonversi
converted_files = []

# Dapatkan daftar file di input directory
files_to_convert = [f for f in os.listdir(input_dir) if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
    print(f"\nConverting {file}...")
    try:
        # Buat path lengkap untuk file input dan output
        input_file_path = os.path.join(input_dir, file)
        md_file = os.path.splitext(file)[0] + '.md'
        md_file_path = os.path.join(input_dir, md_file)

        # Konversi file menggunakan MarkItDown
        result = md.convert(input_file_path)

        # Simpan konten file yang dikonversi ke dalam variabel
        converted_files.append({
            'filename': md_file,
            'content': result.text_content,
            'url': f"https://drive.google.com/file/d/{md_file}/view"  # Contoh: jika file ID tersedia
        })

        # Tulis hasil konversi ke file .md
        with open(md_file_path, 'w') as f:
            f.write(result.text_content)

        print(f"Successfully converted {file} to {md_file}")
    except Exception as e:
        print(f"Error converting {file}: {str(e)}")

print("\nAll conversions completed!")

# Output untuk memastikan semua file tersimpan di variabel
print(f"\nConverted files: {converted_files}")





Converting Paparan Keamanan Informasi (1).pdf...
Successfully converted Paparan Keamanan Informasi (1).pdf to Paparan Keamanan Informasi (1).md

Converting Notulen Enhancing Cybersecurity Awareness in Facing Cyber Threats.docx...
Successfully converted Notulen Enhancing Cybersecurity Awareness in Facing Cyber Threats.docx to Notulen Enhancing Cybersecurity Awareness in Facing Cyber Threats.md

Converting pembinaan jf sandiman dan MI 2022 final.pptx...
Successfully converted pembinaan jf sandiman dan MI 2022 final.pptx to pembinaan jf sandiman dan MI 2022 final.md

All conversions completed!

Converted files: [{'filename': 'Paparan Keamanan Informasi (1).md', 'content': 'Dinas Komunikasi Informatika\nStatistik dan Persandian Kota\nSemarang\n\nSosialisasi Keamanan\nInformasi\n\nSemarang, 21-24 Oktober 2024\n\n\x0cPendahuluan\n\nrisikonya,\n\nteknologi  saat\n\nini  yang\nPerkembangan\nmeliputi  Big  Data  dan  Artifisial  Inteligence\nmemberikan  dampak  luar  biasa  bagi  umat\nmanusia

In [10]:
from langchain.schema.document import Document
documents = [
    Document(
        page_content=file_data['content'],
        metadata={
            'source': file_data['filename'],
            'url': file_data.get('url', 'No URL available')  # Ambil URL dari metadata
        }
    )
    for file_data in converted_files
]


## Chunking

In [11]:
from langchain.schema.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Assuming 'converted_files' from the previous cell contains the document data
documents = [Document(page_content=file_data['content'], metadata={'source': file_data['filename']}) for file_data in converted_files]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [12]:
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model_kwargs = {"device": "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)


# Chromadb

## Create database

In [13]:
from langchain.vectorstores.chroma import Chroma
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

In [14]:
from langchain.chains import RetrievalQA
retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    return_source_documents=True  # Tambahkan sumber dokumen ke output
)



In [17]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    # Use qa({"query": query}) instead of qa.run(query)
    # This will handle multiple output keys correctly
    result = qa({"query": query})
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result['result']) # Access the result using the key 'result'
    # Tampilkan sumber dokumen
    print("\nSources:")
    for doc in result['source_documents']:
        source = doc.metadata.get('source', 'Unknown source')
        url = doc.metadata.get('url', 'No URL available')  # Beri nilai default jika URL tidak tersedia
        print(f"- {source}: {url}")

In [None]:
query = "apa visi indonesia 2045"
test_rag(qa, query)

  result = qa({"query": query})
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


Query: apa visi indonesia 2045



[1m> Entering new RetrievalQA chain...[0m


In [None]:
docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(docs)}")
for doc in docs:
    doc_details = doc.to_json()['kwargs']
    print("Source: ", doc_details['metadata']['source'])
    print("Text: ", doc_details['page_content'], "\n")