In [None]:
### This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print them one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
!pip install llama-index
!pip install llama-index-llms-groq
!pip install PyPDF2
!pip install llama-index-packs-raft-dataset
!pip install llama-index-embeddings-huggingface
!pip install llama-index-embeddings-instructor
!pip install pymupdf
!pip install --upgrade pip
!pip install ipywidgets==8.1.5

In [None]:
from llama_index.packs.raft_dataset import RAFTDatasetPack
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import PyPDF2

In [None]:
# Source manuals, listed in the order their pages appear in the merged file
pdf_files = [
    '/kaggle/input/testing-manual/Circuit Breaker Testing Manual.pdf',
    '/kaggle/input/testing-manual/Power Cable Testing Manual.pdf',
    '/kaggle/input/testing-manual/Transformer Testing Manual.pdf',
]
output_path = '/kaggle/working/Merged_Testing_Manuals.pdf'

# Single writer that accumulates the pages of every manual
pdf_writer = PyPDF2.PdfWriter()

for pdf_file in pdf_files:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Walk the reader's pages directly rather than indexing by position
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

# Flush the accumulated pages to the merged output file
with open(output_path, 'wb') as output_pdf:
    pdf_writer.write(output_pdf)

print("PDF files merged successfully!")


In [None]:
# Read the Groq API key from the environment so the secret is never stored in
# the notebook itself, and fail immediately with a clear message when it is
# absent instead of handing an empty key to the client.
groq_api_key = os.environ.get("GROQ_API_KEY", "")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before running this cell."
    )

# LLM used by the RAFT pack, plus the embedding model passed to it as embed_model.
llm = Groq(model="qwen-2.5-32b", api_key=groq_api_key)
embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
# Configure the RAFT dataset pack over the merged manual, using the LLM and
# embedding model created above (1 question per chunk, 2 distractor docs).
raft_dataset = RAFTDatasetPack(
    file_path="/kaggle/working/Merged_Testing_Manuals.pdf",
    llm=llm,
    embed_model=embeddings,
    num_questions_per_chunk=1,
    num_distract_docs=2,
)

In [None]:
# Chunk the merged PDF with the pack's own chunk size, then show how many
# chunks came back as the cell output.
merged_pdf_path = "/kaggle/working/Merged_Testing_Manuals.pdf"
chunk_size = raft_dataset.chunk_size
chunks = raft_dataset.get_chunks(merged_pdf_path, chunk_size)
len(chunks)

In [None]:
# Run the configured RAFT pack and keep whatever it returns as `dataset`;
# every later cell (inspection, export) works from this object.
dataset = raft_dataset.run()

In [None]:
# Bare expression: the notebook renders the dataset's repr as this cell's output.
dataset

In [None]:
# Length of the first entry under context -> sentences in the first record
first_record = dataset[0]
first_sentences = first_record['context']['sentences'][0]
len(first_sentences)

In [None]:
# Base location shared by both exports; the JSONL file sits next to the
# saved dataset directory with a ".jsonl" suffix.
output_path = "/kaggle/working/RAFTDATA"
jsonl_output_path = f"{output_path}.jsonl"

# Save as .arrow format
dataset.save_to_disk(output_path)

# Save as .jsonl format
dataset.to_json(jsonl_output_path)

In [None]:
import zipfile
import os

def zip_directory(source_dir, archive_path):
    """Compress every file under ``source_dir`` into a zip at ``archive_path``.

    Entries are stored under their path relative to ``source_dir`` so the
    archive does not embed the absolute directory prefix.

    Args:
        source_dir: Directory that is walked recursively and archived.
        archive_path: Destination zip file, opened in write mode.

    Returns:
        True if the archive was written, False if ``source_dir`` is not an
        existing directory (no archive is created in that case).
    """
    # isdir rather than exists: a regular file at this path would otherwise
    # yield an empty archive that is still reported as a success.
    if not os.path.isdir(source_dir):
        return False

    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(source_dir):
            for name in files:
                full_path = os.path.join(root, name)
                # Store the entry relative to the directory being zipped
                zipf.write(full_path, os.path.relpath(full_path, source_dir))
    return True


# Define the directory to be zipped and the output zip file path
directory_to_zip = '/kaggle/working/RAFTDATA'
zip_file_path = '/kaggle/working/RAFTDATA.zip'

if zip_directory(directory_to_zip, zip_file_path):
    print(f"Directory '{directory_to_zip}' has been zipped successfully as '{zip_file_path}'.")
else:
    print(f"The directory '{directory_to_zip}' does not exist.")


In [None]:
import pandas as pd

# Path to the JSONL file
file_path = "/kaggle/working/RAFTDATA.jsonl"

# Read the JSONL file into a DataFrame (lines=True: one JSON record per line)
df = pd.read_json(file_path, lines=True)

# Preview the first few rows. Ending the cell with the expression lets the
# notebook render its rich table instead of print()'s plain-text dump.
df.head()


In [None]:
# Save as a CSV
df.to_csv("/kaggle/working/RAFTDATA.csv", index=False)

# Save as a JSON document: a single array of records. Writing with
# lines=True here would produce JSON Lines, which is not a valid ".json"
# document and would only duplicate the RAFTDATA.jsonl export.
df.to_json("/kaggle/working/RAFTDATA.json", orient="records")

# Save as a Parquet file
df.to_parquet("/kaggle/working/RAFTDATA.parquet", index=False)


In [None]:
# Keep only the columns that hold at least one non-missing value
df_clean = df.dropna(axis="columns", how="all")

# Write the trimmed frame next to the other exports, without the index column
clean_csv_path = "/kaggle/working/RAFTDATA_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)


In [None]:
# Removes the first two columns, keeps everything from the third column onward.
# The frame is re-derived from the exported JSONL instead of slicing `df` in
# place: `df = df.iloc[:, 2:]` strips two MORE columns every time the cell is
# re-run, whereas this form gives the same result on every execution.
df = pd.read_json("/kaggle/working/RAFTDATA.jsonl", lines=True).iloc[:, 2:]


In [None]:
from IPython.display import display

# Display the DataFrame as a table using the notebook's rich display.
# NOTE(review): this renders the whole frame; consider df.head() if it grows large.
display(df)


In [None]:
from IPython.core.display import HTML

# Render the frame to an HTML table string, then wrap it so the notebook
# displays it as the cell output.
df_html = df.to_html()
HTML(df_html)