In [16]:
import pickle
import csv
import os
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from pptx import Presentation
from reportlab.pdfgen import canvas
from pdf2image import convert_from_path
import time
import pickle
from PIL import Image
from io import BytesIO
import base64
from langchain_core.messages import HumanMessage
from pdf2image import convert_from_path
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import os
from tqdm import tqdm

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain_core.prompts import ChatPromptTemplate

In [5]:
# Define the function to load and split the text
def split_text_file(file_path, chunk_size=1000):
    # Open the file and read its content
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Create a TextSplitter instance, here we use CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator="\n",  # You can change the separator based on your needs (e.g., paragraph or sentence split)
        chunk_size=chunk_size,  # Size of each chunk
        chunk_overlap=200  # Allow overlap of 200 characters between chunks for better context retention
    )

    # Split the text into smaller chunks
    chunks = text_splitter.split_text(text)
    return chunks
  # Function to read the CSV file and convert rows to text
def read_csv_as_text(file_path):
    rows_as_text = []
    with open(file_path, mode='r', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        headers = next(csv_reader)  # Read header
        for row in csv_reader:
            # Combine the row data into a single text block, you can adjust this depending on the data
            row_text = " | ".join(row)
            rows_as_text.append(row_text)
    return rows_as_text

# Function to split the CSV text data into smaller chunks using LangChain's CharacterTextSplitter
def split_csv_data(file_path, chunk_size=1000):
    # Read the CSV data as text
    rows_as_text = read_csv_as_text(file_path)

    # Initialize the TextSplitter
    text_splitter = CharacterTextSplitter(
        separator="\n",  # Text will be split per row
        chunk_size=chunk_size,  # Define the chunk size
        chunk_overlap=200  # Optional: Allow overlap to maintain context
    )

    # Combine all rows into a single text block and split
    full_text = "\n".join(rows_as_text)
    chunks = text_splitter.split_text(full_text)
    
    return chunks

In [6]:
# Function to convert Images from PDF
def pdf_to_images(pdf_path):
    """
    Convert PDF file into an array of images, one per page.
    
    :param pdf_path: Path to the PDF file
    :return: List of PIL Image objects (one for each page)
    """
    images = convert_from_path(pdf_path)
    return images

In [9]:
os.environ["GOOGLE_API_KEY"]=os.environ["GEMINI_API_KEY"]

In [10]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")


In [33]:
# Define the directory to scan for files
directory_to_scan = './knowledge_source'
pickle_dir = './PickleFiles'
# Loop through each file in the specified directory
for file_name in tqdm(os.listdir(directory_to_scan),total=len(os.listdir(directory_to_scan))):
    
    pickle_path = os.path.join(pickle_dir, f"{file_name}.pkl")

    # Skip processing if a pickle file already exists
    if os.path.exists(pickle_path):
        print(f"Skipping {file_name}: Pickle file already exists.")
        continue
    
    # Get the full path of the file
    file_path = os.path.join(directory_to_scan, file_name)
    
    # Process text files (.txt)
    if file_name.endswith('.txt'):
        file_data = {}  # Dictionary to hold the file's data
        file_data['type'] = 'txt'  # File type
        file_data['pages'] = []  # List to store text chunks
        
        print(f"Reading Text file: {file_name}")
        # Split the text file into chunks of a specified size
        text_chunks = split_text_file(file_path, chunk_size=1000)
        
        # Store the chunks of text in the 'pages' list
        for idx, chunk in enumerate(text_chunks):
            # Optionally, print chunks to debug
            # print(f"Chunk {idx+1}:\n{chunk}\n")
            file_data['pages'].append(chunk)
        
        # Save the extracted data to a pickle file for later use
        pickle.dump(file_data, open(f"./PickleFiles/{file_name}.pkl", "wb"))

    # Process CSV files (.csv)
    elif file_name.endswith('.csv'):
        file_data = {}  # Dictionary to hold the file's data
        file_data['type'] = 'csv'  # File type
        file_data['pages'] = []  # List to store chunks of the CSV data
        
        print(f"Reading CSV file: {file_name}")
        # Split the CSV data into chunks of a specified size
        csv_chunks = split_csv_data(file_path, chunk_size=1000)
        
        # Store each chunk in the 'pages' list
        for idx, chunk in enumerate(csv_chunks):
            # Optionally, print chunks to debug
            # print(f"Chunk {idx+1}:\n{chunk}\n")
            file_data['pages'].append(chunk)
        
        # Save the extracted data to a pickle file for later use
        pickle.dump(file_data, open(f"./PickleFiles/{file_name}.pkl", "wb"))

    # Process image files (.png)
    elif file_name.endswith('.png'):
        # Open the image using PIL
        image = Image.open(file_path)
        
        # Convert the image to a base64-encoded string
        buffered_image = BytesIO()
        image.save(buffered_image, format="PNG")  # Can change format if needed
        img_base64 = base64.b64encode(buffered_image.getvalue()).decode("utf-8")
        data_url = f"data:image/png;base64,{img_base64}"
        
        # Prepare a message to be sent to the LLM
        message = HumanMessage(
            content=[
                {"type": "text", "text": "Summarise the image in more than 1000 words"},
                {"type": "image_url", "image_url": data_url}
            ]
        )
        
        # Call the LLM to summarize the image
        image_summary = llm.invoke([message])
        
        # Save the summary in the dictionary
        file_data = {}
        file_data['type'] = 'png'
        file_data['pages'] = [image_summary.content]
        
        # Save the summary to a pickle file for later use
        pickle.dump(file_data, open(f"./PickleFiles/{file_name}.pkl", "wb"))

    # Process PDF files (.pdf)
    elif file_name.endswith('.pdf'):
        # Convert each page of the PDF into images
        print(f"Reading PDF file: {file_name}")
        pdf_images = pdf_to_images(file_path)
        
        file_data = {}  # Dictionary to hold the PDF's data
        file_data['type'] = 'pdf'  # File type
        file_data['pages'] = []  # List to store summaries of each image
        
        # Process each image (one per page of the PDF)
        for image in tqdm(pdf_images,total=len(pdf_images)):
            # Convert the image to a base64-encoded string
            buffered_image = BytesIO()
            image.save(buffered_image, format="PNG")  # Can change format if needed
            img_base64 = base64.b64encode(buffered_image.getvalue()).decode("utf-8")
            data_url = f"data:image/png;base64,{img_base64}"
            
            # Prepare a message to be sent to the LLM
            message = HumanMessage(
                content=[
                    {"type": "text", "text": "Summarise the image"},
                    {"type": "image_url", "image_url": data_url}
                ]
            )
            
            # Call the LLM to summarize the image
            image_summary = llm.invoke([message])
            
            # Add the summary to the list of pages
            file_data['pages'].append(image_summary.content)
            
            # Wait to avoid hitting rate limits (if necessary)
            time.sleep(1)
        
        time.sleep(10)  # Wait for 10 seconds after processing all images for rate limits
        # Save the summary data to a pickle file for later use
        pickle.dump(file_data, open(f"./PickleFiles/{file_name}.pkl", "wb"))

    # Skip unsupported file types
    else:
        print(f"Skipping file: {file_name} (unsupported file type)")

  0%|          | 0/17 [00:00<?, ?it/s]

Skipping autor-2015-why-are-there-still-so-many-jobs-the-history-and-future-of-workplace-automation.pdf: Pickle file already exists.
Skipping ssrn-4326257.pdf: Pickle file already exists.
Skipping 88684e36-en.pdf: Pickle file already exists.
Skipping MinervaProject_Integrating-Artificial-Intelligence-Key-Strategies-for-Higher-Education_Insights2023.pdf: Pickle file already exists.
Skipping file: .DS_Store (unsupported file type)
Skipping s11023-018-9482-5.pdf: Pickle file already exists.
Skipping WEF_Future_of_Jobs_2020.pdf: Pickle file already exists.
Skipping gx-eri-decarbonization-report.pdf: Pickle file already exists.
Skipping w24196.pdf: Pickle file already exists.
Skipping s41562-023-01788-2.pdf: Pickle file already exists.
Skipping navigating-future-uncertainty-in-australia-with-megatrends-2023.pdf: Pickle file already exists.
Skipping 1467-8462.12542.pdf: Pickle file already exists.
Skipping JSA annual report - summary.pdf: Pickle file already exists.
Skipping GenerativeAIPape

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
 45%|████▌     | 78/173 [04:57<06:02,  3.82s/it]
 88%|████████▊ | 15/17 [05:06<00:40, 20.41s/it]


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [30]:
directory_to_scan = './knowledge_source'
data=[]
for file_name in os.listdir(directory_to_scan):
    # Get the full path of the file
    file_path = os.path.join(directory_to_scan, file_name)
    if file_name.endswith(('.txt', '.csv', '.pdf', '.png')):
        file=pickle.load(open('./PickleFiles/{}.pkl'.format(file_name),'rb'))
        data_dict={}
        for i,page in enumerate(file['pages']):
            data_dict['File Name']=file_name
            data_dict['File Type']=file_name.split('.')[1]
            data_dict['Page No.']=i
            data_dict['Text']=page
            data.append(data_dict.copy())
        
    else:
        print(f"Skipping file: {file_name} (unsupported file type)")

Skipping file: .DS_Store (unsupported file type)


FileNotFoundError: [Errno 2] No such file or directory: './PickleFiles/Dawson, N. (2021).pdf.pkl'