# Vector Store Index
This notebook creates embeddings of the files and stores them in a vector store index.

## Connecting to Google Drive
Connect (mount) Google Drive if you want to store the index in Google Drive.

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so that paths under it can be used from this notebook.
drive.mount('/content/drive/')

## Installing all the dependencies

In [None]:
!pip show llama-index
!pip install llama-index --upgrade

In [None]:
%%capture
# Core libraries; the --upgrade line makes sure llama-index is at its latest release.
!pip install llama-index
!pip install openai
!pip install pypdf
!pip install --upgrade llama_index

# to use llama-index embeddings
!pip install llama-index-embeddings-openai

# to use arabert as the embedding model
!pip install arabert
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-huggingface
!pip install transformers torch

# NOTE: llama_index.core.node_parser is a module that ships with llama-index
# (installed above); it is not a separate pip distribution, so it has no
# install line of its own.
# psutil is imported by the indexing cells to report memory usage.
!pip install jiwer gradio typing-extensions psutil

## Data and Persist Folder

In [None]:
## -- Transit Security Project -- ##

# Folder holding the data that you want to embed and store in the vector index.
data_folder = "your data folder"  # replace with the path to your data folder
# Location of the vector store index on disk.
PERSIST_DIR = "your vector store index folder"  # replace with the path to your index folder

## Setting up the API

In [None]:
import os
import openai

# Set the OPENAI_API_KEY environment variable so the OpenAI API can be used.
# NOTE(review): do not commit a real key together with this notebook; consider
# loading it from a secret store or prompting for it instead.
os.environ["OPENAI_API_KEY"] = "your_openai_api_key_here"  # replace with your OpenAI API key


## Setting up the Embedding model
By default, LlamaIndex uses OpenAI's `text-embedding-ada-002` model; this notebook overrides that default with `text-embedding-3-large`. LlamaIndex also supports any embedding model offered by LangChain and provides an easy-to-extend base class for implementing your own embeddings.

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

# Build the embedding model once and register that same instance as the global
# default, so the standalone `embed_model` handle and `Settings.embed_model`
# cannot drift apart when the model name is changed.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model

# You can change the chunk size and overlap to suit your needs.
Settings.chunk_size = 512
Settings.chunk_overlap = 20

## Getting all the files in the data folder

In [None]:
import os
import glob

def get_files(root_folder):
    """Collect the label-named data files found under ``root_folder``.

    The folder tree is walked recursively. A file is kept when all of the
    following hold:

    * its name ends with ``.pdf``, ``.csv`` or ``.txt``;
    * its name contains no underscore (underscored names are not the generic
      case and are skipped);
    * its name without the extension is one of the known labels.

    Every candidate that passes the first two checks is printed, whether or
    not its label matches.

    Folders and files are visited in sorted order, so the returned list is the
    same on every run and on every filesystem. Code that resumes work from a
    position in this list depends on that stability.

    Args:
        root_folder: Path of the directory tree to search.

    Returns:
        A list of paths to the matching files, in traversal order. The list
        is empty when nothing matches or ``root_folder`` does not exist.
    """
    # Labels to match against filenames (extension removed).
    # These are the labels present in our dataset; add more as needed.
    labels = {
        'T1495', 'T1485', 'T1595', 'T1134', 'T1040', 'T1132', 'T1098', 'T1069',
        'T1036', 'T1562', 'T1187', 'T1486', 'T1119', 'T1027', 'T1498', 'T1654',
        'T1548', 'T1082', 'T1552', 'T1614', 'T1531', 'T1204', 'T1529', 'T1046',
        'T1489', 'T1195', 'T1566', 'T1659', 'T1059', 'T1213', 'T1133', 'T1080',
        'T1005', 'T1078', 'T1001', 'T1190', 'T1203', 'T1136', 'T1491', 'T1033',
        'T1189', 'T1068', 'T1652', 'T1049', 'T1020', 'T1041', 'T1021', 'T1105',
        'T1518', 'T1200', 'T1053', 'T1557', 'T1056', 'T1087', 'T1565', 'T1499',
        'T1657', 'T1559', 'T1074', 'T1106', 'T1560', 'T1589',
    }
    extensions = ('.pdf', '.csv', '.txt')

    files = []
    for foldername, subfolders, filenames in os.walk(root_folder):
        # Sorting in place tells os.walk to descend into subfolders in a fixed order.
        subfolders.sort()
        for filename in sorted(filenames):
            # Skip unsupported file types and the non-generic (underscored) names.
            if not filename.endswith(extensions) or '_' in filename:
                continue
            print(filename)
            stem, _ext = os.path.splitext(filename)
            if stem in labels:
                files.append(os.path.join(foldername, filename))
    return files


## Creating and Appending the index

 ### Creating the embeddings, index and retriever using Vector Store Index

---








In [None]:
import os.path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.indices.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response.pprint_utils import *
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from openai import OpenAI

# Load aragpt2 model directly
# from transformers import AutoModelForCausalLM

import psutil

# from llama_index.core.node_parser import SimpleNodeParser

DATA_FOLDER = data_folder  # root folder that is searched for the input files
BATCH_SIZE = 200  # number of files loaded and indexed per batch

# Function to print memory usage
def print_memory_usage():
    """Print the percentage reported by ``psutil.virtual_memory()``."""
    used_percent = psutil.virtual_memory().percent
    print(f"Memory Usage: {used_percent}%")

index = None

# Build the vector index from DATA_FOLDER when PERSIST_DIR does not exist yet,
# otherwise load the index already persisted there. In both cases finish by
# creating the retriever and the similarity postprocessor.
try:
    if not os.path.exists(PERSIST_DIR):
        # Look for input files before touching the filesystem, so that an
        # empty data folder does not leave a stray index directory behind.
        files = get_files(DATA_FOLDER)
        print(len(files))
        if not files:
            raise ValueError("No files found in the specified data folder.")

        print(f"Creating directory: {PERSIST_DIR}")
        # makedirs (unlike mkdir) also creates any missing parent folders.
        os.makedirs(PERSIST_DIR, exist_ok=True)

        # Process files in batches
        for i in range(0, len(files), BATCH_SIZE):
            batch_files = files[i:i + BATCH_SIZE]
            # The final batch may be shorter than BATCH_SIZE; report its true end.
            last_file = i + len(batch_files) - 1
            documents = SimpleDirectoryReader(input_files=batch_files).load_data()

            if index is None:
                print("Creating VectorStoreIndex...")
                index = VectorStoreIndex.from_documents(documents=documents)
                print(f"--- Files {i} to {last_file} (Created new Index) ---")
            else:
                for document in documents:
                    index.insert(document=document)
                print(f"--- Files {i} to {last_file} (added to the existing Index) ---")

            # Persist the index after each batch so finished batches survive a later failure
            print(f"Persisting index to {PERSIST_DIR}...")
            index.storage_context.persist(persist_dir=PERSIST_DIR)
            print("Index persisted successfully.")

            # Print memory usage
            print_memory_usage()

    else:
        # Retrieving a storage context from already existing context and loading the index
        print(f"Loading index from storage: {PERSIST_DIR}...")
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        index = load_index_from_storage(storage_context=storage_context)
        print("Index loaded successfully.")

    # Check if index is loaded successfully
    if index is None:
        raise ValueError("Failed to load or create the index.")

    retriever = VectorIndexRetriever(index=index, similarity_top_k=20)
    postprocessor = SimilarityPostprocessor(similarity_cutoff=0.60)
    print("--- Successfully created the embeddings, index, and retriever ---")

except Exception as e:
    print(f"Caught an exception: {e}")
    # os.rmdir can only remove an empty directory and raises otherwise, which
    # would hide the error printed above. Remove the directory only when
    # nothing has been persisted into it; batches already saved are kept.
    if os.path.isdir(PERSIST_DIR) and not os.listdir(PERSIST_DIR):
        os.rmdir(PERSIST_DIR)

### This code appends new files to the existing index





In [None]:
import os.path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.indices.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response.pprint_utils import *
from llama_index.core import Settings

from llama_index.embeddings.openai import OpenAIEmbedding
from openai import OpenAI

# Load aragpt2 model directly
from transformers import AutoModelForCausalLM

import psutil

# from llama_index.core.node_parser import SimpleNodeParser

DATA_FOLDER = data_folder  # root folder that is searched for the input files
BATCH_SIZE = 1  # number of files loaded and indexed per batch
starting_index = 8  # file index from where the vector index starts being appended

# Function to print memory usage
def print_memory_usage():
    """Report current virtual memory usage, as a percentage, on stdout."""
    percent_used = psutil.virtual_memory().percent
    print(f"Memory Usage: {percent_used}%")

index = None

# Load the index persisted in PERSIST_DIR and append to it the files found in
# DATA_FOLDER from position `starting_index` onwards, persisting after every
# batch. Finish by creating the retriever and the similarity postprocessor.
try:
    if os.path.exists(PERSIST_DIR):
        print(f"Loading index from storage: {PERSIST_DIR}...")
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        index = load_index_from_storage(storage_context=storage_context)
        print("Index loaded successfully.")

        files = get_files(DATA_FOLDER)
        if not files:
            raise ValueError("No files found in the specified data folder.")

        # Stop at the real number of files: a fixed upper bound either yields
        # empty batches (fewer files) or silently skips files (more files).
        for i in range(starting_index, len(files), BATCH_SIZE):
            batch_files = files[i:i + BATCH_SIZE]
            # The final batch may be shorter than BATCH_SIZE; report its true end.
            last_file = i + len(batch_files) - 1
            documents = SimpleDirectoryReader(input_files=batch_files).load_data()

            # The index was loaded above, so documents always go into the existing index.
            for document in documents:
                index.insert(document=document)

            print(f"--- Files {i} to {last_file} (added to the existing Index) ---")

            # Persist the index after each batch so finished batches survive a later failure
            print(f"Persisting index to {PERSIST_DIR}...")
            index.storage_context.persist(persist_dir=PERSIST_DIR)
            print("Index persisted successfully.")

            # Print memory usage
            print_memory_usage()

    else:
        # Nothing to append to: the index has to be created first.
        print("PERSIST dir does not exist. Create from the start")

    # Check if index is loaded successfully
    if index is None:
        raise ValueError("Failed to load or create the index.")

    retriever = VectorIndexRetriever(index=index, similarity_top_k=20)
    postprocessor = SimilarityPostprocessor(similarity_cutoff=0.60)
    print("--- Successfully created the embeddings, index, and retriever ---")

except Exception as e:
    print(f"Caught an exception: {e}")
    # PERSIST_DIR is left untouched on failure so the batches persisted so far are kept.