In [1]:
import sys
import pandas as pd
from pandasql import sqldf
import re
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [2]:
 
def load_data(path):
    df = pd.read_csv(path)  
    return df
 
def calculate_chunk_size(df):
    column_names = df.columns
    first_row_values = df.iloc[0]
    total_chars_in_column_names = sum(len(name) for name in column_names)
    total_chars_in_first_row = sum(len(str(value)) for value in first_row_values)
    chunk_size = int((total_chars_in_column_names + total_chars_in_first_row)*1.4)
    return chunk_size
 
def load_and_split_data(path, chunk_size):
    loader = CSVLoader(file_path=path, encoding="utf-8", csv_args={'delimiter': ','})
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    text_chunks = text_splitter.split_documents(data)
    return text_chunks
 
def create_embeddings(text_chunks, model_name):
    embeddings = HuggingFaceEmbeddings(model_name = model_name)
    docsearch = FAISS.from_documents(text_chunks, embeddings)
    return docsearch
 
def save_embeddings(docsearch, DB_FAISS_PATH):
    docsearch.save_local(DB_FAISS_PATH)
 
def load_llm_model(model_path, config):
    llm = CTransformers(model=model_path,
                        model_type="llama",
                        config=config)
    return llm
 
def create_conversational_chain(llm, retriever):
    qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever)
    return qa
 
def chat(qa):
    while(True):
        chat_history = []
        query = input(f"Input Prompt: ")
        if query == 'exit':
            print('Exiting')
            sys.exit()
        edited_query =  query 
        result = qa({"question": edited_query, "chat_history":chat_history})
        Response = result['answer']
        print("Response: ",Response)
 

In [3]:
def main():
    path = r"C:\Users\ShubhamGupta\Downloads\Data.csv"
    DB_FAISS_PATH = r"C:\Users\ShubhamGupta\db_faiss"
    model_path = r"C:\Users\ShubhamGupta\Downloads\llama-2-7b-chat.ggmlv3.q8_0.bin"
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    config = {"max_new_tokens": 2048, "context_length": 4096 , "temperature": 0}
 
    df = load_data(path)
    chunk_size = calculate_chunk_size(df)
    text_chunks = load_and_split_data(path, chunk_size)
    docsearch = create_embeddings(text_chunks, model_name)
    save_embeddings(docsearch, DB_FAISS_PATH)
    llm = load_llm_model(model_path, config)
    qa = create_conversational_chain(llm, docsearch.as_retriever())
    chat(qa)
 
if __name__ == "__main__":
    main()

Input Prompt: What is the purpose of the dataset? 
Response:   The purpose of this dataset is to provide information about passengers on the Titanic, including their name, age, sex, class, fare, and survival status. This information can be used for various purposes such as research, analysis, or visualization.
Input Prompt: What type of data is it (e.g., marketing, transactional, dimensional, etc.)? 
Response:   Based on the provided context, it appears that this data is transactional in nature. The data includes information about passengers on the Titanic, including their names, ages, genders, ticket numbers, fares, and cabin assignments. This type of data is typically used for business purposes, such as managing bookings, tracking passenger information, and conducting market research.
Input Prompt: What are the key entities in the dataset? 
Response:   The key entities in the dataset are Passenger_Id, Survived, Pclass, Name, Sex, Age, SibSp, and Parch. These entities represent the di

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
