In [1]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import sys
import pandas as pd
from pandasql import sqldf
import re

path = r"<insert-dataset-path-here>"

df = pd.read_csv(path)  

# Get the column names
column_names = df.columns

# Get the first row values
first_row_values = df.iloc[0]

# Calculate the total number of characters in the column names
total_chars_in_column_names = sum(len(name) for name in column_names)

# Calculate the total number of characters in the first row
total_chars_in_first_row = sum(len(str(value)) for value in first_row_values)

chunk_size = int((total_chars_in_column_names + total_chars_in_first_row)*1.4)


DB_FAISS_PATH = r"<insert-path>"
loader = CSVLoader(file_path=path, encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()
# print(data)

# Split the text into Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
text_chunks = text_splitter.split_documents(data)

print(len(text_chunks))

for chunk in text_chunks:
    print(chunk,"\n")


996
page_content='Passenger_Id: 1\nSurvived: 0\nPclass: 3\nName: Braund, Mr. Owen Harris\nSex: male\nAge: 22\nSibSp: 1\nParch: 0\nTicket: A/5 21171\nFare: 7.25\nCabin: \nEmbarked: S' metadata={'source': 'C:\\Users\\ShubhamGupta\\Downloads\\Data.csv', 'row': 0} 

page_content='Passenger_Id: 2\nSurvived: 1\nPclass: 1\nName: Cumings, Mrs. John Bradley (Florence Briggs Thayer)\nSex: female\nAge: 38\nSibSp: 1\nParch: 0\nTicket: PC 17599\nFare: 71.2833' metadata={'source': 'C:\\Users\\ShubhamGupta\\Downloads\\Data.csv', 'row': 1} 

page_content='Cabin: C85\nEmbarked: C' metadata={'source': 'C:\\Users\\ShubhamGupta\\Downloads\\Data.csv', 'row': 1} 

page_content='Passenger_Id: 3\nSurvived: 1\nPclass: 3\nName: Heikkinen, Miss. Laina\nSex: female\nAge: 26\nSibSp: 0\nParch: 0\nTicket: STON/O2. 3101282\nFare: 7.925\nCabin: \nEmbarked: S' metadata={'source': 'C:\\Users\\ShubhamGupta\\Downloads\\Data.csv', 'row': 2} 

page_content='Passenger_Id: 4\nSurvived: 1\nPclass: 1\nName: Futrelle, Mrs. Jacqu

In [2]:
# Download Sentence Transformers Embedding From Hugging Face
embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# Converting the text Chunks into embeddings and saving the embeddings into FAISS Knowledge Base
docsearch = FAISS.from_documents(text_chunks, embeddings)

In [4]:
docsearch.save_local(DB_FAISS_PATH)

In [5]:
patients = df

In [6]:
#query = "What is the value of GDP per capita of Finland provided in the data?"

#docs = docsearch.similarity_search(query, k=3)

#print("Result", docs)

llm = CTransformers(model=r"<insert path to model llama-2-7b-chat.ggmlv3.q8_0.bin>",
                    model_type="llama",
                    config={"max_new_tokens": 2048, "context_length": 4096 , "temperature": 0})

qa = ConversationalRetrievalChain.from_llm(llm, retriever=docsearch.as_retriever(), verbose= True)


In [7]:

chat_history = []
#     query = "What is this dataset about?"
query = input(f"Input Prompt: ")
if query == 'exit':
    print('Exiting')
    sys.exit()

edited_query = "Write a SQL query to find " + query


result = qa({"question": edited_query, "chat_history":chat_history})
Response = result['answer']

print("Response: ",Response)
    


Input Prompt: number of duplicate records


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Passenger_Id: 473
Survived: 1
Pclass: 2
Name: West, Mrs. Edwy Arthur (Ada Mary Worth)
Sex: female
Age: 33
SibSp: 1
Parch: 2
Ticket: C.A. 34651
Fare: 27.75
Cabin:

Passenger_Id: 165
Survived: 0
Pclass: 3
Name: Panula, Master. Eino Viljami
Sex: male
Age: 1
SibSp: 4
Parch: 1
Ticket: 3101295
Fare: 39.6875
Cabin: 
Embarked: S

Passenger_Id: 269
Survived: 1
Pclass: 1
Name: Graham, Mrs. William Thompson (Edith Junkins)
Sex: female
Age: 58
SibSp: 0
Parch: 1
Ticket: PC 17582
Fare: 153.4625

Passenger_Id: 33
Survived: 1
Pclass: 3
Name: Glynn, Miss. Mary Agatha
Sex: female
Age: 
SibSp: 0
Parch: 0
Ticket: 335677
Fare: 7.75
Cabin: 
Embarked: Q

Question: Write a S

In [8]:
# Regular expression pattern to match SQL queries enclosed in quotes
pattern = r'(?i)(SELECT.*?;)'

# Find all matches of the pattern in the input string
matches = re.findall(pattern, Response, re.IGNORECASE)

# Print all SQL queries found
for match in matches:
    print(match)




SELECT COUNT(*) FROM passengers WHERE passenger_id IN (SELECT passenger_id FROM passengers GROUP BY passenger_id HAVING COUNT(*) > 1);


In [9]:
passengers = df
# Use pandasql to execute the SQL query
result = sqldf(matches[0], locals())
# Print the result
if( result.empty ):
    print("No records")
    
else:
    print(result)

   COUNT(*)
0         0
