In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
# Read the local .env file located by find_dotenv(); the return value is discarded.
_ = load_dotenv(dotenv_path=find_dotenv())

# Subscript access (not .get) means a missing OPENAI_API_KEY raises KeyError here.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from langchain_openai import ChatOpenAI
# Model identifier handed to ChatOpenAI on the next line.
llm_name = 'gpt-4-turbo-preview'
# Chat model object shared with later cells; it is the first argument passed to
# ConversationalRetrievalChain.from_llm when the QA chain is built.
llm = ChatOpenAI(model_name = llm_name, temperature = 0)

In [3]:
from langchain.document_loaders.csv_loader import CSVLoader
# Load one processed CSV through CSVLoader.
# NOTE(review): both `loader` and `sections` are reassigned by the later cell that
# loads `file` (portfolio_data.csv), and nothing visible in between reads them —
# confirm whether this cell is still needed.
loader = CSVLoader(file_path = '../dat/526963_processed_data.csv')
sections = loader.load()

In [114]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
import pandas as pd
import numpy as np

In [8]:
# Path of a single project export; read by the exploratory pd.read_csv cell below.
file = '../chatbot_doc_export_231/246024_data.csv'

In [58]:
# Read the CSV currently referenced by `file`; the trailing expression makes the
# cell display the frame's (rows, columns) tuple.
df = pd.read_csv(file)
df.shape

(10, 8)

In [49]:
# First record of df: iloc[0] yields a Series, so turn it into a column frame and
# flip it to get a one-row DataFrame with the original column labels.
x = df.iloc[0].to_frame().T
x

Unnamed: 0,portfolio_id,project_id,section_reference,category,section,subsection,cost_related,docu_txt
0,231,246024,1.1,Property Description,GENERAL PROPERTY DATA - EXECUTIVE SUMMARY TABLE,,False,ADA-designated parking count - Property Data i...


In [50]:
# Second record of df as a one-row DataFrame (Series -> column frame -> flipped).
y = df.iloc[1].to_frame().T
y

Unnamed: 0,portfolio_id,project_id,section_reference,category,section,subsection,cost_related,docu_txt
1,231,246024,3.1,Property Characteristics,Parcel Configuration,,False,Acreage is and comprised approximately 14.1 ac...


In [51]:
# Outer-join the two one-row frames on the id columns they share. All remaining
# columns exist on both sides, so they come back with pandas' _x / _y suffixes.
giant_row = x.merge(y, how='outer', on=['portfolio_id', 'project_id'])
giant_row

Unnamed: 0,portfolio_id,project_id,section_reference_x,category_x,section_x,subsection_x,cost_related_x,docu_txt_x,section_reference_y,category_y,section_y,subsection_y,cost_related_y,docu_txt_y
0,231,246024,1.1,Property Description,GENERAL PROPERTY DATA - EXECUTIVE SUMMARY TABLE,,False,ADA-designated parking count - Property Data i...,3.1,Property Characteristics,Parcel Configuration,,False,Acreage is and comprised approximately 14.1 ac...


In [131]:
# combine one csv into one row
def combine_rows(filename: str, name: str) -> pd.DataFrame:
    """Collapse all rows of one CSV file into a single wide row.

    The file is read with ``pd.read_csv``. Row 0 becomes the left-hand frame and
    every following row is left-joined onto it, matching that row's
    ``portfolio_id`` / ``project_id`` against the ids of row 0. Because the join
    is a left join, a row whose ids differ from row 0 still adds its columns but
    leaves them empty.

    Args:
        filename: Path of the CSV file to flatten. It must contain
            ``portfolio_id`` and ``project_id`` columns.
        name: Label embedded in the temporary per-row column suffixes
            (``_<name>_<row>``) that keep the joined columns distinct. The
            suffixes are replaced by integers before returning, so ``name``
            does not show up in the result.

    Returns:
        A DataFrame whose columns are labelled 0..n-1 and hold the values of
        row 0, then row 1, and so on, in source column order.
    """
    source = pd.read_csv(filename)

    def tagged_row(position: int, tag: str) -> pd.DataFrame:
        # One source record as a one-row frame with `tag` appended to every label.
        return source.iloc[position].to_frame().T.add_suffix(tag)

    # Row 0 anchors the result; the remaining rows are attached to it one by one.
    wide = tagged_row(0, f'_{name}_0')

    for i in range(1, len(source)):
        addition = tagged_row(i, f'_{name}_{i}')
        wide = wide.merge(
            addition,
            how='left',
            left_on=[f'portfolio_id_{name}_0', f'project_id_{name}_0'],
            right_on=[f'portfolio_id_{name}_{i}', f'project_id_{name}_{i}'],
        )

    # Replace the suffixed labels with plain positional integers: flip the frame,
    # drop the (label) index, and flip back.
    flipped = wide.T.reset_index(drop=True)
    return flipped.T

# Flatten the CSV currently referenced by `file` into one wide row; '246024' is the
# label used for the temporary column suffixes inside combine_rows.
x = combine_rows(file, '246024')
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,231,246024,1.1,Property Description,GENERAL PROPERTY DATA - EXECUTIVE SUMMARY TABLE,,False,ADA-designated parking count - Property Data i...,231,246024,...,,There is an Spread Reserve cost item regarding...,231,246024,5.5.2,,,,,There is an Single-year event Reserve cost ite...


In [132]:
# Same flattening for a second export, to compare its width with the previous one.
y = combine_rows('../chatbot_doc_export_231/246025_data.csv', '246025')
y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,231,246025,3.1,Property Characteristics,Parcel Configuration,,False,Acreage is and comprised approximately 18.47 a...,231,246025,...,,There is an Single-year event Reserve cost ite...,231,246025,5.2,Mechanical and Electrical Systems,"Heating, Ventilation, and Air Conditioning (HVAC)",,,There is an Spread Reserve cost item regarding...


In [133]:
def new_csv(folder: str) -> pd.DataFrame:
    """Flatten every CSV file in ``folder`` to one wide row and stack the rows.

    Each ``*.csv`` file directly inside ``folder`` is passed to ``combine_rows``
    and contributes one row to the result. Files are processed in sorted name
    order so the output is reproducible. Subdirectories and non-CSV entries
    (for example notebook checkpoint folders) are skipped.

    Args:
        folder: Directory containing the CSV exports. A trailing path separator
            is optional.

    Returns:
        A DataFrame with one row per CSV file and integer column labels. Rows
        from files with fewer values are padded with NaN on the right.

    Raises:
        ValueError: If ``folder`` contains no CSV files.
    """
    # os.listdir gives no ordering guarantee, so sort for a stable row order.
    csv_names = sorted(
        entry for entry in os.listdir(folder)
        if entry.lower().endswith('.csv')
        and os.path.isfile(os.path.join(folder, entry))
    )
    if not csv_names:
        raise ValueError(f'no CSV files found in {folder!r}')

    # os.path.join works whether or not `folder` ends with a separator.
    # The first six characters of the file name are only used by combine_rows as
    # a temporary column-label suffix.
    all_rows = [
        combine_rows(os.path.join(folder, csv_name), csv_name[:6])
        for csv_name in csv_names
    ]

    # Every flattened frame is labelled 0..n-1, so concat lines the columns up by
    # position and fills the missing tail of shorter rows with NaN. No manual
    # padding is needed.
    combined_df = pd.concat(all_rows, ignore_index=True)
    return combined_df

# Build one wide row per file found in the export folder and stack them.
x = new_csv('../chatbot_doc_export_231/')
# index=False keeps the DataFrame index out of the written CSV.
x.to_csv('portfolio_data.csv', index=False)

In [134]:
# Rebind `file` (previously a single project export) to the combined CSV that the
# new_csv cell writes; the CSVLoader cell below reads this path.
file = 'portfolio_data.csv'

In [140]:
# load documents
loader = CSVLoader(file_path = file)
sections = loader.load()

# split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
secs = text_splitter.split_documents(sections)

# define embedding
embeddings = OpenAIEmbeddings()

# create vector database from data
vectordb = Chroma.from_documents(secs, embeddings)

# define retriever
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 100})

# memory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# prompt
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Provide a thorough explanation of your reasoning, using bullet points for clarity where needed.
Conclude each response with 'Thank you for asking, Is there anything else I can help you with?' placed 1 line after the answer.

{context}

Return the output in the following format: 
Question: {question}
Answer:

Thank you for asking, Is there anything else I can help you with?
"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template)

#ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
    verbose = True,
    combine_docs_chain_kwargs={'prompt': QA_CHAIN_PROMPT}
)

In [141]:
# Ask the conversational chain one question and print only its answer text.
question = "What is the total cost of everything?"
# Calling the chain object directly (qa({...})) goes through Chain.__call__, which
# is deprecated in the LangChain 0.1 line; invoke() takes the same input dict and
# returns the same output dict.
result = qa.invoke({"question": question})
print(result['answer'])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Provide a thorough explanation of your reasoning, using bullet points for clarity where needed.
Conclude each response with 'Thank you for asking, Is there anything else I can help you with?' placed 1 line after the answer.

40: 231
41: 246057
42: 4.4.1
43: 
44: 
45: 
46: 
47: There is an Single-year event Reserve cost item regarding Roof replacement - EPDM for total unit of 164000 SF that have 6 years remaining useful life with total cost of 984000.0 dollars, year 6 costs $984000.0.
48: 231
49: 246057
50: 5.2
51: Mechanical and Electrical Systems
52: Heating, Ventilation, and Air Conditioning (HVAC)
53: 
54: 
55: There is an Spread Reserve cost item regarding HVAC package unit (RTU), Replac