In [55]:
import pandas as pd

def read_policy(file_path):
    """
    Reads a policy CSV file and returns its contents as a list of dictionaries.
    
    Args:
        file_path (str): The path to the CSV file.
        
    Returns:
        List[Dict]: A list of dictionaries representing the rows in the CSV file.
    """
    df = pd.read_csv(file_path, dtype=str,index_col=0)
    df["document_name"] = ""
    #print(df.columns)
    for idx, row in df.iterrows():
        df.loc[idx,"document_name"] = f"{row['state']}_{row['Bill.Type']}_{row['Number']}_{row['year']}"
    return df

policies = read_policy('/home/vicente/Github/BDLab-Agent/backend/DISES/CodingResults.csv')
display(policies)


Unnamed: 0,state,Bill.Type,Number,year,sess_year,codes,document_name
1,AK,S,205,2024,2024,"Land Acquisition, Specific Location, Authoriza...",AK_S_205_2024
2,AZ,H,2720,2024,2024,"As of Right, Zoning, Urban, Local Gov, Require...",AZ_H_2720_2024
3,AZ,H,2721,2024,2024,"Time of Adoption, Requirement, Urban, Zoning, ...",AZ_H_2721_2024
4,AZ,S,1415,2024,2024,"Specific Location, Urban, Requirement, ADU, Si...",AZ_S_1415_2024
5,CA,A,2345,2020,2019-2020,"State Gov, Requirement, Reporting,Local Gov, R...",CA_A_2345_2020
...,...,...,...,...,...,...,...
271,WA,S,5746,2020,2019-2020,"Affordable, Low-Income, Requirement, State Gov...",WA_S_5746_2020
272,WA,S,5466,2023,2023-2024,"Definition, Floor Area Ratio,NA, Transit Zones...",WA_S_5466_2023
273,WA,S,6173,2024,2023-2024,"Affordable, Income Specification, Local Gov, I...",WA_S_6173_2024
274,WV,H,4502,2022,2022,"Definition,Reporting, Increasing Tax Credits, ...",WV_H_4502_2022


In [56]:
#display(policies)
results = pd.read_csv('/home/vicente/Github/BDLab-Agent/backend/DISES/DISES-policy-coding-quotations.csv', dtype=str)
print(results.columns)

def find_overlap(policies, results):
    rows = []

    for _, row in results.iterrows():
        doc = row.get("document")
        if doc is None:
            continue
        doc = str(doc)

        if not doc.endswith("Emma.pdf"):
            continue

        if doc.endswith(".pdf"):
            doc = doc[:-4]

        if doc.endswith("_Emma"):
            doc = doc[:-5]

        document_name = doc.replace(" ", "_").replace("-", "_")

        raw_codes = str(row.get("codes", ""))
        codes_list = []
        for part in raw_codes.split(","):
            part = part.strip()
            if part:
                codes_list.append(part)

        out = {
            "document_name": document_name,
            "query": row.get("quotation", ""),
            "codes": codes_list,
        }
        rows.append(out)

    if len(rows) == 0:
        return pd.DataFrame(columns=["document_name", "query", "codes"])

    return pd.DataFrame(rows)

overlap = find_overlap(policies, results)

Index(['document', 'quotation', 'codes', 'comment'], dtype='object')


In [57]:

display(overlap)

Unnamed: 0,document_name,query,codes
0,CA_S_20_1400,This bill would enact the Accessory Dwelling U...,"[ADU, Homeowners, Budget Amount Allocated, Inc..."
1,CA_S_20_1400,This bill would provide for submission of the ...,[Voter Referendum]
2,CA_S_20_1400,SEC. 3. Section 1 of this act shall be submitt...,[Voter Referendum]
3,CA_S_20_1400,(a) To make and execute contracts and all othe...,[nan]
4,CA_S_20_1400,"d) To provide advice, technical information, a...",[Technical Assistance]
...,...,...,...
307,CA_AB_20_434,987.010. (a) For any loans issued pursuant to ...,"[Increasing Loan Access, Developers, Veterans,..."
308,CA_AB_20_434,(b) All moneys received by the department in r...,"[Increasing Funding, Increasing Loan Access, D..."
309,CA_AB_20_434,(f) All moneys set aside for the default reser...,"[Increasing Loan Access, Increasing Funding, D..."
310,CA_AB_20_434,(h) This section shall remain in effect only u...,[Repeal]


In [58]:
overlap.to_csv('/home/vicente/Github/BDLab-Agent/backend/utils/overlap.csv', index=False)