In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine, func


In [2]:
# Configuration: input file and database connection.
# Both values can be overridden through environment variables so the notebook
# runs on other machines and real credentials never have to be committed.
# The fallbacks are the original local-development values.

# Path to the zipped Benefits and Cost Sharing PUF CSV (override: BENEFITS_CSV).
benefits_csv = os.environ.get(
    "BENEFITS_CSV",
    r"/Users/matthewguy422/Documents/Healthcare-Value-Explorer-/PUF CSV/benefits-and-cost-sharing-puf.zip",
)

# SQLAlchemy URL of the target PostgreSQL database (override: HEALTHCARE_VALUE_DB_URL).
sqldb_connection = os.environ.get(
    "HEALTHCARE_VALUE_DB_URL",
    "postgresql://postgres:postgres@localhost:5432/healthcare_value_db",
)

In [3]:
# Read the zipped CSV into a pandas DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so every column ends up with one consistent dtype.
# NOTE(review): the saved output shows a warning raised on this read_csv line,
# presumably pandas' mixed-dtype DtypeWarning - confirm it is gone after this change.
# The trade-off is higher peak memory while parsing.
benefits_df = pd.read_csv(benefits_csv, low_memory=False)

# Last expression of the cell -> rendered with the rich DataFrame display.
benefits_df.head()

  benefits_df = pd.read_csv(benefits_csv)


   BusinessYear StateCode  IssuerId SourceName           ImportDate  \
0          2025        AK     21989       HIOS  2024-08-29 01:02:15   
1          2025        AK     21989       HIOS  2024-08-29 01:02:15   
2          2025        AK     21989       HIOS  2024-08-29 01:02:15   
3          2025        AK     21989       HIOS  2024-08-29 01:02:15   
4          2025        AK     21989       HIOS  2024-08-29 01:02:15   

  StandardComponentId             PlanId                      BenefitName  \
0      21989AK0030001  21989AK0030001-00  Routine Dental Services (Adult)   
1      21989AK0030001  21989AK0030001-00     Dental Check-Up for Children   
2      21989AK0030001  21989AK0030001-00        Basic Dental Care - Child   
3      21989AK0030001  21989AK0030001-00              Orthodontia - Child   
4      21989AK0030001  21989AK0030001-00        Major Dental Care - Child   

    CopayInnTier1 CopayInnTier2  ... IsEHB IsCovered QuantLimitOnSvc LimitQty  \
0  Not Applicable           N

In [4]:
# Get unique benefit names.
# Surrounding whitespace is stripped BEFORE de-duplicating: the names are used
# further down as the join key after being stripped, so two raw names that
# differ only by leading/trailing spaces must collapse into a single entry
# here. Otherwise they would receive separate IDs and turn into duplicate join
# keys, multiplying rows in the left merge.
unique_benefits = benefits_df['BenefitName'].dropna().str.strip().unique()
unique_benefit_count = len(unique_benefits)
print(f"Number of unique Benefits: {unique_benefit_count}")

Number of unique Benefits: 266


In [5]:
# Build the benefits lookup table: one row per distinct benefit name, paired
# with a sequential integer ID that starts at 1 and follows the order of
# `unique_benefits`.
benefit_ids = range(1, len(unique_benefits) + 1)
clean_benefits_df = pd.DataFrame(
    {"BenefitsName": unique_benefits, "BenefitsID": benefit_ids}
)
clean_benefits_df.head()

Unnamed: 0,BenefitsName,BenefitsID
0,Routine Dental Services (Adult),1
1,Dental Check-Up for Children,2
2,Basic Dental Care - Child,3
3,Orthodontia - Child,4
4,Major Dental Care - Child,5


In [7]:
# Connect to the PostgreSQL database (needs the psycopg2 driver: pip install psycopg2-binary).
# The URL comes from `sqldb_connection` in the configuration cell so the
# connection string is defined in exactly one place.
engine = create_engine(sqldb_connection)

In [8]:
# Append the benefits lookup rows to the "Benefits" table of the database
# behind `engine`; the DataFrame index is not written.
# NOTE: if_exists="append" inserts the rows again every time this cell runs.
clean_benefits_df.to_sql(
    name="Benefits",
    con=engine,
    if_exists="append",
    index=False,
)

266

In [9]:
# Assemble the BenefitsCostSharing frame from the raw PUF columns.
# Mapping is target column name -> source column name in `benefits_df`.
source_columns = {
    "StateCode": "StateCode",
    "IssuerID": "IssuerId",
    "StandardComponentId": "StandardComponentId",
    "PlanID": "PlanId",
    "BenefitsName": "BenefitName",
    "Exclusion": "Exclusions",
}

benefits_cost_sharing_df = pd.DataFrame({
    # Placeholder key column; filled in later as PlanId-BenefitsId
    "BCS_ID": None,
    **{target: benefits_df[source] for target, source in source_columns.items()},
})

benefits_cost_sharing_df.head()

Unnamed: 0,BCS_ID,StateCode,IssuerID,StandardComponentId,PlanID,BenefitsName,Exclusion
0,,AK,21989,21989AK0030001,21989AK0030001-00,Routine Dental Services (Adult),
1,,AK,21989,21989AK0030001,21989AK0030001-00,Dental Check-Up for Children,
2,,AK,21989,21989AK0030001,21989AK0030001-00,Basic Dental Care - Child,
3,,AK,21989,21989AK0030001,21989AK0030001-00,Orthodontia - Child,
4,,AK,21989,21989AK0030001,21989AK0030001-00,Major Dental Care - Child,


In [10]:
# Normalise the join key on both sides: remove leading/trailing whitespace
# from BenefitsName so the names compare equal in the merge below.
for frame in (benefits_cost_sharing_df, clean_benefits_df):
    frame["BenefitsName"] = frame["BenefitsName"].str.strip()

# Left-join the lookup onto the cost-sharing rows to bring in BenefitsID.
# NOTE(review): a left merge emits one output row per matching lookup row, so
# BenefitsName must be unique in clean_benefits_df for the row count to hold.
benefits_cost_sharing_df = pd.merge(
    benefits_cost_sharing_df,
    clean_benefits_df,
    how="left",
    on="BenefitsName",
)

In [11]:
# Show the merged DataFrame's column names as a plain Python list
merged_columns = list(benefits_cost_sharing_df.columns)
print(merged_columns)

['BCS_ID', 'StateCode', 'IssuerID', 'StandardComponentId', 'PlanID', 'BenefitsName', 'Exclusion', 'BenefitsID']


In [12]:
# Preview name/ID pairs to confirm the merge populated BenefitsID
name_id_preview = benefits_cost_sharing_df.loc[:, ['BenefitsName', 'BenefitsID']].head()
print(name_id_preview)

                      BenefitsName  BenefitsID
0  Routine Dental Services (Adult)           1
1     Dental Check-Up for Children           2
2        Basic Dental Care - Child           3
3              Orthodontia - Child           4
4        Major Dental Care - Child           5


In [13]:
# Build the BCS_ID key as "<PlanID>-<BenefitsID>", with BenefitsID rendered as
# an integer left-padded with zeros to at least three digits.
plan_part = benefits_cost_sharing_df["PlanID"].astype(str)
benefit_part = (
    benefits_cost_sharing_df["BenefitsID"]
    .astype(int)   # raises if any BenefitsID is missing after the merge
    .astype(str)
    .str.zfill(3)
)
benefits_cost_sharing_df["BCS_ID"] = plan_part + "-" + benefit_part

benefits_cost_sharing_df.head()

Unnamed: 0,BCS_ID,StateCode,IssuerID,StandardComponentId,PlanID,BenefitsName,Exclusion,BenefitsID
0,21989AK0030001-00-001,AK,21989,21989AK0030001,21989AK0030001-00,Routine Dental Services (Adult),,1
1,21989AK0030001-00-002,AK,21989,21989AK0030001,21989AK0030001-00,Dental Check-Up for Children,,2
2,21989AK0030001-00-003,AK,21989,21989AK0030001,21989AK0030001-00,Basic Dental Care - Child,,3
3,21989AK0030001-00-004,AK,21989,21989AK0030001,21989AK0030001-00,Orthodontia - Child,,4
4,21989AK0030001-00-005,AK,21989,21989AK0030001,21989AK0030001-00,Major Dental Care - Child,,5


In [14]:
# Drop the BenefitsName column; BenefitsID (added by the merge) remains in the frame.
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps this cell re-runnable: running it a second time is a
# no-op rather than a KeyError on the already-removed column.
benefits_cost_sharing_df = benefits_cost_sharing_df.drop(
    columns=["BenefitsName"],
    errors="ignore",
)

In [15]:
# Report how many rows the BenefitsCostSharing frame currently holds
benefits_cost_sharing_count = benefits_cost_sharing_df.shape[0]

print(f'The number of records in the BenefitsCostSharing table: {benefits_cost_sharing_count}')

The number of records in the BenefitsCostSharing table: 1647036


In [16]:
# Tally occurrences of every BCS_ID and keep only the IDs seen more than once
id_frequencies = benefits_cost_sharing_df['BCS_ID'].value_counts()
duplicate_counts = id_frequencies.loc[id_frequencies.gt(1)]
print(duplicate_counts)

BCS_ID
39424OR1660001-01-038    2
56707OR1420003-00-038    2
10091OR0760001-00-038    2
56707OR1380010-00-038    2
10091OR0750020-00-038    2
                        ..
10091OR0760001-02-038    2
71287OR0420001-01-038    2
56707OR1360004-06-038    2
71287OR0420002-03-038    2
39424OR1700001-01-038    2
Name: count, Length: 210, dtype: int64


In [17]:
# List every row whose BCS_ID occurs more than once (all occurrences kept),
# ordered by BCS_ID so repeated keys sit next to each other.
is_repeated_id = benefits_cost_sharing_df['BCS_ID'].duplicated(keep=False)
duplicates = benefits_cost_sharing_df.loc[is_repeated_id].sort_values(by='BCS_ID')
duplicates.head(20)

Unnamed: 0,BCS_ID,StateCode,IssuerID,StandardComponentId,PlanID,Exclusion,BenefitsID
993269,10091OR0750002-00-038,OR,10091,10091OR0750002,10091OR0750002-00,,38
993315,10091OR0750002-00-038,OR,10091,10091OR0750002,10091OR0750002-00,,38
993347,10091OR0750002-01-038,OR,10091,10091OR0750002,10091OR0750002-01,,38
993393,10091OR0750002-01-038,OR,10091,10091OR0750002,10091OR0750002-01,,38
993425,10091OR0750002-02-038,OR,10091,10091OR0750002,10091OR0750002-02,,38
993471,10091OR0750002-02-038,OR,10091,10091OR0750002,10091OR0750002-02,,38
993503,10091OR0750002-03-038,OR,10091,10091OR0750002,10091OR0750002-03,,38
993549,10091OR0750002-03-038,OR,10091,10091OR0750002,10091OR0750002-03,,38
993581,10091OR0750004-00-038,OR,10091,10091OR0750004,10091OR0750004-00,,38
993627,10091OR0750004-00-038,OR,10091,10091OR0750004,10091OR0750004-00,,38


In [18]:
# Keep only the first row for each BCS_ID so the key is unique;
# later rows carrying an already-seen BCS_ID are discarded.
first_per_id = ~benefits_cost_sharing_df['BCS_ID'].duplicated(keep='first')
benefits_cost_sharing_df = benefits_cost_sharing_df.loc[first_per_id]

# Re-count the records after de-duplication
benefits_cost_sharing_count = benefits_cost_sharing_df.shape[0]

print(f'The number of records in the BenefitsCostSharing table: {benefits_cost_sharing_count}')
benefits_cost_sharing_df.head()

The number of records in the BenefitsCostSharing table: 1646826


Unnamed: 0,BCS_ID,StateCode,IssuerID,StandardComponentId,PlanID,Exclusion,BenefitsID
0,21989AK0030001-00-001,AK,21989,21989AK0030001,21989AK0030001-00,,1
1,21989AK0030001-00-002,AK,21989,21989AK0030001,21989AK0030001-00,,2
2,21989AK0030001-00-003,AK,21989,21989AK0030001,21989AK0030001-00,,3
3,21989AK0030001-00-004,AK,21989,21989AK0030001,21989AK0030001-00,,4
4,21989AK0030001-00-005,AK,21989,21989AK0030001,21989AK0030001-00,,5


In [19]:
# Append the de-duplicated frame to the "BenefitsCostSharing" table of the
# database behind `engine`; the DataFrame index is not written.
# NOTE: if_exists="append" inserts the rows again every time this cell runs.
benefits_cost_sharing_df.to_sql(
    name="BenefitsCostSharing",
    con=engine,
    if_exists="append",
    index=False,
)

826