In [None]:
from census import Census
import os 

# Read the Census API key from the user's environment so that no credential
# is hard-coded in the notebook.
c_apikey = os.environ.get("CENSUS_APIKEY")

# Always bind `c`: it is a Census client when a key is available and None
# otherwise. Without this, a missing key would leave `c` undefined (or, when
# the cell is re-run in a live kernel, pointing at a stale client).
c = Census(c_apikey) if c_apikey else None

if c is None:
    print("User did not set CENSUS_APIKEY in their environment")

: 

In [None]:
#From each .csv file, only use the counties of interest

#Counties to include: 
#Loudoun County, Virginia
#Falls Church City, Virginia
#Alexandria City, Virginia
#Fairfax County, Virginia
#District of Columbia
#Fairfax City, Virginia
#Montgomery County, Maryland
#Prince William County, Virginia
#Prince Georges County, Maryland
#Arlington County, Virginia

import pandas as pd

# Counties and independent cities that make up the DMV study area.
# Rows are selected by exact match against the "County" column of each file.
dmv_areas = [
    "Loudoun County, Virginia",
    "Falls Church City, Virginia",
    "Alexandria City, Virginia",
    "Fairfax County, Virginia",
    "District of Columbia",
    "Fairfax City, Virginia",
    "Montgomery County, Maryland",
    "Prince William County, Virginia",
    "Prince Georges County, Maryland",
    "Arlington County, Virginia"
]

# The five input files, one per indicator.
file1 = "colonoscopy.csv"
file2 = "household_income.csv"
file3 = "SVI_index.csv"
file4 = "college_educated.csv"
file5 = "pap_smear.csv"


def _merge_on_county(left, right, label):
    """Inner-join ``right`` onto ``left`` by "County" without name clashes.

    Any non-key column of ``right`` whose name already exists in ``left`` is
    renamed to ``"<name> (<label>)"`` before merging. Relying on pandas'
    default ``_x``/``_y`` suffixes across several chained merges yields
    ambiguous, and eventually duplicate, column names.

    Parameters:
        left: DataFrame accumulated so far; its column names are kept as-is.
        right: DataFrame to join in; must contain a "County" column.
        label: short dataset tag appended to clashing column names.

    Returns:
        The inner-joined DataFrame.
    """
    clashing = {
        col: f"{col} ({label})"
        for col in right.columns
        if col != "County" and col in left.columns
    }
    return pd.merge(left, right.rename(columns=clashing), on="County", how="inner")


# Load the datasets and give each a fixed set of column names.
data1 = pd.read_csv(file1, header=None)
data1.columns = ["County", "FIPS", "Model-Based Percent(3)", "Lower 95% Confidence Interval", "Upper 95% Confidence Interval"]

data2 = pd.read_csv(file2)
data2.columns = ["County", "FIPS", "Rural-Urban Continuum", "Value (Dollars)", "Rank within US"]

data3 = pd.read_csv(file3)
data3.columns = ["County", "FIPS", "Rural-Urban Continuum", "Value (Index)"]

data4 = pd.read_csv(file4)
data4.columns = ["County", "FIPS", "Rural-Urban Continuum", "Value (Percent)", "People (Education: At Least Bachelor's Degree)", "Rank within US"]

data5 = pd.read_csv(file5, header=None)
# These names belong to data5 (pap smear). Assigning them to data1 again left
# data5 with integer column labels, so data5["County"] raised KeyError below.
data5.columns = ["County", "FIPS", "Model-Based Percent(3)", "Lower 95% Confidence Interval", "Upper 95% Confidence Interval"]

# Keep only the DMV counties and cities from each dataset.
filtered_data1 = data1[data1["County"].isin(dmv_areas)]
filtered_data2 = data2[data2["County"].isin(dmv_areas)]
filtered_data3 = data3[data3["County"].isin(dmv_areas)]
filtered_data4 = data4[data4["County"].isin(dmv_areas)]
filtered_data5 = data5[data5["County"].isin(dmv_areas)]

# Merge all five datasets on the 'County' column. The colonoscopy columns
# (data1) keep their plain names, so "Model-Based Percent(3)" in the output
# refers to colonoscopy; clashing columns from later datasets get a
# dataset-specific tag, e.g. "Model-Based Percent(3) (pap_smear)".
merged_data = _merge_on_county(filtered_data1, filtered_data2, "household_income")
merged_data = _merge_on_county(merged_data, filtered_data3, "SVI_index")
merged_data = _merge_on_county(merged_data, filtered_data4, "college_educated")
merged_data = _merge_on_county(merged_data, filtered_data5, "pap_smear")

# Inner joins silently drop any area missing from (or spelled differently in)
# at least one file, so report which requested areas did not make it through.
missing_areas = sorted(set(dmv_areas) - set(merged_data["County"]))
if missing_areas:
    print(f"Warning: no merged row for: {', '.join(missing_areas)}")

# Save the merged data to a new CSV file
output_file = "merged_dmv_data.csv"
merged_data.to_csv(output_file, index=False)

print(f"Merged data saved to {output_file}")



In [None]:
#Clustering Based on All Features 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the merged DMV table written by the merge cell.
df = pd.read_csv('merged_dmv_data.csv')

# Feature columns used for clustering; County and FIPS are not included.
columns_for_clustering = [
    "Model-Based Percent(3)",
    "Value (Dollars)",
    "Value (Index)",
    "Value (Percent)"
]
df_cluster = df[columns_for_clustering]

# Impute missing values with the column mean so that every row can be clustered.
df_cluster = df_cluster.fillna(df_cluster.mean())

# Standardize the features: K-Means is distance based, so columns on very
# different scales (dollars vs. percentages) would otherwise dominate.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_cluster)

# K-Means clustering (adjust n_clusters as needed).
# n_init is pinned explicitly because the library default differs between
# scikit-learn releases (10 restarts in older versions, "auto" in newer ones),
# so random_state alone would not give the same clustering everywhere.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Visualize the clusters on two of the four features, colored by cluster label.
plt.figure(figsize=(8, 6))
plt.scatter(df['Model-Based Percent(3)'], df['Value (Dollars)'], c=df['Cluster'], cmap='viridis')
plt.xlabel('Model-Based Percent(3)')
plt.ylabel('Value (Dollars)')
plt.title('Clustering DMV Counties')
plt.colorbar(label='Cluster')
plt.show()

# Save the table, including the new 'Cluster' column, to a new file.
df.to_csv('clustered_dmv_counties.csv', index=False)

print("Clustering complete. The data has been saved to 'clustered_dmv_counties.csv'.")


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Reload the merged county table produced by the merge step.
df = pd.read_csv('merged_dmv_data.csv')

# Feature columns fed to the clustering; County and FIPS are not among them.
columns_for_clustering = [
    "Model-Based Percent(3)",
    "Value (Dollars)",
    "Value (Index)",
    "Value (Percent)",
]

# Pull out the feature columns and fill gaps with each column's own mean.
feature_means = df[columns_for_clustering].mean()
df_cluster = df[columns_for_clustering].fillna(feature_means)

# Put every feature on a common scale before measuring distances.
scaler = StandardScaler()
scaler.fit(df_cluster)
df_scaled = scaler.transform(df_cluster)

# Density-based clustering; tune eps (neighborhood radius) and min_samples
# (points needed to form a dense region) as required.
dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan.fit(df_scaled)
df['DBSCAN_Cluster'] = dbscan.labels_

# Scatter two of the features, colored by the assigned DBSCAN label.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    df['Model-Based Percent(3)'],
    df['Value (Dollars)'],
    c=df['DBSCAN_Cluster'],
    cmap='viridis',
    marker='o',
)
ax.set_xlabel('Model-Based Percent(3)')
ax.set_ylabel('Value (Dollars)')
ax.set_title('DBSCAN Clustering of DMV Counties')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

# Persist the table together with its new label column.
df.to_csv('dbscan_clustered_dmv_counties.csv', index=False)

print("DBSCAN clustering complete. The data has been saved to 'dbscan_clustered_dmv_counties.csv'.")
