# Cleaning the Data

In [1792]:
import pandas as pd
import ast
import numpy as np

In [1793]:
county = "Dane, WI"

## Cleaning the NREL Data

In [1794]:
nrel = pd.read_csv("dSTAC_US.csv", dtype={"state_fips": "object", "county_fips": "object"})
nrel.head()

Unnamed: 0,state_abbr,state_fips,county_fips,sector_abbr,energy_value_us_dollars_per_kwh,percent_customers_with_nonzero_sys_size,avg_roof_sqft_customers_with_nonzero_sys_size,avg_roof_sqft_total,hourly_capacity_factor,hourly_capacity_factor_scalar
0,AL,1,1,com,0.0911,0.99,2473.0,4196.0,"[0,0,0,0,0,0,0,799,4190,6338,7315,7366,7049,65...",10000
1,AL,1,1,ind,0.1143,1.0,4196.0,4196.0,"[0,0,0,0,0,0,0,799,4190,6338,7315,7366,7049,65...",10000
2,AL,1,1,res,0.1037,0.54,415.0,349.0,"[0,0,0,0,0,0,0,799,4190,6338,7315,7366,7049,65...",10000
3,AL,1,3,com,0.0743,0.56,2596.0,3973.0,"[0,0,0,0,0,0,0,674,1976,2935,3579,1372,4828,55...",10000
4,AL,1,3,ind,0.0715,0.34,6109.0,3973.0,"[0,0,0,0,0,0,0,674,1976,2935,3579,1372,4828,55...",10000


In [1795]:
nrel = nrel.rename(columns={
    "state_abbr": "State Abbreviation",
    "state_fips": "State FIPS",
    "county_fips": "County FIPS",
    "sector_abbr": "Sector Abbreviation",
    "energy_value_us_dollars_per_kwh": "Energy Value US Dollars Per kWh",
    "percent_customers_with_nonzero_sys_size": "Percent Customers With Nonzero System Size",
    "avg_roof_sqft_customers_with_nonzero_sys_size": "Average Roof Sqft Customers With Nonzero System Size",
    "avg_roof_sqft_total": "Average Root Sqft Total",
    "hourly_capacity_factor": "Hourly Capacity Factor",
    "hourly_capacity_factor_scalar": "Hourly Capacity Factor Scalar"
})

In [1796]:
nrel["State FIPS"] = nrel["State FIPS"].apply(lambda x: x.zfill(2))
nrel["County FIPS"] = nrel["County FIPS"].apply(lambda x: x.zfill(3))

In [1797]:
full_fips = {
    "04013": "Maricopa, AZ",
    "06037": "Los Angeles, CA",
    "12025": "Dade, FL",
    "36047": "Kings, NY",
    "53033": "King, WA",
    "17031": "Cook, IL",
    "32003": "Clark, NV",
    "48201": "Harris, TX",
    "55025": "Dane, WI",
    "51059": "Fairfax, VA"
}

In [1798]:
nrel["Full FIPS"] = nrel["State FIPS"] + nrel["County FIPS"]
nrel["Full FIPS"].sample(5)

4707    30011
4150    29033
244     05029
528     06037
3442    25009
Name: Full FIPS, dtype: object

In [1799]:
"12025" in nrel["Full FIPS"].unique()

False

Dade, FL is not present in the NREL data. It will be dropped from the VC data later.

In [1800]:
del full_fips["12025"]

In [1801]:
nrel = nrel[nrel["Full FIPS"].isin(full_fips.keys())]

In [1802]:
nrel["County"] = nrel["Full FIPS"].apply(lambda x: full_fips[x])
print(nrel["County"].unique())
print(len(nrel["County"].unique()))

['Maricopa, AZ' 'Los Angeles, CA' 'Cook, IL' 'Clark, NV' 'Kings, NY'
 'Harris, TX' 'Fairfax, VA' 'King, WA' 'Dane, WI']
9


In [1803]:
nrel["Hourly Capacity Factor"] = nrel["Hourly Capacity Factor"].apply(lambda x: ast.literal_eval(x))

In [1804]:
nrel = nrel.explode("Hourly Capacity Factor").reset_index(drop=True)
nrel.head()

Unnamed: 0,State Abbreviation,State FIPS,County FIPS,Sector Abbreviation,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,Full FIPS,County
0,AZ,4,13,com,0.0638,1.0,1880.0,1880.0,0,10000,4013,"Maricopa, AZ"
1,AZ,4,13,com,0.0638,1.0,1880.0,1880.0,0,10000,4013,"Maricopa, AZ"
2,AZ,4,13,com,0.0638,1.0,1880.0,1880.0,0,10000,4013,"Maricopa, AZ"
3,AZ,4,13,com,0.0638,1.0,1880.0,1880.0,0,10000,4013,"Maricopa, AZ"
4,AZ,4,13,com,0.0638,1.0,1880.0,1880.0,0,10000,4013,"Maricopa, AZ"


In [1805]:
def add_datetime(group):
    start = pd.Timestamp("2024-01-01 00:00:00")
    group["Datetime"] = [start + pd.Timedelta(hours=i) for i in range(len(group))]
    return group

nrel = nrel.groupby(["County", "Sector Abbreviation"]).apply(add_datetime, include_groups=False).reset_index(drop=False)

In [1806]:
nrel[nrel["State Abbreviation"] == "WI"]

Unnamed: 0,County,Sector Abbreviation,level_2,State Abbreviation,State FIPS,County FIPS,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,Full FIPS,Datetime
52560,"Dane, WI",com,210240,WI,55,025,0.0762,0.69,7431.0,7375.0,0,10000,55025,2024-01-01 00:00:00
52561,"Dane, WI",com,210241,WI,55,025,0.0762,0.69,7431.0,7375.0,0,10000,55025,2024-01-01 01:00:00
52562,"Dane, WI",com,210242,WI,55,025,0.0762,0.69,7431.0,7375.0,0,10000,55025,2024-01-01 02:00:00
52563,"Dane, WI",com,210243,WI,55,025,0.0762,0.69,7431.0,7375.0,0,10000,55025,2024-01-01 03:00:00
52564,"Dane, WI",com,210244,WI,55,025,0.0762,0.69,7431.0,7375.0,0,10000,55025,2024-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78835,"Dane, WI",res,236515,WI,55,025,0.1242,1.00,426.0,426.0,0,10000,55025,2024-12-30 19:00:00
78836,"Dane, WI",res,236516,WI,55,025,0.1242,1.00,426.0,426.0,0,10000,55025,2024-12-30 20:00:00
78837,"Dane, WI",res,236517,WI,55,025,0.1242,1.00,426.0,426.0,0,10000,55025,2024-12-30 21:00:00
78838,"Dane, WI",res,236518,WI,55,025,0.1242,1.00,426.0,426.0,0,10000,55025,2024-12-30 22:00:00


In [1807]:
# Divide hourly_capacity_factor by hourly_capacity_value_scalar to render generation in the correct unit (kWh/kW)
# nrel["Real Generation"] = nrel.apply(lambda row: [hour / row["Hourly Capacity Factor Scalar"] for hour in ast.literal_eval(row["Hourly Capacity Factor"])], axis=1)
# nrel.apply(lambda x: print(x.keys())) # ["Hourly Capacity Factor"] / x["Hourly Capacity Factor Scalar"]
nrel["Real Generation"] = nrel["Hourly Capacity Factor"] / nrel["Hourly Capacity Factor Scalar"]
nrel.head()

Unnamed: 0,County,Sector Abbreviation,level_2,State Abbreviation,State FIPS,County FIPS,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,Full FIPS,Datetime,Real Generation
0,"Clark, NV",com,78840,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 00:00:00,0.0
1,"Clark, NV",com,78841,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 01:00:00,0.0
2,"Clark, NV",com,78842,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 02:00:00,0.0
3,"Clark, NV",com,78843,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 03:00:00,0.0
4,"Clark, NV",com,78844,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 04:00:00,0.0


In [1808]:
# for county in nrel["County"].unique():
#     new_row = {"County": county}

#     nrel[nrel["County"] == county]

In [1809]:
# def agg_func(x):
#     # Transpose the lists so that matching indices of the lists are grouped
#     # [[1, 2, 3], [4, 5, 6], [7, 8, 9]] -> [[1, 4, 7], [2, 5, 8], [3, 6, 9]]
#     transposed = zip(*x.values)

#     # Calculate the average of each group of indices
#     return [sum(values) / len(values) for values in transposed]

# nrel = nrel.groupby("State Abbreviation")["Real Generation"].agg(agg_func)
# print(nrel)

In [1810]:
power_density = 160   # 160 W/m^2 (Rooftop Solar Photovoltaic Technical Potential in the United States: A Detailed Assessment)

# Convert power_density from W/m^2 to kW/m^2
power_density = power_density / 1000   # 1000 W = 1 kW

nrel = nrel.reset_index(drop=False)

# Convert hourly_capacity_factor (kWh) to kWh/m^2
nrel["Energy Per Area"] = nrel["Real Generation"] * power_density  # kWh/kW * kW/m^2 = kWh/m^2
nrel[["County", "Energy Per Area"]].sample(5).head()

Unnamed: 0,County,Energy Per Area
6487,"Clark, NV",0.052688
125771,"Harris, TX",0.104624
144223,"King, WA",0.013072
227748,"Maricopa, AZ",0.128368
13124,"Clark, NV",0.0


In [1811]:
for county in nrel["County"].unique():
    # print(len(nrel[nrel["County"] == county]))
    assert len(nrel[nrel["County"] == county]) / 3 == 8760   # Divide by 3 for 3 sectors

In [1812]:
nrel.head()

Unnamed: 0,index,County,Sector Abbreviation,level_2,State Abbreviation,State FIPS,County FIPS,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,Full FIPS,Datetime,Real Generation,Energy Per Area
0,0,"Clark, NV",com,78840,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 00:00:00,0.0,0.0
1,1,"Clark, NV",com,78841,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 01:00:00,0.0,0.0
2,2,"Clark, NV",com,78842,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 02:00:00,0.0,0.0
3,3,"Clark, NV",com,78843,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 03:00:00,0.0,0.0
4,4,"Clark, NV",com,78844,NV,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 04:00:00,0.0,0.0


In [1813]:
nrel.drop(["index", "Real Generation"], axis="columns", inplace=True)

# com = nrel[nrel["Sector Abbreviation"] == "com"]
# res = nrel[nrel["Sector Abbreviation"] == "res"]
# ind = nrel[nrel["Sector Abbreviation"] == "ind"]
# ind

## Clean the Visual Crossing Data

In [1814]:
vc = pd.read_csv("hourly ten counties 2023-04-01 to 2025-04-09.csv")
vc.head()

Unnamed: 0,name,datetime,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,"Maricopa County, AZ",2023-04-01T00:00:00,51.5,51.5,29.4,42.42,0.0,0.0,,0.0,...,1019.3,5.7,8.5,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111"
1,"Maricopa County, AZ",2023-04-01T01:00:00,52.3,52.3,30.0,42.2,0.0,0.0,,0.0,...,1018.8,5.7,9.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111"
2,"Maricopa County, AZ",2023-04-01T02:00:00,50.5,50.5,30.0,45.05,0.0,0.0,,0.0,...,1018.6,3.2,9.7,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111"
3,"Maricopa County, AZ",2023-04-01T03:00:00,46.5,42.9,30.1,52.67,0.0,0.0,,0.0,...,1018.2,3.2,9.8,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111"
4,"Maricopa County, AZ",2023-04-01T04:00:00,45.9,42.5,30.1,53.86,0.0,0.0,,0.0,...,1017.7,3.2,9.7,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111"


In [1815]:
vc.columns.values

array(['name', 'datetime', 'temp', 'feelslike', 'dew', 'humidity',
       'precip', 'precipprob', 'preciptype', 'snow', 'snowdepth',
       'windgust', 'windspeed', 'winddir', 'sealevelpressure',
       'cloudcover', 'visibility', 'solarradiation', 'solarenergy',
       'uvindex', 'severerisk', 'conditions', 'icon', 'stations'],
      dtype=object)

In [1816]:
vc = vc.rename(columns={
    "name": "County",
    "datetime": "Datetime",
    "temp": "Temperature",
    "feelslike": "Feels Like",
    "dew": "Dew",
    "humidity": "Humidity",
    "precip": "Precipitation",
    "precipprob": "Precipitation Probability",
    "preciptype": "Precipitation Type",
    "snow": "Snow",
    "snowdepth": "Snow Depth",
    "windgust": "Wind Gust",
    "windspeed": "Wind Speed",
    "winddir": "Wind Direction",
    "sealevelpressure": "Sea Level Pressure",
    "cloudcover": "Cloud Cover",
    "visibility": "Visibility",
    "solarradiation": "Solar Radiation",
    "solarenergy": "Solar Energy",
    "uvindex": "UV Index",
    "severerisk": "Severe Risk",
    "conditions": "Conditions",
    "icon": "Icon",
    "stations": "Stations"})

In [1817]:
vc["Datetime"].dtype

dtype('O')

In [1818]:
vc["Datetime"] = pd.to_datetime(vc["Datetime"])

In [1819]:
print(vc["Datetime"].min())
print(vc["Datetime"].max())

2023-04-01 00:00:00
2025-04-08 23:00:00


In [1820]:
condition = vc["Datetime"].dt.year != 2024
vc = vc.drop(vc[condition].index)

In [1821]:
vc["County"].unique()

array(['Maricopa County, AZ', 'los angeles, Ca', 'Miami-Dade, FL',
       'Kings County, NY', 'King County, WA', 'Cook county, IL',
       'Clark county, NV', 'Harris County, TX', 'Dane County, WI',
       'Fairfax County, VA'], dtype=object)

In [1822]:
county_replacement = {
    "Maricopa County, AZ": "Maricopa, AZ",
    "los angeles, Ca": "Los Angeles, CA",
    "Miami-Dade, FL": "Dade, FL",
    "Kings County, NY": "Kings, NY",
    "King County, WA": "King, WA",
    "Cook county, IL": "Cook, IL",
    "Clark county, NV": "Clark, NV",
    "Harris County, TX": "Harris, TX",
    "Dane County, WI": "Dane, WI",
    "Fairfax County, VA": "Fairfax, VA"
}
vc["County"] = vc["County"].replace(county_replacement)

In [1823]:
print(vc["County"].unique())
print(len(vc["County"].unique()))

['Maricopa, AZ' 'Los Angeles, CA' 'Dade, FL' 'Kings, NY' 'King, WA'
 'Cook, IL' 'Clark, NV' 'Harris, TX' 'Dane, WI' 'Fairfax, VA']
10


In [1824]:
condition = vc["County"] == "Dade, FL"
vc = vc.drop(vc[condition].index)

In [1825]:
vc["State Abbreviation"] = vc["County"].apply(lambda x: x[-2:])

In [1826]:
# vc = vc[vc["County"] == county]

In [1827]:
vc["Datetime"].nunique()

8784

In [1828]:
vc["Datetime"].value_counts()

Datetime
2024-11-03 01:00:00    17
2024-12-31 08:00:00     9
2024-12-30 16:00:00     9
2024-01-01 00:00:00     9
2024-12-31 07:00:00     9
                       ..
2024-01-02 05:00:00     9
2024-01-02 06:00:00     9
2024-01-02 07:00:00     9
2024-01-01 17:00:00     9
2024-03-10 02:00:00     1
Name: count, Length: 8784, dtype: int64

In [1829]:
# vc = vc.drop_duplicates(subset=["County", "Datetime"])
# vc["Datetime"].value_counts()

In [1830]:
print(vc["Datetime"].min())
print(vc["Datetime"].max())

2024-01-01 00:00:00
2024-12-31 23:00:00


In [1831]:
vc[vc["Datetime"].dt.month == 2]

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
7344,"Maricopa, AZ",2024-02-01 00:00:00,65.6,65.6,46.8,50.66,0.0,0.0,,0.0,...,14.2,9.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
7345,"Maricopa, AZ",2024-02-01 01:00:00,63.8,63.8,46.7,53.76,0.0,0.0,,0.0,...,9.9,9.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
7346,"Maricopa, AZ",2024-02-01 02:00:00,62.3,62.3,46.5,56.25,0.0,0.0,,0.0,...,19.3,9.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
7347,"Maricopa, AZ",2024-02-01 03:00:00,59.4,59.4,46.5,62.21,0.0,0.0,,0.0,...,5.7,9.1,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
7348,"Maricopa, AZ",2024-02-01 04:00:00,56.5,56.5,46.4,68.99,0.0,0.0,,0.0,...,10.9,9.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167660,"Fairfax, VA",2024-02-29 19:00:00,43.7,39.4,10.6,25.58,0.0,0.0,,0.0,...,11.2,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
167661,"Fairfax, VA",2024-02-29 20:00:00,41.9,37.5,10.8,27.51,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
167662,"Fairfax, VA",2024-02-29 21:00:00,37.5,34.0,14.3,38.07,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
167663,"Fairfax, VA",2024-02-29 22:00:00,34.4,34.4,16.6,47.62,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA


2024 was a leap year. February had 29 days. The total hours in 2024 was 8784 instead of 8760. 

In [1832]:
vc = vc[~((vc["Datetime"].dt.month == 2) & (vc["Datetime"].dt.day == 29))]
assert vc[(vc["Datetime"].dt.month == 2) & (vc["Datetime"].dt.day == 29)].empty

In [1833]:
vc["Datetime"].value_counts()

Datetime
2024-11-03 01:00:00    17
2024-01-01 01:00:00     9
2024-01-01 02:00:00     9
2024-01-01 03:00:00     9
2024-01-01 04:00:00     9
                       ..
2024-12-31 05:00:00     9
2024-12-31 06:00:00     9
2024-12-31 07:00:00     9
2024-12-31 23:00:00     9
2024-03-10 02:00:00     1
Name: count, Length: 8760, dtype: int64

In [1834]:
vc.drop_duplicates(["Datetime", "County"], inplace=True)
vc

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
6600,"Maricopa, AZ",2024-01-01 00:00:00,44.4,44.4,39.4,82.43,0.000,0.0,,0.0,...,1.8,9.0,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
6601,"Maricopa, AZ",2024-01-01 01:00:00,44.0,41.1,39.2,83.12,0.000,0.0,,0.0,...,3.2,8.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
6602,"Maricopa, AZ",2024-01-01 02:00:00,41.4,38.8,39.0,91.00,0.000,0.0,,0.0,...,9.3,8.4,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
6603,"Maricopa, AZ",2024-01-01 03:00:00,42.5,38.8,39.0,87.23,0.000,0.0,,0.0,...,14.2,9.7,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
6604,"Maricopa, AZ",2024-01-01 04:00:00,42.6,39.4,39.2,87.64,0.000,0.0,,0.0,...,13.2,9.3,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175004,"Fairfax, VA",2024-12-31 19:00:00,48.8,46.6,47.0,93.55,0.095,100.0,rain,0.0,...,85.0,9.9,0.0,0.0,0,10,"Rain, Partially cloudy",rain,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
175005,"Fairfax, VA",2024-12-31 20:00:00,49.2,46.6,46.8,91.62,0.000,0.0,rain,0.0,...,75.0,9.9,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
175006,"Fairfax, VA",2024-12-31 21:00:00,48.1,48.1,46.0,92.62,0.000,0.0,rain,0.0,...,90.7,9.9,0.0,0.0,0,10,Overcast,cloudy,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
175007,"Fairfax, VA",2024-12-31 22:00:00,47.0,47.0,45.6,94.94,0.000,0.0,,0.0,...,93.7,8.7,0.0,0.0,0,10,Overcast,cloudy,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA


In [1835]:
vc[(vc["Datetime"].dt.month == 11) & (vc["Datetime"].dt.day == 3) & (vc["Datetime"].dt.hour == 1)]

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
13969,"Maricopa, AZ",2024-11-03 01:00:00,57.6,57.6,40.7,53.23,0.0,0.0,,0.0,...,3.2,9.9,0.0,0.0,0,3,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
31705,"Los Angeles, CA",2024-11-03 01:00:00,56.3,56.3,53.9,91.81,0.0,0.0,,0.0,...,13.1,8.1,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
67177,"Kings, NY",2024-11-03 01:00:00,47.6,44.0,31.7,53.86,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"72505394728,KJFK,KLGA,74486094789,F8726,KNYC,7...",NY
84913,"King, WA",2024-11-03 01:00:00,47.7,47.7,43.6,85.69,0.0,0.0,rain,0.0,...,100.0,9.9,0.0,0.0,0,10,Overcast,cloudy,"72793024233,KSEA,KBFI,72793524234,72793494248,...",WA
102649,"Cook, IL",2024-11-03 01:00:00,50.6,50.6,33.2,51.26,0.0,0.0,,0.0,...,79.3,9.9,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"72534014819,72212604879,KORD,KMDW,72530094846,...",IL
120385,"Clark, NV",2024-11-03 01:00:00,56.1,56.1,32.5,40.69,0.0,0.0,,0.0,...,7.7,9.9,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
138121,"Harris, TX",2024-11-03 01:00:00,74.1,74.1,73.3,97.24,0.0,0.0,,0.0,...,91.6,5.4,0.0,0.0,0,10,Overcast,cloudy,"72059400188,KIAH,KMCJ,72244012918,72243012960,...",TX
155857,"Dane, WI",2024-11-03 01:00:00,48.2,45.3,40.6,74.89,0.0,0.0,,0.0,...,100.0,9.9,0.0,0.0,0,10,Overcast,cloudy,"F3620,72641014837,KC29,99999900236,KMSN",WI
173593,"Fairfax, VA",2024-11-03 01:00:00,39.6,39.6,35.1,83.8,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA


In [1836]:
vc[(pd.to_datetime("2024-11-02 23:00:00") <= vc["Datetime"]) & (vc["Datetime"] <= pd.to_datetime("2024-11-04")) & (vc["County"] != "Maricopa, AZ")]

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
31703,"Los Angeles, CA",2024-11-02 23:00:00,58.5,58.5,55.3,89.06,0.0,0.0,,0.0,...,35.8,8.8,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31704,"Los Angeles, CA",2024-11-03 00:00:00,57.6,57.6,55.3,92.05,0.0,0.0,,0.0,...,42.2,8.6,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31705,"Los Angeles, CA",2024-11-03 01:00:00,56.3,56.3,53.9,91.81,0.0,0.0,,0.0,...,13.1,8.1,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31707,"Los Angeles, CA",2024-11-03 02:00:00,54.8,54.8,53.0,93.51,0.0,0.0,,0.0,...,0.0,8.5,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31708,"Los Angeles, CA",2024-11-03 03:00:00,55.3,55.3,50.4,83.34,0.0,0.0,,0.0,...,23.8,9.5,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173613,"Fairfax, VA",2024-11-03 20:00:00,47.1,45.4,29.2,49.56,0.0,0.0,,0.0,...,11.2,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
173614,"Fairfax, VA",2024-11-03 21:00:00,44.2,44.2,29.2,55.45,0.0,0.0,,0.0,...,11.2,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
173615,"Fairfax, VA",2024-11-03 22:00:00,39.7,39.7,28.8,64.92,0.0,0.0,,0.0,...,11.2,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,72403793728,KDAA,...",VA
173616,"Fairfax, VA",2024-11-03 23:00:00,37.2,37.2,28.3,69.90,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"72403603710,KIAD,72403093738,F6547,KHEF",VA


In [1837]:
vc[(vc["County"] == "Los Angeles, CA") & (vc["Datetime"].dt.month == 11) & (vc["Datetime"].dt.day == 3)]

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
31704,"Los Angeles, CA",2024-11-03 00:00:00,57.6,57.6,55.3,92.05,0.0,0.0,,0.0,...,42.2,8.6,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31705,"Los Angeles, CA",2024-11-03 01:00:00,56.3,56.3,53.9,91.81,0.0,0.0,,0.0,...,13.1,8.1,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31707,"Los Angeles, CA",2024-11-03 02:00:00,54.8,54.8,53.0,93.51,0.0,0.0,,0.0,...,0.0,8.5,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31708,"Los Angeles, CA",2024-11-03 03:00:00,55.3,55.3,50.4,83.34,0.0,0.0,,0.0,...,23.8,9.5,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHHR,72295023174,KBUR,KSMO,72295603167,F1624,7...",CA
31709,"Los Angeles, CA",2024-11-03 04:00:00,56.5,56.5,46.7,69.61,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,SE428,KSMO,72295603167,7...",CA
31710,"Los Angeles, CA",2024-11-03 05:00:00,58.2,58.2,42.5,55.72,0.0,0.0,,0.0,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,SE428,KSMO,72295603167,7...",CA
31711,"Los Angeles, CA",2024-11-03 06:00:00,58.2,58.2,42.4,55.43,0.0,0.0,,0.0,...,4.8,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,SE428,KSMO,72295603167,7...",CA
31712,"Los Angeles, CA",2024-11-03 07:00:00,58.2,58.2,45.0,61.45,0.0,0.0,,0.0,...,16.3,9.2,9.0,0.0,0,10,Clear,clear-day,"KHHR,72295023174,KBUR,KSMO,72295603167,7228802...",CA
31713,"Los Angeles, CA",2024-11-03 08:00:00,62.7,62.7,40.2,43.52,0.0,0.0,,0.0,...,4.8,9.9,153.0,0.6,2,10,Clear,clear-day,"KHHR,72295023174,KBUR,KSMO,72295603167,7228802...",CA
31714,"Los Angeles, CA",2024-11-03 09:00:00,64.8,64.8,39.6,39.43,0.0,0.0,,0.0,...,4.8,9.9,382.0,1.4,4,10,Clear,clear-day,"KHHR,72295023174,KBUR,KSMO,72295603167,7228802...",CA


In [1838]:
print(len(vc[vc["County"] == "Clark, NV"]))

8759


In [1839]:
def find_missing_hour(group):
    expected_hours = set(pd.date_range(start="2024-01-01", periods=8760, freq="h"))
    expected_hours = {dt for dt in expected_hours if not (dt.month == 2 and dt.day == 29)}
    actual_hours = set(group["Datetime"])
    missing_hours = expected_hours - actual_hours
    return missing_hours

missing_hours = vc.groupby("County").apply(find_missing_hour, include_groups=False)
print(missing_hours)

County
Clark, NV          {2024-03-10 02:00:00}
Cook, IL           {2024-03-10 02:00:00}
Dane, WI           {2024-03-10 02:00:00}
Fairfax, VA        {2024-03-10 02:00:00}
Harris, TX         {2024-03-10 02:00:00}
King, WA           {2024-03-10 02:00:00}
Kings, NY          {2024-03-10 02:00:00}
Los Angeles, CA    {2024-03-10 02:00:00}
Maricopa, AZ                          {}
dtype: object


In [1840]:
vc[(vc["Datetime"].dt.month == 3) & (vc["Datetime"].dt.day == 10) & (vc["County"] == "Los Angeles, CA")]

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
25993,"Los Angeles, CA",2024-03-10 00:00:00,56.1,56.1,49.8,79.29,0.0,0.0,,0.0,...,2.9,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,72295603167,F1624,722880...",CA
25994,"Los Angeles, CA",2024-03-10 01:00:00,55.0,55.0,48.5,78.64,0.0,0.0,,0.0,...,1.8,9.4,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,72295603167,F1624,722880...",CA
25995,"Los Angeles, CA",2024-03-10 03:00:00,53.4,53.4,47.3,79.86,0.0,0.0,,0.0,...,2.9,9.8,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,72295603167,F1624,722880...",CA
25996,"Los Angeles, CA",2024-03-10 04:00:00,52.4,52.4,44.5,74.33,0.0,0.0,,0.0,...,1.8,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,72295603167,F1624,722880...",CA
25997,"Los Angeles, CA",2024-03-10 05:00:00,51.5,51.5,43.3,73.48,0.0,0.0,,0.0,...,1.8,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,72295603167,F1624,722880...",CA
25998,"Los Angeles, CA",2024-03-10 06:00:00,50.8,50.8,42.8,73.96,0.0,0.0,,0.0,...,1.8,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,AT259,72295603167,722880...",CA
25999,"Los Angeles, CA",2024-03-10 07:00:00,50.4,50.4,41.9,72.67,0.0,0.0,,0.0,...,2.9,9.9,0.0,0.0,0,10,Clear,clear-night,"KHHR,72295023174,KBUR,AT259,72295603167,722880...",CA
26000,"Los Angeles, CA",2024-03-10 08:00:00,51.3,51.3,42.0,70.39,0.0,0.0,,0.0,...,2.9,9.9,41.0,0.1,0,10,Clear,clear-day,"KHHR,72295023174,KBUR,72295603167,72288023152,...",CA
26001,"Los Angeles, CA",2024-03-10 09:00:00,56.5,56.5,42.7,59.93,0.0,0.0,,0.0,...,5.3,9.9,214.0,0.8,2,10,Clear,clear-day,"KHHR,72295023174,KBUR,72295603167,72288023152,...",CA
26002,"Los Angeles, CA",2024-03-10 10:00:00,61.7,61.7,42.7,49.62,0.0,0.0,,0.0,...,5.3,9.9,453.0,1.6,5,10,Clear,clear-day,"KHHR,72295023174,KBUR,72295603167,72288023152,...",CA


In [1841]:
# Fill in missing hours
def fill_missing_hours(group):
    missing_hours = find_missing_hour(group)
    for missing_hour in sorted(missing_hours):
        previous_row = group[group["Datetime"] < missing_hour].iloc[-1]
        new_row = previous_row.copy()
        new_row["Datetime"] = missing_hour
        group = pd.concat([group, new_row.to_frame().T], ignore_index=True)
    return group.sort_values(by="Datetime").reset_index(drop=True)

vc = vc.groupby("County").apply(fill_missing_hours, include_groups=False).reset_index(drop=False)
vc.drop("level_1", axis="columns", inplace=True)

In [1842]:
def assert_county_length(group):
    assert len(group) == 8760, f"County {group.County} does not have 8760 rows"
    return group

vc.groupby("County").apply(assert_county_length, include_groups=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,Snow Depth,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"Clark, NV",0,2024-01-01 00:00:00,48.5,46.2,35.4,60.45,0.0,0.0,,0.0,0.0,...,93.3,9.8,0.0,0.0,0,10,Overcast,cloudy,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
"Clark, NV",1,2024-01-01 01:00:00,46.8,44.3,35.1,63.62,0.0,0.0,,0.0,0.0,...,92.1,9.8,0.0,0.0,0,10,Overcast,cloudy,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
"Clark, NV",2,2024-01-01 02:00:00,46.1,46.1,34.4,63.65,0.0,0.0,,0.0,0.0,...,56.3,9.3,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
"Clark, NV",3,2024-01-01 03:00:00,44.0,39.9,34.0,67.62,0.0,0.0,,0.0,0.0,...,16.5,9.8,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
"Clark, NV",4,2024-01-01 04:00:00,42.4,38.8,34.0,71.77,0.0,0.0,,0.0,0.0,...,14.0,9.9,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Maricopa, AZ",78835,2024-12-31 19:00:00,56.9,56.9,30.1,35.91,0.0,0.0,,0.0,0.0,...,3.2,9.8,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
"Maricopa, AZ",78836,2024-12-31 20:00:00,54.8,54.8,28.9,36.79,0.0,0.0,,0.0,0.0,...,1.8,9.1,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
"Maricopa, AZ",78837,2024-12-31 21:00:00,54.0,54.0,26.3,34.04,0.0,0.0,,0.0,0.0,...,0.0,9.6,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
"Maricopa, AZ",78838,2024-12-31 22:00:00,51.2,51.2,27.8,40.22,0.0,0.0,,0.0,0.0,...,0.0,9.6,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ


Solar Energy is measured in MJ/(m^2).

In [1843]:
# Convert solarenergy from MJ/m^2 to kWh/m^2
# 1 kWh = 3.6 MJ
# vc["Solar Energy"] = vc["Solar Energy"].apply(lambda x: x / 3.6)
vc["Solar Energy"] = vc["Solar Energy"] / 3.6

In [1844]:
vc.head()

Unnamed: 0,County,Datetime,Temperature,Feels Like,Dew,Humidity,Precipitation,Precipitation Probability,Precipitation Type,Snow,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
0,"Clark, NV",2024-01-01 00:00:00,48.5,46.2,35.4,60.45,0.0,0.0,,0.0,...,93.3,9.8,0.0,0.0,0,10,Overcast,cloudy,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
1,"Clark, NV",2024-01-01 01:00:00,46.8,44.3,35.1,63.62,0.0,0.0,,0.0,...,92.1,9.8,0.0,0.0,0,10,Overcast,cloudy,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
2,"Clark, NV",2024-01-01 02:00:00,46.1,46.1,34.4,63.65,0.0,0.0,,0.0,...,56.3,9.3,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
3,"Clark, NV",2024-01-01 03:00:00,44.0,39.9,34.0,67.62,0.0,0.0,,0.0,...,16.5,9.8,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
4,"Clark, NV",2024-01-01 04:00:00,42.4,38.8,34.0,71.77,0.0,0.0,,0.0,...,14.0,9.9,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV


## Join the NREL and VC Data

In [1845]:
nrel.drop(["level_2", "State Abbreviation"], axis="columns", inplace=True)
nrel.head()

Unnamed: 0,County,Sector Abbreviation,State FIPS,County FIPS,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,Full FIPS,Datetime,Energy Per Area
0,"Clark, NV",com,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 00:00:00,0.0
1,"Clark, NV",com,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 01:00:00,0.0
2,"Clark, NV",com,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 02:00:00,0.0
3,"Clark, NV",com,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 03:00:00,0.0
4,"Clark, NV",com,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-01-01 04:00:00,0.0


In [1846]:
print(nrel["Datetime"].dtype)
print(vc["Datetime"].dtype)

datetime64[ns]
object


In [1847]:
vc["Datetime"] = pd.to_datetime(vc["Datetime"])

In [1848]:
df = pd.merge(nrel, vc, on=["County", "Datetime"])
df

Unnamed: 0,County,Sector Abbreviation,State FIPS,County FIPS,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,...,Cloud Cover,Visibility,Solar Radiation,Solar Energy,UV Index,Severe Risk,Conditions,Icon,Stations,State Abbreviation
0,"Clark, NV",com,32,003,0.0598,1.0,11746.0,11746.0,0,10000,...,93.3,9.8,0.0,0.0,0,10,Overcast,cloudy,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
1,"Clark, NV",com,32,003,0.0598,1.0,11746.0,11746.0,0,10000,...,92.1,9.8,0.0,0.0,0,10,Overcast,cloudy,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
2,"Clark, NV",com,32,003,0.0598,1.0,11746.0,11746.0,0,10000,...,56.3,9.3,0.0,0.0,0,10,Partially cloudy,partly-cloudy-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
3,"Clark, NV",com,32,003,0.0598,1.0,11746.0,11746.0,0,10000,...,16.5,9.8,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
4,"Clark, NV",com,32,003,0.0598,1.0,11746.0,11746.0,0,10000,...,14.0,9.9,0.0,0.0,0,10,Clear,clear-night,"KHND,KLAS,D4394,72386023169,KVGT,72484653123,7...",NV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235867,"Maricopa, AZ",res,04,013,0.0965,1.0,392.0,392.0,0,10000,...,0.0,9.9,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
235868,"Maricopa, AZ",res,04,013,0.0965,1.0,392.0,392.0,0,10000,...,0.0,9.1,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
235869,"Maricopa, AZ",res,04,013,0.0965,1.0,392.0,392.0,0,10000,...,0.0,9.7,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ
235870,"Maricopa, AZ",res,04,013,0.0965,1.0,392.0,392.0,0,10000,...,17.4,9.8,0.0,0.0,0,10,Clear,clear-night,"KLUF,72278023183,KPHX,KGXF,KBXK,72278523111,SRP39",AZ


In [1849]:
df.columns.values

array(['County', 'Sector Abbreviation', 'State FIPS', 'County FIPS',
       'Energy Value US Dollars Per kWh',
       'Percent Customers With Nonzero System Size',
       'Average Roof Sqft Customers With Nonzero System Size',
       'Average Root Sqft Total', 'Hourly Capacity Factor',
       'Hourly Capacity Factor Scalar', 'Full FIPS', 'Datetime',
       'Energy Per Area', 'Temperature', 'Feels Like', 'Dew', 'Humidity',
       'Precipitation', 'Precipitation Probability', 'Precipitation Type',
       'Snow', 'Snow Depth', 'Wind Gust', 'Wind Speed', 'Wind Direction',
       'Sea Level Pressure', 'Cloud Cover', 'Visibility',
       'Solar Radiation', 'Solar Energy', 'UV Index', 'Severe Risk',
       'Conditions', 'Icon', 'Stations', 'State Abbreviation'],
      dtype=object)

In [1850]:
df.to_csv("data.csv")

Calculate R^2

In [1851]:
# Calculate the mean of the actual values
mean_actual = vc["Solar Energy"].mean()
print(mean_actual)

0.14194817915326277


In [1852]:
# Calculate the total sum of squares
# tss = ((vc["Solar Energy"] - mean_actual) ** 2).sum()
tss = np.sum((vc["Solar Energy"] - mean_actual) ** 2)
print(tss)

4567.565356923993


In [1853]:
nrel = nrel.explode("Energy Per Area")
nrel.sample(5)

Unnamed: 0,County,Sector Abbreviation,State FIPS,County FIPS,Energy Value US Dollars Per kWh,Percent Customers With Nonzero System Size,Average Roof Sqft Customers With Nonzero System Size,Average Root Sqft Total,Hourly Capacity Factor,Hourly Capacity Factor Scalar,Full FIPS,Datetime,Energy Per Area
13469,"Clark, NV",ind,32,3,0.0623,0.9,12344.0,11746.0,207,10000,32003,2024-07-15 05:00:00,0.003312
19918,"Clark, NV",res,32,3,0.1057,1.0,285.0,285.0,0,10000,32003,2024-04-09 22:00:00,0.0
1434,"Clark, NV",com,32,3,0.0598,1.0,11746.0,11746.0,0,10000,32003,2024-02-29 18:00:00,0.0
81548,"Fairfax, VA",com,51,59,,0.0,,6689.0,0,10000,51059,2024-04-22 20:00:00,0.0
178828,"Kings, NY",res,36,47,0.091,1.0,130.0,130.0,0,10000,36047,2024-05-31 04:00:00,0.0


In [1854]:
nrel.drop(["Real Generation"], axis="columns", inplace=True)
nrel.sample(5)

KeyError: "['Real Generation'] not found in axis"

In [None]:
nrel["Energy Per Area"] = nrel["Energy Per Area"].astype("float64")

In [None]:
nrel = nrel[nrel["County"] == county]
len(nrel)

26280

In [None]:
print(nrel["Energy Per Area"].mean())

0.024788420091324204


In [None]:
print(vc["Solar Energy"].dtype)
print(nrel["Energy Per Area"].dtype)

float64
float64


In [None]:
assert vc["Solar Energy"].isna().sum() == 0
assert nrel["Energy Per Area"].isna().sum() == 0
print(len(vc))
print(len(nrel))
assert len(vc) == len(nrel)
assert vc["Solar Energy"].dtype == nrel["Energy Per Area"].dtype

8760
26280


AssertionError: 

In [None]:
# Calculate the residual sum of squares
# nrel["Residuals"] = vc["Solar Energy"] - nrel["Energy Per Area"]
# nrel.sample(10)
# print(type(vc))
# print(type(vc["Solar Energy"]))
# print(len(vc))
# print(vc.index.duplicated())
# print(vc.index)
# print(type(nrel))
# print(type(nrel["Energy Per Area"]))
# print(len(nrel))
# print(nrel.index.duplicated())
# print(nrel.index)
print(vc["Solar Energy"].iloc[515])
print(nrel["Energy Per Area"].iloc[515])
nrel = nrel.reset_index(drop=True)
vc = vc.reset_index(drop=True)
nrel["Residuals"] = vc["Solar Energy"] - nrel["Energy Per Area"]
rss = np.sum((nrel["Residuals"]) ** 2)
print(rss)

0.3333333333333333
0.12073600000000001
843.9993742139575


Calculate R squared

In [None]:
r2 = 1 - rss / tss
print(r2)

-0.15294961464730572
