Relative Humidity data is collected from Met Office: https://www.metoffice.gov.uk/hadobs/hadisd/

This dataset contains weather data from all counties from 1937 to 2020. The code to extract the US hourly climate data can be found in our group's github: https://github.com/CrivelliLab/Geospatial_Analsyis

I will upload the code how to convert hourly relative humidity data to daily relative humidity data to GitHub, but I will not talk about this part in my demo

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as ddf
from pandas import Series, DataFrame

### we need to find the county adjacent data first
the data can be download here: https://www2.census.gov/geo/docs/reference/county_adjacency/county_adjacency2010.txt

In [4]:
county_adjacency = pd.read_csv('county_adjacency.txt', sep='\t', dtype = {'01001': object, '01001.1': 'object'})
county_adjacency

Unnamed: 0,"Autauga County, AL",01001,"Autauga County, AL.1",01001.1
0,"Autauga County, AL",01001,"Autauga County, AL",01001
1,,,"Chilton County, AL",01021
2,,,"Dallas County, AL",01047
3,,,"Elmore County, AL",01051
4,,,"Lowndes County, AL",01085
...,...,...,...,...
22195,"St. Croix Island, VI",78010,"St. Croix Island, VI",78010
22196,"St. John Island, VI",78020,"St. John Island, VI",78020
22197,,,"St. Thomas Island, VI",78030
22198,"St. Thomas Island, VI",78030,"St. John Island, VI",78020


In [5]:
county_adjacency = county_adjacency.rename(columns = {'Autauga County, AL': 'county', '01001': 'fips',  'Autauga County, AL.1': 'Neighbors', 
                                                      '01001.1':'Neighbor Code'})
county_adjacency

Unnamed: 0,county,fips,Neighbors,Neighbor Code
0,"Autauga County, AL",01001,"Autauga County, AL",01001
1,,,"Chilton County, AL",01021
2,,,"Dallas County, AL",01047
3,,,"Elmore County, AL",01051
4,,,"Lowndes County, AL",01085
...,...,...,...,...
22195,"St. Croix Island, VI",78010,"St. Croix Island, VI",78010
22196,"St. John Island, VI",78020,"St. John Island, VI",78020
22197,,,"St. Thomas Island, VI",78030
22198,"St. Thomas Island, VI",78030,"St. John Island, VI",78020


In [6]:
county_list = county_adjacency['county'].tolist()
county_list[0:50]

['Autauga County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 'Baldwin County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Barbour County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Bibb County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Blount County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Bullock County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 'Butler County, AL',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'Calhoun County, AL']

In [9]:
# find the index that the county information is not NA
index_list = []
for i in range(len(county_list)):
    if pd.isna(county_list[i]) == False:
        index_list.append(i)
        
index_list[0:10]

[0, 6, 13, 22, 29, 36, 42, 49, 55, 61]

In [10]:
def fillNullCounty(index_list, county_list):
    for i in range(len(index_list)-1):
        k = index_list[i+1] - index_list[i] -1
        for j in range(k):
            county_list[index_list[i]+j+1] = county_list[index_list[i]]
        
    return county_list

In [11]:
county_list_filled = fillNullCounty(index_list, county_list)

In [12]:
county_list_filled[:50]

['Autauga County, AL',
 'Autauga County, AL',
 'Autauga County, AL',
 'Autauga County, AL',
 'Autauga County, AL',
 'Autauga County, AL',
 'Baldwin County, AL',
 'Baldwin County, AL',
 'Baldwin County, AL',
 'Baldwin County, AL',
 'Baldwin County, AL',
 'Baldwin County, AL',
 'Baldwin County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Barbour County, AL',
 'Bibb County, AL',
 'Bibb County, AL',
 'Bibb County, AL',
 'Bibb County, AL',
 'Bibb County, AL',
 'Bibb County, AL',
 'Bibb County, AL',
 'Blount County, AL',
 'Blount County, AL',
 'Blount County, AL',
 'Blount County, AL',
 'Blount County, AL',
 'Blount County, AL',
 'Blount County, AL',
 'Bullock County, AL',
 'Bullock County, AL',
 'Bullock County, AL',
 'Bullock County, AL',
 'Bullock County, AL',
 'Bullock County, AL',
 'Butler County, AL',
 'Butler County, AL',
 'Butler County, A

### we can apply the same process to 'fips' column

In [13]:
fips_list = county_list = county_adjacency['fips'].tolist()
fips_list[:50]

['01001',
 nan,
 nan,
 nan,
 nan,
 nan,
 '01003',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 '01005',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 '01007',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 '01009',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 '01011',
 nan,
 nan,
 nan,
 nan,
 nan,
 '01013',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 '01015']

In [14]:
fips_list_filled = fillNullCounty(index_list, fips_list)
fips_list_filled[:50]

['01001',
 '01001',
 '01001',
 '01001',
 '01001',
 '01001',
 '01003',
 '01003',
 '01003',
 '01003',
 '01003',
 '01003',
 '01003',
 '01005',
 '01005',
 '01005',
 '01005',
 '01005',
 '01005',
 '01005',
 '01005',
 '01005',
 '01007',
 '01007',
 '01007',
 '01007',
 '01007',
 '01007',
 '01007',
 '01009',
 '01009',
 '01009',
 '01009',
 '01009',
 '01009',
 '01009',
 '01011',
 '01011',
 '01011',
 '01011',
 '01011',
 '01011',
 '01013',
 '01013',
 '01013',
 '01013',
 '01013',
 '01013',
 '01013',
 '01015']

#### Create a new county adjacent dataframe

In [15]:
neighbor_county = county_adjacency['Neighbors'].tolist()
neighbor_code = county_adjacency['Neighbor Code'].tolist()
data = {'county': county_list_filled, 'fips': fips_list_filled, 'Neighbors': neighbor_county, 'Neighbor Code': neighbor_code}
county_adjacent =pd.DataFrame(data)
county_adjacent

Unnamed: 0,county,fips,Neighbors,Neighbor Code
0,"Autauga County, AL",01001,"Autauga County, AL",01001
1,"Autauga County, AL",01001,"Chilton County, AL",01021
2,"Autauga County, AL",01001,"Dallas County, AL",01047
3,"Autauga County, AL",01001,"Elmore County, AL",01051
4,"Autauga County, AL",01001,"Lowndes County, AL",01085
...,...,...,...,...
22195,"St. Croix Island, VI",78010,"St. Croix Island, VI",78010
22196,"St. John Island, VI",78020,"St. John Island, VI",78020
22197,"St. John Island, VI",78020,"St. Thomas Island, VI",78030
22198,"St. Thomas Island, VI",78030,"St. John Island, VI",78020


#### create a list of adjacent county list for each county

In [16]:
def str_cat(x):
    return x.str.cat(sep=', ')

In [17]:
county_adjacent = county_adjacent.groupby(['county', 'fips']).agg({'Neighbors': str_cat, 'Neighbor Code': str_cat})
county_adjacent = county_adjacent.reset_index()
county_adjacent

Unnamed: 0,county,fips,Neighbors,Neighbor Code
0,"Abbeville County, SC",45001,"Elbert County, GA, Abbeville County, SC, Ander...","13105, 45001, 45007, 45045, 45047, 45059, 45065"
1,"Acadia Parish, LA",22001,"Acadia Parish, LA, Evangeline Parish, LA, Jeff...","22001, 22039, 22053, 22055, 22097, 22113"
2,"Accomack County, VA",51001,"Somerset County, MD, Worcester County, MD, Acc...","24039, 24047, 51001, 51103, 51115, 51119, 5113..."
3,"Ada County, ID",16001,"Ada County, ID, Boise County, ID, Canyon Count...","16001, 16015, 16027, 16039, 16045, 16073"
4,"Adair County, IA",19001,"Adair County, IA, Adams County, IA, Audubon Co...","19001, 19003, 19009, 19029, 19049, 19077, 1912..."
...,...,...,...,...
3228,"Yuma County, AZ",04027,"La Paz County, AZ, Maricopa County, AZ, Pima C...","04012, 04013, 04019, 04027, 06025"
3229,"Yuma County, CO",08125,"Kit Carson County, CO, Logan County, CO, Phill...","08063, 08075, 08095, 08121, 08125, 20023, 3102..."
3230,"Zapata County, TX",48505,"Jim Hogg County, TX, Starr County, TX, Webb Co...","48247, 48427, 48479, 48505"
3231,"Zavala County, TX",48507,"Dimmit County, TX, Frio County, TX, Kinney Cou...","48127, 48163, 48271, 48323, 48325, 48463, 48507"


we can save this dataframe for future use

In [18]:
county_adjacent.to_csv('county_adjacency.csv')

### Uplaod the relative humidity data

In [19]:
year = '2015'

In [28]:
climate = ddf.read_csv(r'relative_humidity_data/RH_daily_' + str(year) + '.csv', dtype={'fips': 'object', 'year': 'object'}).compute().drop(columns={'Unnamed: 0'})
climate.head()

Unnamed: 0,year,month,day,longitude,latitude,coor,fips,RH_mean,RH_min,RH_max
0,2015,1,1,-124.16,40.81,"(40.81, -124.16)",6023,75.231653,48.314774,91.240265
1,2015,1,1,-123.2,39.128,"(39.128, -123.2)",6033,66.043233,25.072014,87.442169
2,2015,1,1,-123.2,39.128,"(39.128, -123.2)",6045,66.043233,25.072014,87.442169
3,2015,1,1,-121.817,37.7,"(37.7, -121.817)",6001,35.266683,9.924187,59.448555
4,2015,1,1,-120.709,39.276,"(39.276, -120.709)",6017,13.307584,5.550323,24.820023


In [29]:
# combine the year, month, day information together
climate['date'] = pd.to_datetime(climate[['year', 'month', 'day']])
climate.head()

Unnamed: 0,year,month,day,longitude,latitude,coor,fips,RH_mean,RH_min,RH_max,date
0,2015,1,1,-124.16,40.81,"(40.81, -124.16)",6023,75.231653,48.314774,91.240265,2015-01-01
1,2015,1,1,-123.2,39.128,"(39.128, -123.2)",6033,66.043233,25.072014,87.442169,2015-01-01
2,2015,1,1,-123.2,39.128,"(39.128, -123.2)",6045,66.043233,25.072014,87.442169,2015-01-01
3,2015,1,1,-121.817,37.7,"(37.7, -121.817)",6001,35.266683,9.924187,59.448555,2015-01-01
4,2015,1,1,-120.709,39.276,"(39.276, -120.709)",6017,13.307584,5.550323,24.820023,2015-01-01


In [35]:
len(climate['fips'].unique())

1775

#### only 1775 counties have relative humidity data in 1975, so we need to do data imputation

### for daily climate data, we have to impute the data every day to create a full data map every day 

In [37]:
date_list = list(climate['date'].unique())
date_list[:10]

[Timestamp('2015-01-01 00:00:00'),
 Timestamp('2015-01-02 00:00:00'),
 Timestamp('2015-01-03 00:00:00'),
 Timestamp('2015-01-04 00:00:00'),
 Timestamp('2015-01-05 00:00:00'),
 Timestamp('2015-01-06 00:00:00'),
 Timestamp('2015-01-07 00:00:00'),
 Timestamp('2015-01-08 00:00:00'),
 Timestamp('2015-01-09 00:00:00'),
 Timestamp('2015-01-10 00:00:00')]

In [31]:
# merge the climate data with county_adjacent dataframe every day
date_collection = []
for date_item in date_list:
    df = climate[climate['date'] == date_item]
    df_new = county_adjacent.merge(df, on = ['fips'], how = 'left') # here we have to use 'left' to keep the null values
    df_new['date'] = date_item
    date_collection.append(df_new)

In [32]:
county_date = pd.concat(date_collection)
county_date

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
0,"Abbeville County, SC",45001,"Elbert County, GA, Abbeville County, SC, Ander...","13105, 45001, 45007, 45045, 45047, 45059, 45065",2015,1.0,1.0,-82.153,34.254,"(34.254, -82.153)",74.131870,35.016182,95.092339,2015-01-01
1,"Acadia Parish, LA",22001,"Acadia Parish, LA, Evangeline Parish, LA, Jeff...","22001, 22039, 22053, 22055, 22097, 22113",2015,1.0,1.0,-91.990,30.199,"(30.199, -91.99)",69.395964,53.901741,79.525787,2015-01-01
2,"Accomack County, VA",51001,"Somerset County, MD, Worcester County, MD, Acc...","24039, 24047, 51001, 51103, 51115, 51119, 5113...",,,,,,,,,,2015-01-01
3,"Ada County, ID",16001,"Ada County, ID, Boise County, ID, Canyon Count...","16001, 16015, 16027, 16039, 16045, 16073",2015,1.0,1.0,-116.241,43.567,"(43.567, -116.241)",79.331573,70.576622,86.520767,2015-01-01
4,"Adair County, IA",19001,"Adair County, IA, Adams County, IA, Audubon Co...","19001, 19003, 19009, 19029, 19049, 19077, 1912...",,,,,,,,,,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,"Yuma County, AZ",04027,"La Paz County, AZ, Maricopa County, AZ, Pima C...","04012, 04013, 04019, 04027, 06025",,,,,,,,,,2015-12-31
3229,"Yuma County, CO",08125,"Kit Carson County, CO, Logan County, CO, Phill...","08063, 08075, 08095, 08121, 08125, 20023, 3102...",2015,12.0,31.0,-102.282,39.241,"(39.241, -102.282)",58.294590,25.593163,78.518997,2015-12-31
3230,"Zapata County, TX",48505,"Jim Hogg County, TX, Starr County, TX, Webb Co...","48247, 48427, 48479, 48505",,,,,,,,,,2015-12-31
3231,"Zavala County, TX",48507,"Dimmit County, TX, Frio County, TX, Kinney Cou...","48127, 48163, 48271, 48323, 48325, 48463, 48507",,,,,,,,,,2015-12-31


In [33]:
# create a dictionary to hold the fips and relative humidity data
def findDictionary(df, column_name):
    list_climate = df[column_name].tolist()
    fipsList = df['fips'].tolist()
    dict1 = {}
    for i, j in zip(fipsList, list_climate):
        dict1[i] = j
        
    return dict1

In [38]:
column_name = 'RH_mean'

In [41]:
dict1 = findDictionary(county_date, column_name)

In [44]:
# check the first 10 items in this dictionary
for index, item in enumerate(dict1.items()):
    if index == 10:
        break
    print(item)

('45001', 93.930663335891)
('22001', 72.1567090905231)
('51001', nan)
('16001', 75.128662109375)
('19001', nan)
('21001', nan)
('29001', 76.19796520730723)
('40001', nan)
('08001', 67.51653639475505)
('19003', nan)


In [45]:
def dataImputation1(column_value, fips, neighbor):
    if pd.isna(column_value) == True:
        neighborList = neighbor.split(', ')
        rateSum = 0
        k = 0
        for code in neighborList:
            if code in dict1:
                if pd.isna(dict1[code]) == False:
                    k = k + 1
                    rateSum = rateSum + dict1[code]
        if k != 0: 
            mean_rate = rateSum/k
            return mean_rate
    else:
        return column_value

In [46]:
county_date[column_name] = county_date.apply(lambda x: dataImputation1(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)
county_date.head()

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
0,"Abbeville County, SC",45001,"Elbert County, GA, Abbeville County, SC, Ander...","13105, 45001, 45007, 45045, 45047, 45059, 45065",2015.0,1.0,1.0,-82.153,34.254,"(34.254, -82.153)",74.13187,35.016182,95.092339,2015-01-01
1,"Acadia Parish, LA",22001,"Acadia Parish, LA, Evangeline Parish, LA, Jeff...","22001, 22039, 22053, 22055, 22097, 22113",2015.0,1.0,1.0,-91.99,30.199,"(30.199, -91.99)",69.395964,53.901741,79.525787,2015-01-01
2,"Accomack County, VA",51001,"Somerset County, MD, Worcester County, MD, Acc...","24039, 24047, 51001, 51103, 51115, 51119, 5113...",,,,,,,90.634712,,,2015-01-01
3,"Ada County, ID",16001,"Ada County, ID, Boise County, ID, Canyon Count...","16001, 16015, 16027, 16039, 16045, 16073",2015.0,1.0,1.0,-116.241,43.567,"(43.567, -116.241)",79.331573,70.576622,86.520767,2015-01-01
4,"Adair County, IA",19001,"Adair County, IA, Adams County, IA, Audubon Co...","19001, 19003, 19009, 19029, 19049, 19077, 1912...",,,,,,,69.838634,,,2015-01-01


In [47]:
len(county_date['fips'].unique())

3233

In [48]:
df_missing = county_date[pd.isna(county_date[column_name]) == True]
df_missing

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
9,"Adams County, IA",19003,"Adair County, IA, Adams County, IA, Cass Count...","19001, 19003, 19029, 19137, 19145, 19159, 1917...",,,,,,,,,,2015-01-01
13,"Adams County, MS",28001,"Concordia Parish, LA, Tensas Parish, LA, Adams...","22029, 22107, 28001, 28037, 28063, 28157",,,,,,,,,,2015-01-01
21,"Adjuntas Municipio, PR",72001,"Adjuntas Municipio, PR, Guayanilla Municipio, ...","72001, 72059, 72081, 72111, 72113, 72141, 72153",,,,,,,,,,2015-01-01
22,"Aguada Municipio, PR",72003,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72011, 72099, 72117",,,,,,,,,,2015-01-01
23,"Aguadilla Municipio, PR",72005,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72071, 72099",,,,,,,,,,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3202,"Wythe County, VA",51197,"Bland County, VA, Carroll County, VA, Grayson ...","51021, 51035, 51077, 51155, 51173, 51197",,,,,,,,,,2015-12-31
3203,"Yabucoa Municipio, PR",72151,"Humacao Municipio, PR, Las Piedras Municipio, ...","72069, 72085, 72095, 72109, 72129, 72151",,,,,,,,,,2015-12-31
3212,"Yauco Municipio, PR",72153,"Adjuntas Municipio, PR, Guánica Municipio, PR,...","72001, 72055, 72059, 72081, 72093, 72121, 72153",,,,,,,,,,2015-12-31
3218,"Yoakum County, TX",48501,"Lea County, NM, Cochran County, TX, Gaines Cou...","35025, 48079, 48165, 48219, 48445, 48501",,,,,,,,,,2015-12-31


### Repeat this process again

In [49]:
def dataImputation2(column_value, fips, neighbor):
    if pd.isna(column_value) == True:
        neighborList = neighbor.split(', ')
        rateSum = 0
        k = 0
        for code in neighborList:
            if code in dict2:
                if pd.isna(dict2[code]) == False:
                    k = k + 1
                    rateSum = rateSum + dict2[code]
        if k != 0: 
            mean_rate = rateSum/k
            return mean_rate
    else:
        return column_value

In [50]:
dict2 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation2(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

In [51]:
df_missing = county_date[pd.isna(county_date[column_name]) == True]
df_missing

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
21,"Adjuntas Municipio, PR",72001,"Adjuntas Municipio, PR, Guayanilla Municipio, ...","72001, 72059, 72081, 72111, 72113, 72141, 72153",,,,,,,,,,2015-01-01
22,"Aguada Municipio, PR",72003,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72011, 72099, 72117",,,,,,,,,,2015-01-01
23,"Aguadilla Municipio, PR",72005,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72071, 72099",,,,,,,,,,2015-01-01
24,"Aguas Buenas Municipio, PR",72007,"Aguas Buenas Municipio, PR, Bayamón Municipio,...","72007, 72021, 72025, 72041, 72045, 72061, 72127",,,,,,,,,,2015-01-01
25,"Aibonito Municipio, PR",72009,"Aibonito Municipio, PR, Barranquitas Municipio...","72009, 72019, 72035, 72041, 72043, 72123",,,,,,,,,,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983,"Vieques Municipio, PR",72147,"Ceiba Municipio, PR, Vieques Municipio, PR","72037, 72147",,,,,,,,,,2015-12-31
2986,"Villalba Municipio, PR",72149,"Coamo Municipio, PR, Juana Díaz Municipio, PR,...","72043, 72075, 72107, 72149",,,,,,,,,,2015-12-31
3111,"Western District, AS",60050,"Eastern District, AS, Western District, AS","60010, 60050",,,,,,,,,,2015-12-31
3203,"Yabucoa Municipio, PR",72151,"Humacao Municipio, PR, Las Piedras Municipio, ...","72069, 72085, 72095, 72109, 72129, 72151",,,,,,,,,,2015-12-31


### Repeet the process the third time

In [52]:
def dataImputation3(column_value, fips, neighbor):
    if pd.isna(column_value) == True:
        neighborList = neighbor.split(', ')
        rateSum = 0
        k = 0
        for code in neighborList:
            if code in dict3:
                if pd.isna(dict3[code]) == False:
                    k = k + 1
                    rateSum = rateSum + dict3[code]
        if k != 0: 
            mean_rate = rateSum/k
            return mean_rate
    else:
        return column_value

In [53]:
dict3 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation3(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

In [54]:
df_missing = county_date[pd.isna(county_date[column_name]) == True]
df_missing

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
21,"Adjuntas Municipio, PR",72001,"Adjuntas Municipio, PR, Guayanilla Municipio, ...","72001, 72059, 72081, 72111, 72113, 72141, 72153",,,,,,,,,,2015-01-01
22,"Aguada Municipio, PR",72003,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72011, 72099, 72117",,,,,,,,,,2015-01-01
23,"Aguadilla Municipio, PR",72005,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72071, 72099",,,,,,,,,,2015-01-01
24,"Aguas Buenas Municipio, PR",72007,"Aguas Buenas Municipio, PR, Bayamón Municipio,...","72007, 72021, 72025, 72041, 72045, 72061, 72127",,,,,,,,,,2015-01-01
25,"Aibonito Municipio, PR",72009,"Aibonito Municipio, PR, Barranquitas Municipio...","72009, 72019, 72035, 72041, 72043, 72123",,,,,,,,,,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983,"Vieques Municipio, PR",72147,"Ceiba Municipio, PR, Vieques Municipio, PR","72037, 72147",,,,,,,,,,2015-12-31
2986,"Villalba Municipio, PR",72149,"Coamo Municipio, PR, Juana Díaz Municipio, PR,...","72043, 72075, 72107, 72149",,,,,,,,,,2015-12-31
3111,"Western District, AS",60050,"Eastern District, AS, Western District, AS","60010, 60050",,,,,,,,,,2015-12-31
3203,"Yabucoa Municipio, PR",72151,"Humacao Municipio, PR, Las Piedras Municipio, ...","72069, 72085, 72095, 72109, 72129, 72151",,,,,,,,,,2015-12-31


### Repeat the process the Fourth time

In [55]:
def dataImputation4(column_value, fips, neighbor):
    if pd.isna(column_value) == True:
        neighborList = neighbor.split(', ')
        rateSum = 0
        k = 0
        for code in neighborList:
            if code in dict4:
                if pd.isna(dict4[code]) == False:
                    k = k + 1
                    rateSum = rateSum + dict4[code]
        if k != 0: 
            mean_rate = rateSum/k
            return mean_rate
    else:
        return column_value

In [56]:
dict4 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation4(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

In [57]:
df_missing = county_date[pd.isna(county_date[column_name]) == True]
df_missing

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
21,"Adjuntas Municipio, PR",72001,"Adjuntas Municipio, PR, Guayanilla Municipio, ...","72001, 72059, 72081, 72111, 72113, 72141, 72153",,,,,,,,,,2015-01-01
22,"Aguada Municipio, PR",72003,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72011, 72099, 72117",,,,,,,,,,2015-01-01
23,"Aguadilla Municipio, PR",72005,"Aguada Municipio, PR, Aguadilla Municipio, PR,...","72003, 72005, 72071, 72099",,,,,,,,,,2015-01-01
24,"Aguas Buenas Municipio, PR",72007,"Aguas Buenas Municipio, PR, Bayamón Municipio,...","72007, 72021, 72025, 72041, 72045, 72061, 72127",,,,,,,,,,2015-01-01
25,"Aibonito Municipio, PR",72009,"Aibonito Municipio, PR, Barranquitas Municipio...","72009, 72019, 72035, 72041, 72043, 72123",,,,,,,,,,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983,"Vieques Municipio, PR",72147,"Ceiba Municipio, PR, Vieques Municipio, PR","72037, 72147",,,,,,,,,,2015-12-31
2986,"Villalba Municipio, PR",72149,"Coamo Municipio, PR, Juana Díaz Municipio, PR,...","72043, 72075, 72107, 72149",,,,,,,,,,2015-12-31
3111,"Western District, AS",60050,"Eastern District, AS, Western District, AS","60010, 60050",,,,,,,,,,2015-12-31
3203,"Yabucoa Municipio, PR",72151,"Humacao Municipio, PR, Las Piedras Municipio, ...","72069, 72085, 72095, 72109, 72129, 72151",,,,,,,,,,2015-12-31


### Impute RH_min record with the same process

In [59]:
column_name = 'RH_min'

In [60]:
dict1 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation1(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

dict2 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation2(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

dict3 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation3(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

dict4 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation4(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

### Impute RH_max record with the same proccess

In [61]:
column_name = 'RH_max'

In [62]:
dict1 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation1(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

dict2 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation2(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

dict3 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation3(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

dict4 = findDictionary(county_date, column_name)
county_date[column_name] = county_date.apply(lambda x: dataImputation4(x[column_name], x['fips'], x['Neighbor Code']), axis = 1)

In [63]:
county_date

Unnamed: 0,county,fips,Neighbors,Neighbor Code,year,month,day,longitude,latitude,coor,RH_mean,RH_min,RH_max,date
0,"Abbeville County, SC",45001,"Elbert County, GA, Abbeville County, SC, Ander...","13105, 45001, 45007, 45045, 45047, 45059, 45065",2015,1.0,1.0,-82.153,34.254,"(34.254, -82.153)",74.131870,35.016182,95.092339,2015-01-01
1,"Acadia Parish, LA",22001,"Acadia Parish, LA, Evangeline Parish, LA, Jeff...","22001, 22039, 22053, 22055, 22097, 22113",2015,1.0,1.0,-91.990,30.199,"(30.199, -91.99)",69.395964,53.901741,79.525787,2015-01-01
2,"Accomack County, VA",51001,"Somerset County, MD, Worcester County, MD, Acc...","24039, 24047, 51001, 51103, 51115, 51119, 5113...",,,,,,,90.634712,76.105036,100.000000,2015-01-01
3,"Ada County, ID",16001,"Ada County, ID, Boise County, ID, Canyon Count...","16001, 16015, 16027, 16039, 16045, 16073",2015,1.0,1.0,-116.241,43.567,"(43.567, -116.241)",79.331573,70.576622,86.520767,2015-01-01
4,"Adair County, IA",19001,"Adair County, IA, Adams County, IA, Audubon Co...","19001, 19003, 19009, 19029, 19049, 19077, 1912...",,,,,,,69.838634,48.107819,82.730965,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,"Yuma County, AZ",04027,"La Paz County, AZ, Maricopa County, AZ, Pima C...","04012, 04013, 04019, 04027, 06025",,,,,,,37.834705,18.618366,52.298965,2015-12-31
3229,"Yuma County, CO",08125,"Kit Carson County, CO, Logan County, CO, Phill...","08063, 08075, 08095, 08121, 08125, 20023, 3102...",2015,12.0,31.0,-102.282,39.241,"(39.241, -102.282)",58.294590,25.593163,78.518997,2015-12-31
3230,"Zapata County, TX",48505,"Jim Hogg County, TX, Starr County, TX, Webb Co...","48247, 48427, 48479, 48505",,,,,,,79.188763,66.160065,92.833443,2015-12-31
3231,"Zavala County, TX",48507,"Dimmit County, TX, Frio County, TX, Kinney Cou...","48127, 48163, 48271, 48323, 48325, 48463, 48507",,,,,,,74.426829,53.981486,90.659373,2015-12-31


In [64]:
data_valid = county_date[['county','fips', 'date', 'RH_mean', 'RH_min', 'RH_max']]

In [65]:
year

'2015'

In [66]:
data_valid.to_csv('relative_humidity_data_imputed/RH_daily_imputed_' + str(year) + '.csv')