## Notebook goal

Combine the weather, drought and bee datasets into a single dataframe, joined on state, year and quarter.

In [54]:
# import libraries
import os
import pandas as pd
import itertools

In [55]:
# Path to the intermediate datasets, built relative to the current working
# directory (the working directory itself is not changed).
ITM_DIR = os.path.join(
    os.getcwd(),
    '../data/intermediate',
)

In [56]:
# Load the three intermediate datasets that will be combined below.
# The files are read in the order drought -> weather -> bees.
drought, weather, bees = (
    pd.read_csv(os.path.join(ITM_DIR, filename))
    for filename in (
        'drought_quarterly.csv',
        'quarterly_weather_summary.csv',
        'bees.csv',
    )
)

In [57]:
# Left-join drought, then bees, onto the weather frame on (state, year, quarter).
# Every weather row is kept; rows without a match get NaN in the joined columns.
# NOTE(review): the join keys are not validated for uniqueness, so duplicate
# keys in drought or bees would multiply rows — confirm against the inputs.
df = pd.merge(weather, drought, how='left', on=['state', 'year', 'quarter'])
bees_full = pd.merge(df, bees, how='left', on=['state', 'year', 'quarter'])

In [58]:
# Display the merged frame (weather + drought + bees) to eyeball the result.
bees_full

Unnamed: 0,state,latitude,longitude,quarter,temperature_2m_meanmean,temperature_2m_meansum,relative_humidity_2m_meanmean,relative_humidity_2m_meansum,wind_speed_10m_maxmax,temperature_2m_maxmax,...,lost_colonies,percent_lost,added_colonies,renovated_colonies,percent_renovated,varroa_mites,other_pests_and_parasites,diseases,pesticides,other_or_unknown
0,Alabama,32.806671,-86.79113,1,9.138389,822.454992,72.118324,6490.649186,31.782108,25.149500,...,1800.0,26.0,2800.0,250.0,4.0,10.0,5.4,0.0,2.2,9.4
1,Alabama,32.806671,-86.79113,2,22.649935,2061.144045,73.748212,6711.087248,22.737070,35.149500,...,860.0,12.0,1900.0,680.0,9.0,16.7,42.5,0.0,2.3,4.1
2,Alabama,32.806671,-86.79113,3,26.056180,2397.168552,73.825042,6791.903907,18.014393,36.249500,...,1400.0,16.0,160.0,260.0,3.0,63.1,70.6,0.0,2.6,17.7
3,Alabama,32.806671,-86.79113,4,16.406248,1509.374798,75.721304,6966.360008,30.955812,29.049500,...,610.0,8.0,80.0,60.0,1.0,3.1,6.4,0.2,0.2,2.8
4,Alabama,32.806671,-86.79113,1,10.685764,972.404481,70.392922,6405.755898,32.777794,28.049500,...,1700.0,23.0,2100.0,90.0,1.0,24.2,22.0,4.3,8.1,11.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,Wyoming,42.755966,-107.30249,1,-8.064703,-725.823311,68.191139,6137.202541,45.143770,14.501500,...,180.0,2.0,20.0,0.0,0.0,2.2,0.0,0.0,0.0,1.2
1646,Wyoming,42.755966,-107.30249,2,6.131697,557.984432,54.287062,4940.122644,57.718000,29.551500,...,1900.0,7.0,4900.0,1800.0,7.0,12.6,2.5,5.1,0.0,3.0
1647,Wyoming,42.755966,-107.30249,3,18.094526,1664.696387,38.602263,3551.408216,40.732533,31.551502,...,3900.0,13.0,1100.0,2300.0,8.0,14.1,4.4,4.9,4.9,1.5
1648,Wyoming,42.755966,-107.30249,4,-4.583916,-421.720318,66.597217,6126.943967,56.993810,18.251501,...,3200.0,15.0,640.0,0.0,0.0,22.9,5.9,4.2,0.0,7.4


In [59]:
# Number of missing values per column, largest first (top 50 columns shown).
(
    bees_full
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(50)
)


other_or_unknown                 259
renovated_colonies               259
state_code                       259
num_colonies                     259
max_colonies                     259
percent_lost                     259
added_colonies                   259
lost_colonies                    259
percent_renovated                259
varroa_mites                     259
other_pests_and_parasites        259
diseases                         259
pesticides                       259
D3_max                             0
D2_mean                            0
D3_mean                            0
D4_mean                            0
D0_max                             0
D1_max                             0
D2_max                             0
D4_frac_nonzero                    0
D4_max                             0
D0_frac_nonzero                    0
D1_frac_nonzero                    0
D2_frac_nonzero                    0
D3_frac_nonzero                    0
latitude                           0
D

In [60]:
# List the distinct (state, year, quarter) combinations that have at least one
# missing value in any column, sorted for readability.
missing = (
    bees_full.loc[bees_full.isna().any(axis=1), ['state', 'year', 'quarter']]
    .drop_duplicates()
    .sort_values(by=['state', 'year', 'quarter'])
    .reset_index(drop=True)
)
missing

Unnamed: 0,state,year,quarter
0,Alabama,2019,2
1,Alabama,2023,1
2,Alaska,2015,1
3,Alaska,2015,2
4,Alaska,2015,3
...,...,...,...
254,West Virginia,2023,1
255,Wisconsin,2019,2
256,Wisconsin,2023,1
257,Wyoming,2019,2


In [61]:
# Keep a row if it is complete, or if it belongs to 2019 Q2.
# Incomplete 2019 Q2 rows are deliberately retained (see the
# "Missing data to be imputed" section further down).
has_missing = bees_full.isna().any(axis=1)
is_2019_q2 = (bees_full['year'] == 2019) & (bees_full['quarter'] == 2)
bees_full = bees_full[~has_missing | is_2019_q2]

In [45]:
# Re-check the per-column missing-value counts after the row filter
# (largest first, top 50 columns shown).
(
    bees_full
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(50)
)

other_or_unknown                 50
renovated_colonies               50
state_code                       50
num_colonies                     50
max_colonies                     50
percent_lost                     50
added_colonies                   50
lost_colonies                    50
percent_renovated                50
varroa_mites                     50
other_pests_and_parasites        50
diseases                         50
pesticides                       50
D3_max                            0
D2_mean                           0
D3_mean                           0
D4_mean                           0
D0_max                            0
D1_max                            0
D2_max                            0
D4_frac_nonzero                   0
D4_max                            0
D0_frac_nonzero                   0
D1_frac_nonzero                   0
D2_frac_nonzero                   0
D3_frac_nonzero                   0
latitude                          0
D1_mean                     

In [46]:
# Re-list the distinct (state, year, quarter) combinations that still contain
# at least one missing value after the row filter.
missing = (
    bees_full.loc[bees_full.isna().any(axis=1), ['state', 'year', 'quarter']]
    .drop_duplicates()
    .sort_values(by=['state', 'year', 'quarter'])
    .reset_index(drop=True)
)
missing

Unnamed: 0,state,year,quarter
0,Alabama,2019,2
1,Alaska,2019,2
2,Arizona,2019,2
3,Arkansas,2019,2
4,California,2019,2
5,Colorado,2019,2
6,Connecticut,2019,2
7,Delaware,2019,2
8,Florida,2019,2
9,Georgia,2019,2


# Missing data to be imputed

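The 50 remaining rows with missing values are for year 2019, quarter 2 — one row for each of the 50 states. However, the bee data does not cover every state that appears in the weather data; the states without any bee data are identified and removed below.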

In [47]:
# States that appear in the weather data but never in the bees data.
weather_states = weather['state'].unique()
bees_states = bees['state'].unique()
missing_states = set(weather_states).difference(bees_states)
missing_states

{'Alaska', 'Delaware', 'Nevada', 'New Hampshire', 'Rhode Island'}

In [48]:
# Drop every row that belongs to a state listed in missing_states.
bees_full = bees_full.loc[~bees_full['state'].isin(missing_states)]

In [53]:
# Number of remaining rows per state.
bees_full['state'].value_counts()

state
Alabama           32
Montana           32
New Jersey        32
New Mexico        32
New York          32
North Carolina    32
North Dakota      32
Ohio              32
Oklahoma          32
Oregon            32
Pennsylvania      32
South Carolina    32
South Dakota      32
Tennessee         32
Texas             32
Utah              32
Vermont           32
Virginia          32
Washington        32
West Virginia     32
Wisconsin         32
Nebraska          32
Missouri          32
Arizona           32
Mississippi       32
Arkansas          32
California        32
Colorado          32
Connecticut       32
Florida           32
Georgia           32
Idaho             32
Illinois          32
Indiana           32
Iowa              32
Kansas            32
Kentucky          32
Louisiana         32
Maine             32
Maryland          32
Massachusetts     32
Michigan          32
Minnesota         32
Wyoming           32
Hawaii            28
Name: count, dtype: int64

## Hawaii is missing 2022 data and is kept in the dataset for now

In [62]:
# Save the combined frame as a CSV file.
# The output directory is built relative to the current working directory.
OUT_DIR = os.path.join(os.getcwd(), '../data/cleaned')

# to_csv does not create missing parent directories, so make sure the target
# directory exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs(OUT_DIR, exist_ok=True)

# index=False keeps the DataFrame index out of the written file.
# NOTE(review): the incomplete 2019 Q2 rows kept earlier are written as-is,
# so the "cleaned" file may still contain NaN values — confirm this is intended.
bees_full.to_csv(os.path.join(OUT_DIR, 'bees_full_cleaned.csv'), index=False)