In [15]:
import pandas as pd
import numpy as np
import tqdm

In [16]:
country_station_df = pd.read_csv('county-station.csv')

In [17]:
country_station_df = country_station_df.drop(columns=['Unnamed: 0'], axis=1)

In [41]:
columns = np.load('columns.npy')
metrics = ['avg', 'min', 'max']
county_names = list(set(country_station_df['County Name'].to_list()))
dates = np.load('dates.npy')

In [42]:
def get_stations(county):
    a = country_station_df[country_station_df['County Name'] == county]
    return a['Station'].to_list()

In [43]:
def get_data_from_station(station, correct=False):
    station_df = pd.read_csv('data/weather/minnesota_daily/' + station + '.csv', names=['data', 'avg', 'min', 'max','prec'], header=None, index_col='data')
    if correct:
        station_df = station_df.apply(correct_avg, axis=1)
    
    first_year = int(station_df.index[0][:4])
    last_year = int(station_df.index[-1][:4])
    
#     print(first_year, last_year)
#     print(station_df)
    
    data = {}
    for year in range(first_year, last_year + 1):
        # selected_df = station_df[str(year)+"-04-30":str(year)+"-08-25"]
        # print(len(selected_df))
        current_year = []
        for date in dates:
            for metric in metrics:
                if str(year) + '-' + date in station_df.index:
                    current_year.append(station_df.loc[str(year) + '-' + date][metric])
                else:
                    current_year.append(None)
        
        data[year] = np.asarray(current_year)
        
    # return pd.DataFrame(index=data.keys(), data=data.values(), columns=columns)
    return data



In [58]:
def unite_stations(station_data, county_name=''):
    years = []
    for data in station_data:
        years = years + list(data.keys())
    years = list(set(years))
    final_data = {}
    for year in years:
        curr = []
        for i in range(354):
            correct_values = []
            for d in station_data:
                if year in d.keys() and not pd.isnull(d[year][i]):
                    correct_values.append(d[year][i])
            if len(correct_values) == 0:
                curr.append(None)
            else:
                curr.append(sum(correct_values) / len(correct_values))
        
        if (len([x for x in curr if x is not None])) == 354:
            final_data[str(year) + '-' + county_name] = curr
    return final_data
        

In [59]:
final_dataframe = pd.DataFrame(columns=columns)
for county in tqdm.tqdm(county_names):
    stations = get_stations(county)
    stations_data = []
    for station in stations:
        curr = get_data_from_station(station)
        stations_data.append(curr)
    current_data = unite_stations(stations_data, county)
    current_dataframe = pd.DataFrame(index=current_data.keys(), data=current_data.values(), columns=columns)
    final_dataframe = pd.concat([final_dataframe, current_dataframe])
    

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:30<00:00,  2.86it/s]


In [62]:
final_dataframe.to_csv('final_data_nonnull.csv')

In [60]:
final_dataframe

Unnamed: 0,04-30-avg,04-30-min,04-30-max,05-01-avg,05-01-min,05-01-max,05-02-avg,05-02-min,05-02-max,05-03-avg,...,08-22-max,08-23-avg,08-23-min,08-23-max,08-24-avg,08-24-min,08-24-max,08-25-avg,08-25-min,08-25-max
2020-McLeod County,21.45,15.5,27.3,17.55,7.05,25.35,20.65,15.75,26.35,26.10,...,27.55,22.40,17.2,29.10,10.05,3.65,15.60,11.90,9.75,14.20
2021-McLeod County,26.95,21.2,32.5,17.75,10.55,24.55,8.50,5.90,12.60,27.05,...,27.25,20.25,15.4,25.55,9.05,3.70,14.35,17.05,8.60,24.45
2021-Goodhue County,26.60,21.5,32.5,17.10,10.50,24.70,8.70,5.60,12.20,27.70,...,26.50,21.40,16.7,28.60,10.00,5.10,14.90,17.10,10.20,23.70
2017-Douglas County,22.50,17.2,27.2,15.50,8.30,22.20,16.10,8.90,22.20,25.60,...,21.70,18.60,15.0,21.70,12.70,3.30,20.00,16.20,11.70,22.20
2020-Douglas County,21.70,15.6,26.7,17.00,7.80,25.00,20.00,10.60,27.80,25.90,...,27.20,24.40,19.4,29.40,8.00,2.20,12.20,12.30,10.00,14.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-Clay County,21.95,17.0,26.0,17.70,8.00,26.00,20.80,12.50,29.00,25.55,...,28.00,25.85,21.0,31.00,7.60,2.50,12.00,14.50,7.50,20.00
2021-Clay County,26.20,20.0,32.0,16.00,8.50,23.00,9.40,7.00,12.00,25.95,...,28.50,21.00,17.0,25.50,5.90,-1.50,12.00,19.55,11.00,27.50
2021-Kandiyohi County,27.70,21.0,33.0,18.30,12.00,24.00,9.00,7.00,13.00,26.90,...,28.00,21.00,17.0,28.00,8.10,2.00,14.00,18.50,11.00,26.00
2021-Wright County,28.20,22.6,34.0,18.10,12.30,24.50,8.20,6.00,11.60,28.00,...,28.00,21.00,15.1,26.90,9.00,3.00,15.00,16.60,8.10,23.70


In [61]:
final_dataframe.isna().sum().sum()

0