# Append and Clean Weather Data

The weather data for all stations of interest was too large to download as a single file, so I downloaded it in chunks and then merged the chunks.

In [1]:
import pandas as pd
import math
import time, sys
from datetime import datetime

## Global Variables

In [2]:
# Input chunks to merge. The SAMPLE_* files are the active inputs; the
# alternates previously listed here were
# 'STA1_200801010000_200812310000.txt' and 'STA2_200801010000_200812310000.txt'.
data_file1, data_file2 = 'SAMPLE_STA1.csv', 'SAMPLE_STA2.csv'

## Function Definitions

In [3]:
def get_epoch(str):
    """Convert a 'YYYY-MM-DD HH:MM' string to epoch milliseconds, floored to the hour.

    The string is parsed into a naive datetime, so ``timestamp()`` interprets
    it in the local timezone of the machine running the notebook. The epoch
    seconds are rounded down to a multiple of 3600 and returned, as a float,
    in milliseconds.

    Raises ValueError when the string does not match '%Y-%m-%d %H:%M'.
    """
    # NOTE: the parameter name shadows the builtin ``str`` inside this function;
    # it is kept unchanged so existing callers are unaffected.
    parsed = datetime.strptime(str, '%Y-%m-%d %H:%M')
    epoch_seconds = parsed.timestamp()
    # Drop the part of the hour that has already elapsed.
    _, seconds_into_hour = divmod(epoch_seconds, 3600)
    return (epoch_seconds - seconds_into_hour) * 1000

def format_as_epoch_time(year, month, dayofMonth, deptime):
    """Return the hour-floored epoch time for a flight departure, or 0 on failure.

    Parameters
    ----------
    year, month, dayofMonth
        Date parts, formatted as-is into a 'Y-M-D' string.
    deptime
        Departure time as an HHMM number (e.g. 1504 for 15:04, 5 for 00:05).
        Anything ``int()`` accepts is allowed.

    Returns
    -------
    The value ``get_epoch`` returns for the assembled 'Y-M-D HH:MM' string,
    or 0 when the inputs cannot be converted or parsed (for example a NaN
    departure time, or an hour/minute outside the valid range such as 2400).
    """
    try:
        # Zero-pad to four digits so times before 01:00 (5 -> '0005') still
        # split into a valid 'HH:MM' instead of ':5', which used to fall
        # through to the 0 sentinel.
        hhmm = "{:04d}".format(int(deptime))
        stamp = "{}-{}-{} {}:{}".format(year, month, dayofMonth, hhmm[:-2], hhmm[-2:])
        return get_epoch(stamp)
    except (ValueError, TypeError, OverflowError, OSError):
        # Best-effort contract: unparseable rows map to 0 rather than aborting
        # the whole column. KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0

## Reading the Data

In [4]:
# Load every chunk into its own DataFrame, then show its row count and a
# two-row preview.
weather_data = [pd.read_csv(path) for path in (data_file1, data_file2)]
for frame in weather_data:
    display(len(frame['station']))
    display(frame.head(2))

500

Unnamed: 0.1,Unnamed: 0,station,valid,lon,lat,tmpf,dwpf,relh,drct,sknt,...,skyl4,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar
0,5614783,CBF,2008-12-22 15:04,-95.7604,41.2611,-5.0,-11.0,74.61,0.0,0.0,...,M,M,M,M,M,M,M,M,-5.0,M
1,5240486,ORC,2008-11-30 07:35,-96.0606,42.9895,30.2,26.6,86.28,340.0,9.0,...,M,-SN,M,M,M,M,M,M,21.28,METAR KORC 300735Z AUTO 34009KT 7SM -SN OVC032...


500

Unnamed: 0.1,Unnamed: 0,station,valid,lon,lat,tmpf,dwpf,relh,drct,sknt,...,skyl4,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar
0,6381394,MUT,2008-11-12 09:55,-91.1406,41.367,44.6,44.6,100.0,150.0,6.0,...,M,M,M,M,M,M,M,M,40.71,M
1,2996452,CIN,2008-05-29 21:12,-94.7889,42.0444,62.0,60.0,93.15,130.0,14.0,...,M,M,M,M,M,M,M,M,62.0,M


## Data processing

In [5]:
# Constant shift added to every weather timestamp: 4 hours, in milliseconds.
# NOTE(review): the reason for the shift is not visible in this notebook --
# presumably it aligns the weather clock with the flight data; confirm.
EPOCH_SHIFT_MS = 4 * 3600 * 1000

for i, data in enumerate(weather_data):

    print('creating epoch time column...')
    # Hour-floored epoch (ms, float) of each observation plus the fixed shift.
    data['epoch_time'] = data['valid'].map(get_epoch) + EPOCH_SHIFT_MS

    print('removing duplicate weather entries...')
    # Keep one row per (station, hour): the earliest observation in that hour.
    # 'valid' is compared as a string, which orders chronologically for the
    # 'YYYY-MM-DD HH:MM' format that get_epoch requires.
    earliest_valid = data.groupby(['station', 'epoch_time'])['valid'].transform('min')
    data = (
        data.loc[data['valid'] == earliest_valid]
        # Rows that tie on the earliest timestamp would otherwise all survive
        # and produce repeated merge_id values.
        .drop_duplicates(subset=['station', 'epoch_time'])
        # Own the data so the column assignment below does not write to a slice.
        .copy()
    )

    print('creating merge_id column...')
    # Key format is '<epoch_time>_<station>'; epoch_time is a float, so the key
    # reads like '1229972400000.0_CBF'. Left unchanged because whatever joins
    # on this key is outside this notebook.
    data['merge_id'] = ["{}_{}".format(epoch, iata) for epoch, iata in zip(data['epoch_time'], data['station'])]

    weather_data[i] = data
    print("data {} is done".format(i))

creating epoch time column...
removing duplicate weather entries...
creating merge_id column...
data 0 is done
creating epoch time column...
removing duplicate weather entries...
creating merge_id column...
data 1 is done


## Merge Weather Dataframes

In [6]:
# Stack the processed chunks into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent (weather_data holds exactly the two chunks loaded above).
bigdata = pd.concat(weather_data, ignore_index=True)
# NOTE: to_csv writes the index as an extra unnamed leading column by default;
# left as-is so the output file layout does not change.
bigdata.to_csv(path_or_buf="./MERGED_STA_SAMPLE.csv", sep=',')

In [8]:
# Sanity check on the merged frame: total row count, then a two-row preview.
merged_row_count = len(bigdata['station'])
merged_preview = bigdata.head(2)
display(merged_row_count)
display(merged_preview)

1000

Unnamed: 0.1,Unnamed: 0,station,valid,lon,lat,tmpf,dwpf,relh,drct,sknt,...,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,epoch_time,merge_id
0,5614783,CBF,2008-12-22 15:04,-95.7604,41.2611,-5.0,-11.0,74.61,0.0,0.0,...,M,M,M,M,M,M,-5.0,M,1229972000000.0,1229972400000.0_CBF
1,5240486,ORC,2008-11-30 07:35,-96.0606,42.9895,30.2,26.6,86.28,340.0,9.0,...,M,M,M,M,M,M,21.28,METAR KORC 300735Z AUTO 34009KT 7SM -SN OVC032...,1228043000000.0,1228042800000.0_ORC
