# Wrangling Balloon Soundings data

Result: `soundings.csv`, an index that matches each sounding's KML file, raw-data ZIP and Graphs directory by timestamp.

In [1]:
import os
import pandas as pd
from datetime import datetime as dt

import path_utils as pu

In [2]:
# Project root. Defaults to the original author's checkout; set the PERLAN_ROOT
# environment variable to point at your own copy instead of editing this cell.
root = os.environ.get("PERLAN_ROOT", "/Users/jdm/workbench/Perlan")
os.chdir(root)

In [3]:
# Sanity check: print the working directory set by os.chdir(root)
!pwd

/Users/jdm/workbench/Perlan


In [4]:
# List the contents of the working directory
!ls

[34mControlled.svn[m[m                    [34mdata_website[m[m
Perlan Encore Fellowship          [34mdata_website.broken[m[m
[34mPerlanProject-2020-07-07T19-16-38[m[m [34mdata_website.drupaled.broken[m[m
[34mScience.git[m[m                       [34mperlanproject.org[m[m
[34mTRASH_LATER[m[m                       [34mpods[m[m
[34massets[m[m                            [34mpods.old[m[m
[34mclippings[m[m                         [34mtmp[m[m
[34mdata[m[m                              [34mwindField[m[m
data website plan.ooutline        wp-config.php.save


In [5]:
# Print the current working directory
!pwd

/Users/jdm/workbench/Perlan


In [6]:
# Get the names of the Graphs directories and parse each name into a timestamp we can use
# to match up the Raw data zip and KML files

def make_graphs_df(d=pu.soundings_root):
    """Build a DataFrame describing the Graphs directories found under ``d``.

    Directory names are assumed to end in ``_Graphs`` and to look like
    ``MM_DD_YYYY_Graphs``, ``MM_DD_YYYY_HHMM_Graphs`` or
    ``MM_DD_YYYY_HHMMZ_Graphs``.

    Parameters
    ----------
    d : directory to scan; handed to ``pu.get_subdirs``.

    Returns
    -------
    pandas.DataFrame
        One row per directory, sorted by directory name, with columns
        ``dir``, ``dir_base``, ``ts``, ``month``, ``date``, ``year``,
        ``time`` (all text) and ``datetime`` (parsed from ``ts``).

    Raises
    ------
    ValueError
        If a normalised ``ts`` does not match ``%m_%d_%Y_%H%M``.
    """
    ts_len = len("MM_DD_YYYY_HHMM")  # length of a complete timestamp key
    graphs_suffix = "_Graphs"

    # Scan the directory that was asked for (the argument used to be ignored)
    # and keep only the last path component of each entry.
    names = sorted(os.path.basename(p) for p in pu.get_subdirs(d))

    df = pd.DataFrame({'dir': names})
    df['dir_base'] = df['dir'].str[:-len(graphs_suffix)]  # strip "_Graphs"

    # Normalise to MM_DD_YYYY_HHMM: names without a time get midnight, names
    # with a time keep it and lose anything after it (e.g. a trailing "Z").
    df['ts'] = df['dir_base'].apply(
        lambda s: s + "_0000" if len(s) < ts_len else s[:ts_len]
    )
    df['month'] = df['ts'].str[:2]
    df['date'] = df['ts'].str[3:5]
    df['year'] = df['ts'].str[6:10]
    df['time'] = df['ts'].str[-4:]
    df['datetime'] = pd.to_datetime(df['ts'], format="%m_%d_%Y_%H%M")

    return df

# Index of the Graphs directories, one row per directory
graphs_df = make_graphs_df()
graphs_df

Unnamed: 0,dir,dir_base,ts,month,date,year,time,datetime
0,07_16_2017_Graphs,07_16_2017,07_16_2017_0000,07,16,2017,0000,2017-07-16 00:00:00
1,07_18_2017_Graphs,07_18_2017,07_18_2017_0000,07,18,2017,0000,2017-07-18 00:00:00
2,07_24_2017_Graphs,07_24_2017,07_24_2017_0000,07,24,2017,0000,2017-07-24 00:00:00
3,07_25_2017_Graphs,07_25_2017,07_25_2017_0000,07,25,2017,0000,2017-07-25 00:00:00
4,07_30_2017_Graphs,07_30_2017,07_30_2017_0000,07,30,2017,0000,2017-07-30 00:00:00
...,...,...,...,...,...,...,...,...
60,09_12_2019_1730Z_Graphs,09_12_2019_1730Z,09_12_2019_1730,09,12,2019,1730,2019-09-12 17:30:00
61,09_13_2018_Graphs,09_13_2018,09_13_2018_0000,09,13,2018,0000,2018-09-13 00:00:00
62,09_14_2019_1030Z_Graphs,09_14_2019_1030Z,09_14_2019_1030,09,14,2019,1030,2019-09-14 10:30:00
63,09_14_2019_1300Z_Graphs,09_14_2019_1300Z,09_14_2019_1300,09,14,2019,1300,2019-09-14 13:00:00


In [7]:
# Get the names of the data files (Raw ZIP or KML, chosen by suffix) and parse each name
# into a timestamp we can use to match them up with the Graphs directories

def make_files_df(d=pu.soundings_root, suffix='zip'):
    """Build a DataFrame describing the ``*.<suffix>`` files found under ``d``.

    File names are assumed to look like ``MM_DD_YYYY.<suffix>``,
    ``MM_DD_YYYY_HHMM.<suffix>`` or ``MM_DD_YYYY_HHMMZ.<suffix>``.

    Parameters
    ----------
    d : directory to scan; handed to ``pu.get_files``.
    suffix : file extension without the dot (the notebook uses ``'zip'``
        and ``'kml'``).

    Returns
    -------
    pandas.DataFrame
        One row per file, sorted by file name, with columns ``file``,
        ``file_base``, ``ts``, ``month``, ``date``, ``year``, ``time``
        (all text) and ``datetime`` (parsed from ``ts``).

    Raises
    ------
    ValueError
        If a normalised ``ts`` does not match ``%m_%d_%Y_%H%M``.
    """
    ts_len = len("MM_DD_YYYY_HHMM")  # length of a complete timestamp key

    # Scan the directory that was asked for (the argument used to be ignored)
    # and keep only the last path component of each entry.
    names = sorted(os.path.basename(p) for p in pu.get_files(d, suffix=suffix))

    df = pd.DataFrame({'file': names})
    df['file_base'] = df['file'].str[:-len(f".{suffix}")]  # strip off suffix

    # Normalise to MM_DD_YYYY_HHMM: names without a time get midnight, names
    # with a time keep it and lose anything after it (e.g. a trailing "Z").
    df['ts'] = df['file_base'].apply(
        lambda s: s + "_0000" if len(s) < ts_len else s[:ts_len]
    )
    df['month'] = df['ts'].str[:2]
    df['date'] = df['ts'].str[3:5]
    df['year'] = df['ts'].str[6:10]
    df['time'] = df['ts'].str[-4:]
    df['datetime'] = pd.to_datetime(df['ts'], format="%m_%d_%Y_%H%M")

    return df

In [8]:
# Index of the raw-data ZIP archives, one row per file
raw_df = make_files_df(suffix='zip')
raw_df

Unnamed: 0,file,file_base,ts,month,date,year,time,datetime
0,07_14_2017.zip,07_14_2017,07_14_2017_0000,07,14,2017,0000,2017-07-14 00:00:00
1,07_16_2017.zip,07_16_2017,07_16_2017_0000,07,16,2017,0000,2017-07-16 00:00:00
2,07_18_2017.zip,07_18_2017,07_18_2017_0000,07,18,2017,0000,2017-07-18 00:00:00
3,07_24_2017.zip,07_24_2017,07_24_2017_0000,07,24,2017,0000,2017-07-24 00:00:00
4,07_25_2017.zip,07_25_2017,07_25_2017_0000,07,25,2017,0000,2017-07-25 00:00:00
...,...,...,...,...,...,...,...,...
74,09_17_2019_1100Z.zip,09_17_2019_1100Z,09_17_2019_1100,09,17,2019,1100,2019-09-17 11:00:00
75,09_17_2019_1545Z.zip,09_17_2019_1545Z,09_17_2019_1545,09,17,2019,1545,2019-09-17 15:45:00
76,09_18_2019_1530Z.zip,09_18_2019_1530Z,09_18_2019_1530,09,18,2019,1530,2019-09-18 15:30:00
77,09_19_2019_1730Z.zip,09_19_2019_1730Z,09_19_2019_1730,09,19,2019,1730,2019-09-19 17:30:00


In [9]:
# Index of the KML files, one row per file
kml_df = make_files_df(suffix='kml')
kml_df

Unnamed: 0,file,file_base,ts,month,date,year,time,datetime
0,07_14_2017.kml,07_14_2017,07_14_2017_0000,07,14,2017,0000,2017-07-14 00:00:00
1,07_16_2017.kml,07_16_2017,07_16_2017_0000,07,16,2017,0000,2017-07-16 00:00:00
2,07_24_2017.kml,07_24_2017,07_24_2017_0000,07,24,2017,0000,2017-07-24 00:00:00
3,07_25_2017.kml,07_25_2017,07_25_2017_0000,07,25,2017,0000,2017-07-25 00:00:00
4,07_30_2017.kml,07_30_2017,07_30_2017_0000,07,30,2017,0000,2017-07-30 00:00:00
...,...,...,...,...,...,...,...,...
64,09_17_2019_1100Z.kml,09_17_2019_1100Z,09_17_2019_1100,09,17,2019,1100,2019-09-17 11:00:00
65,09_17_2019_1545Z.kml,09_17_2019_1545Z,09_17_2019_1545,09,17,2019,1545,2019-09-17 15:45:00
66,09_18_2019_1530Z.kml,09_18_2019_1530Z,09_18_2019_1530,09,18,2019,1530,2019-09-18 15:30:00
67,09_19_2019_1730Z.kml,09_19_2019_1730Z,09_19_2019_1730,09,19,2019,1730,2019-09-19 17:30:00


# Merge

In [10]:
# Keep only the directory name, keyed by the normalised timestamp, ready for joining
graphs = (
    graphs_df[['dir', 'ts']]
    .rename(columns={'dir': 'graphs_dir'})
    .set_index('ts')
)
graphs

Unnamed: 0_level_0,graphs_dir
ts,Unnamed: 1_level_1
07_16_2017_0000,07_16_2017_Graphs
07_18_2017_0000,07_18_2017_Graphs
07_24_2017_0000,07_24_2017_Graphs
07_25_2017_0000,07_25_2017_Graphs
07_30_2017_0000,07_30_2017_Graphs
...,...
09_12_2019_1730,09_12_2019_1730Z_Graphs
09_13_2018_0000,09_13_2018_Graphs
09_14_2019_1030,09_14_2019_1030Z_Graphs
09_14_2019_1300,09_14_2019_1300Z_Graphs


In [11]:
# Keep only the ZIP file name, keyed by the normalised timestamp, ready for joining
raws = (
    raw_df[['file', 'ts']]
    .rename(columns={'file': 'raw_file'})
    .set_index('ts')
)
raws

Unnamed: 0_level_0,raw_file
ts,Unnamed: 1_level_1
07_14_2017_0000,07_14_2017.zip
07_16_2017_0000,07_16_2017.zip
07_18_2017_0000,07_18_2017.zip
07_24_2017_0000,07_24_2017.zip
07_25_2017_0000,07_25_2017.zip
...,...
09_17_2019_1100,09_17_2019_1100Z.zip
09_17_2019_1545,09_17_2019_1545Z.zip
09_18_2019_1530,09_18_2019_1530Z.zip
09_19_2019_1730,09_19_2019_1730Z.zip


In [12]:
# Keep only the KML file name, keyed by the normalised timestamp, ready for joining
kmls = (
    kml_df[['file', 'ts']]
    .rename(columns={'file': 'kml_file'})
    .set_index('ts')
)
kmls

Unnamed: 0_level_0,kml_file
ts,Unnamed: 1_level_1
07_14_2017_0000,07_14_2017.kml
07_16_2017_0000,07_16_2017.kml
07_24_2017_0000,07_24_2017.kml
07_25_2017_0000,07_25_2017.kml
07_30_2017_0000,07_30_2017.kml
...,...
09_17_2019_1100,09_17_2019_1100Z.kml
09_17_2019_1545,09_17_2019_1545Z.kml
09_18_2019_1530,09_18_2019_1530Z.kml
09_19_2019_1730,09_19_2019_1730Z.kml


In [13]:
# Outer join on the `ts` index keeps a row for every timestamp that has a KML file,
# a raw ZIP, or both; the missing side is NaN.
# lsuffix only matters when column names collide, which they do not here
# (kml_file vs raw_file).
merged = kmls.join(raws, how='outer', lsuffix='_left')
merged

Unnamed: 0_level_0,kml_file,raw_file
ts,Unnamed: 1_level_1,Unnamed: 2_level_1
07_14_2017_0000,07_14_2017.kml,07_14_2017.zip
07_16_2017_0000,07_16_2017.kml,07_16_2017.zip
07_18_2017_0000,,07_18_2017.zip
07_24_2017_0000,07_24_2017.kml,07_24_2017.zip
07_25_2017_0000,07_25_2017.kml,07_25_2017.zip
...,...,...
09_17_2019_1100,09_17_2019_1100Z.kml,09_17_2019_1100Z.zip
09_17_2019_1545,09_17_2019_1545Z.kml,09_17_2019_1545Z.zip
09_18_2019_1530,09_18_2019_1530Z.kml,09_18_2019_1530Z.zip
09_19_2019_1730,09_19_2019_1730Z.kml,09_19_2019_1730Z.zip


In [14]:
# Build the three-way outer join directly from the per-source frames instead of
# extending whatever `merged` currently holds: `merged = merged.join(graphs, ...)`
# gives a different result each time the cell is re-run, this does not.
# No lsuffix is passed: the three frames have distinct column names, and an
# unexpected collision should raise rather than be silently renamed.
merged = kmls.join(raws, how='outer').join(graphs, how='outer')

# Expose the join key as a regular column and parse it once, vectorised.
merged['ts'] = merged.index
merged['datetime'] = pd.to_datetime(merged['ts'], format="%m_%d_%Y_%H%M")

merged

Unnamed: 0_level_0,kml_file,raw_file,graphs_dir,ts,datetime
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
07_14_2017_0000,07_14_2017.kml,07_14_2017.zip,,07_14_2017_0000,2017-07-14 00:00:00
07_16_2017_0000,07_16_2017.kml,07_16_2017.zip,07_16_2017_Graphs,07_16_2017_0000,2017-07-16 00:00:00
07_18_2017_0000,,07_18_2017.zip,07_18_2017_Graphs,07_18_2017_0000,2017-07-18 00:00:00
07_24_2017_0000,07_24_2017.kml,07_24_2017.zip,07_24_2017_Graphs,07_24_2017_0000,2017-07-24 00:00:00
07_25_2017_0000,07_25_2017.kml,07_25_2017.zip,07_25_2017_Graphs,07_25_2017_0000,2017-07-25 00:00:00
...,...,...,...,...,...
09_17_2019_1100,09_17_2019_1100Z.kml,09_17_2019_1100Z.zip,,09_17_2019_1100,2019-09-17 11:00:00
09_17_2019_1545,09_17_2019_1545Z.kml,09_17_2019_1545Z.zip,,09_17_2019_1545,2019-09-17 15:45:00
09_18_2019_1530,09_18_2019_1530Z.kml,09_18_2019_1530Z.zip,,09_18_2019_1530,2019-09-18 15:30:00
09_19_2019_1730,09_19_2019_1730Z.kml,09_19_2019_1730Z.zip,,09_19_2019_1730,2019-09-19 17:30:00


In [15]:
# Per-column count of timestamps that lack a KML file, raw ZIP or Graphs directory
merged.isna().sum()

kml_file      17
raw_file       7
graphs_dir    21
ts             0
datetime       0
dtype: int64

In [16]:
# `ts` is already stored as a column, so replace the index with a plain 0..n-1 range
merged = merged.reset_index(drop=True)
merged

Unnamed: 0,kml_file,raw_file,graphs_dir,ts,datetime
0,07_14_2017.kml,07_14_2017.zip,,07_14_2017_0000,2017-07-14 00:00:00
1,07_16_2017.kml,07_16_2017.zip,07_16_2017_Graphs,07_16_2017_0000,2017-07-16 00:00:00
2,,07_18_2017.zip,07_18_2017_Graphs,07_18_2017_0000,2017-07-18 00:00:00
3,07_24_2017.kml,07_24_2017.zip,07_24_2017_Graphs,07_24_2017_0000,2017-07-24 00:00:00
4,07_25_2017.kml,07_25_2017.zip,07_25_2017_Graphs,07_25_2017_0000,2017-07-25 00:00:00
...,...,...,...,...,...
81,09_17_2019_1100Z.kml,09_17_2019_1100Z.zip,,09_17_2019_1100,2019-09-17 11:00:00
82,09_17_2019_1545Z.kml,09_17_2019_1545Z.zip,,09_17_2019_1545,2019-09-17 15:45:00
83,09_18_2019_1530Z.kml,09_18_2019_1530Z.zip,,09_18_2019_1530,2019-09-18 15:30:00
84,09_19_2019_1730Z.kml,09_19_2019_1730Z.zip,,09_19_2019_1730,2019-09-19 17:30:00


## Break out Year, Month, Date, Time in case needed

In [17]:
# Vectorised accessors instead of one Python lambda call per row.
stamps = pd.to_datetime(merged['datetime'])  # leaves an already-datetime column unchanged
merged['year'] = stamps.dt.year
merged['month'] = stamps.dt.month
merged['day'] = stamps.dt.day
merged['time'] = merged['ts'].str[-4:]  # HHMM taken from the key, kept as text ("0000")

## Prefix paths with the base URL

In [18]:
# Base URL under which the soundings files are served.
SOUNDINGS_URL = 'http://localhost/data/Soundings/'


def _as_url(name):
    """Return the full URL for a file or directory name.

    Missing values (anything that is not a non-empty string) become "".
    A value that already starts with the base URL is returned unchanged, so
    re-running this cell neither double-prefixes URLs nor turns the ""
    placeholders into a bare base URL.
    """
    if not isinstance(name, str) or not name:
        return ""
    if name.startswith(SOUNDINGS_URL):
        return name
    return SOUNDINGS_URL + name


for col in ['kml_file', 'raw_file', 'graphs_dir']:
    merged[col] = merged[col].apply(_as_url)

merged

Unnamed: 0,kml_file,raw_file,graphs_dir,ts,datetime,year,month,day,time
0,http://localhost/data/Soundings/07_14_2017.kml,http://localhost/data/Soundings/07_14_2017.zip,,07_14_2017_0000,2017-07-14 00:00:00,2017,7,14,0000
1,http://localhost/data/Soundings/07_16_2017.kml,http://localhost/data/Soundings/07_16_2017.zip,http://localhost/data/Soundings/07_16_2017_Graphs,07_16_2017_0000,2017-07-16 00:00:00,2017,7,16,0000
2,,http://localhost/data/Soundings/07_18_2017.zip,http://localhost/data/Soundings/07_18_2017_Graphs,07_18_2017_0000,2017-07-18 00:00:00,2017,7,18,0000
3,http://localhost/data/Soundings/07_24_2017.kml,http://localhost/data/Soundings/07_24_2017.zip,http://localhost/data/Soundings/07_24_2017_Graphs,07_24_2017_0000,2017-07-24 00:00:00,2017,7,24,0000
4,http://localhost/data/Soundings/07_25_2017.kml,http://localhost/data/Soundings/07_25_2017.zip,http://localhost/data/Soundings/07_25_2017_Graphs,07_25_2017_0000,2017-07-25 00:00:00,2017,7,25,0000
...,...,...,...,...,...,...,...,...,...
81,http://localhost/data/Soundings/09_17_2019_110...,http://localhost/data/Soundings/09_17_2019_110...,,09_17_2019_1100,2019-09-17 11:00:00,2019,9,17,1100
82,http://localhost/data/Soundings/09_17_2019_154...,http://localhost/data/Soundings/09_17_2019_154...,,09_17_2019_1545,2019-09-17 15:45:00,2019,9,17,1545
83,http://localhost/data/Soundings/09_18_2019_153...,http://localhost/data/Soundings/09_18_2019_153...,,09_18_2019_1530,2019-09-18 15:30:00,2019,9,18,1530
84,http://localhost/data/Soundings/09_19_2019_173...,http://localhost/data/Soundings/09_19_2019_173...,,09_19_2019_1730,2019-09-19 17:30:00,2019,9,19,1730


# Write CSV

In [19]:
# Write the merged table to soundings.csv under pu.data_root, without the row-index column
csv_path = f"{pu.data_root}/soundings.csv"
merged.to_csv(csv_path, index=False)
print("CSV saved!")

CSV saved!
