In [1]:
import os
# Work from the project root so that the relative paths used further down
# (e.g. "./data/ozone_data.txt") resolve. The location can be overridden by setting the
# UV_EXPOSURE_ROOT environment variable; when it is unset the original hard-coded
# directory is used, so existing behaviour is unchanged.
os.chdir(os.environ.get('UV_EXPOSURE_ROOT', '/Users/tomharrison/Documents/Projects/UV_exposure'))

In [2]:
import pandas as pd
import numpy as np

from geopy.geocoders import Nominatim
from utils.UV_exposure import *

In [3]:
# Look up the coordinates of the postcode 'CF10 4AG' with the Nominatim geocoder.
geolocator = Nominatim(timeout=10, user_agent="PDS")
location = geolocator.geocode('CF10 4AG')

# geocode() returns None when the query has no match; fail with a clear message here
# rather than with an AttributeError on `.latitude` below.
if location is None:
    raise ValueError("Nominatim returned no result for postcode 'CF10 4AG'")

lat, long = location.latitude, location.longitude

In [4]:
# First look at the raw file, read with pandas' default settings. It is not in a very useful
# format yet, so the next load will skip the first three rows (which removes the second column
# and the descriptions) and give the remaining column the title "raw_values".
df_ozone = pd.read_csv(filepath_or_buffer="./data/ozone_data.txt")
df_ozone.head()

Unnamed: 0,Day: 280 Oct 7,2022 OMPS TO3 STD OZONE GEN:22:283 Asc LECT: 01:25 PM
0,Longitudes: 360 bins centered on 179.5 W t...,
1,Latitudes : 180 bins centered on 89.5 S t...,
2,116116116116116116116116116116116116116116116...,
3,116116116116116116116116116116116115115115115...,
4,115115115115115115115115115115115115115115115...,


In [5]:
# Re-read the file, skipping the first three rows and naming the single column "raw_values".
df_ozone = pd.read_csv(
    "./data/ozone_data.txt",
    names=["raw_values"],
    skiprows=3,
)
df_ozone.head(n=15)

Unnamed: 0,raw_values
0,116116116116116116116116116116116116116116116...
1,116116116116116116116116116116116115115115115...
2,115115115115115115115115115115115115115115115...
3,114114114114114114114114114114114114114114114...
4,114114114114114114114114114114114114114114114...
5,114114114114114114114114114114114114114114114...
6,114114114114114114114114114114114113113113113...
7,113113113113113113113113113113113113113113113...
8,113113113113113113113113113113113113113113113...
9,114114114114114114114114114114114114114115115...


In [6]:
# Show the 20 rows around the middle of the frame (10 before the midpoint, 10 from it onwards).
df_ozone.iloc[len(df_ozone) // 2 - 10 : len(df_ozone) // 2 + 10]

Unnamed: 0,raw_values
1340,284287290291294295296295292293296296296296294...
1341,293293292292294294295295293292294296296295293...
1342,292293293293286277278276280273270275273277273...
1343,282277273276280289289289285284287283284281282...
1344,287286287287288288287285284282281281283286286...
1345,268266265267269269269268268269268270270271271...
1346,269267267266263261263264264262265263262265268...
1347,272272272273274273270270265268269270266269269...
1348,276276275275274274274273271270271272273274274...
1349,275275275274275275274273272272 lat = -0.5


In [7]:
# The file holds 360 readings for every latitude, one per degree of longitude from 179.5 W to
# 179.5 E, and the readings of each row appear to have been "squashed" into one continuous string.
# Each value appears to be 3 digits long, so the rows will be cut into 3-digit values. Here we
# build a dataframe with the correct latitude and longitude ranges; it is filled with the
# 3-digit values later on.

df = pd.DataFrame({
    # every latitude is repeated once per longitude bin
    'Latitude': np.linspace(-89.5, 89.5, 180).repeat(360),
    # the full sweep of longitudes is repeated once per latitude
    'Longitude': np.tile(np.linspace(-179.5, 179.5, 360), 180),
})

# Check the set-up at the crossover where the latitude steps up by 1 degree and the longitude
# resets to -179.5 before incrementing by 1 degree again.

df.iloc[355:365]

Unnamed: 0,Latitude,Longitude
355,-89.5,175.5
356,-89.5,176.5
357,-89.5,177.5
358,-89.5,178.5
359,-89.5,179.5
360,-88.5,-179.5
361,-88.5,-178.5
362,-88.5,-177.5
363,-88.5,-176.5
364,-88.5,-175.5


In [8]:
# Next step: cut each row into 3-digit values. Rows that also carry the latitude label (see
# above) need care, so those lines get trimmed before splitting.

# Look at one such row first. There is whitespace at the very start of the line. Whitespace can
# also occur inside a row where the ozone value is 0, so only the space at the beginning of
# each line will be removed.

df_ozone.at[149, 'raw_values']

' 141141141141140142140139137136   lat =  -80.5'

In [9]:
# Drop the first character of every row, so the same line now has no leading whitespace.
# NOTE: this cell is not idempotent - every execution removes one more character, so run it
# once per load of the file.
df_ozone['raw_values'] = df_ozone['raw_values'].str.slice(start=1)
df_ozone.at[149, 'raw_values']

'141141141141140142140139137136   lat =  -80.5'

In [10]:
# Cut every row at the word 'lat', keeping only the text that precedes it.

# TODO: do we know that 'lat' always features at the end of a line?

df_ozone['raw_values'] = df_ozone['raw_values'].str.split('lat', n=1).str.get(0)
df_ozone.at[2696, 'raw_values']

'  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0'

In [11]:
# Remove all trailing whitespace. Some rows still end in spaces, e.g. the padding that sat in
# front of the removed 'lat' label. Take a snapshot of those rows as they look before stripping
# so that we can see what we're dealing with.
test = df_ozone[df_ozone['raw_values'].str.endswith(' ')]

# Strip spaces from the right-hand end only: spaces elsewhere in a row are part of the
# fixed-width values (e.g. '  0') and must be kept.
df_ozone['raw_values'] = df_ozone['raw_values'].str.rstrip(' ')

# Display a sample of the affected rows (pre-strip). This is the cell's final expression so it
# is actually rendered, and it cannot fail when a particular row label is absent from `test`
# (for instance when the cell is run again after the stripping has already happened).
test.head()

In [12]:
# Confirm that no row mentions "lat" any more: the selection below should come back empty.
# (Only "lat" is checked here; whitespace is not.)
df_ozone.loc[df_ozone['raw_values'].str.contains('lat')]

Unnamed: 0,raw_values


In [13]:
# Every ozone reading occupies a fixed-width field of 3 characters, so each cleaned row must be
# a whole number of fields long. Check that before splitting: a row of any other length would
# produce a truncated trailing value that is silently accepted.
bad_rows = df_ozone.index[df_ozone['raw_values'].str.len() % 3 != 0].tolist()
if bad_rows:
    raise ValueError(
        "Rows whose length is not a multiple of 3 characters (first few labels): {}".format(bad_rows[:10])
    )

# Separate the row entries into 3-character values using a nested list comprehension
# (one inner list per row) ...
two_d_list = [[row[n:(n+3)] for n in range(0, len(row), 3)] for row in df_ozone['raw_values']]
# ... then flatten the per-row lists into a single list, preserving row order.
flatten_list = [ii for item in two_d_list for ii in item]

In [14]:
# Attach the flattened values to the latitude/longitude grid as a new column.
# The values are stored exactly as split from the text (no type conversion happens here).
df = df.assign(ozone_dobson_value=flatten_list)
df

Unnamed: 0,Latitude,Longitude,ozone_dobson_value
0,-89.5,-179.5,116
1,-89.5,-178.5,116
2,-89.5,-177.5,116
3,-89.5,-176.5,116
4,-89.5,-175.5,116
...,...,...,...
64795,89.5,175.5,0
64796,89.5,176.5,0
64797,89.5,177.5,0
64798,89.5,178.5,0
