In [107]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import dateutil

with open('cities.json') as f:
    cities = json.load(f)

data = []

for city in cities:
    response = requests.get(city['NWS_URL'])
    soup = BeautifulSoup(response.content, 'html.parser')

    location = soup.find('h2', {'class': 'panel-title'})
    lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
    lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)
    temperature = soup.find('p', {'class': 'myforecast-current-lrg'})
    humidity_elem = soup.find('td', text='Humidity')
    humidity = humidity_elem.find_next('td').text.strip() if humidity_elem else 'NA'
    wind_speed_elem = soup.find('td', text='Wind Speed')
    wind_speed = wind_speed_elem.find_next('td').text.strip() if wind_speed_elem else 'NA'
    barometer_elem = soup.find('td', text='Barometer')
    barometer = barometer_elem.find_next('td').text.strip() if barometer_elem else 'NA'
    dewpoint_elem = soup.find('td', text='Dewpoint')
    dewpoint = dewpoint_elem.find_next('td').text.strip() if dewpoint_elem else 'NA'
    visibility_elem = soup.find('td', text='Visibility')
    visibility = visibility_elem.find_next('td').text.strip() if visibility_elem else 'NA'
    wind_chill_elem = soup.find('td', text='Wind Chill')
    wind_chill = wind_chill_elem.find_next('td').text.strip() if wind_chill_elem else 'NA'
    last_update_elem = soup.find('td', text='Last update')
    last_update = last_update_elem.find_next('td').text.strip() if last_update_elem else 'NA'

    data.append({
        'location': city['Name'],
        'lat': lat,
        'lon': lon,
        'elev_ft': elev,
        'temperature': temperature.text if temperature else 'NA',
        'humidity': humidity,
        'wind_speed': wind_speed,
        'barometer': barometer,
        'dewpoint': dewpoint,
        'vis_miles': visibility,
        'wind_chill': wind_chill,
        'last_update': last_update
    })

df = pd.DataFrame(data)
print(df)




        location       lat        lon elev_ft temperature humidity  \
0   Portland, OR  45.59578  122.60917      20        36°F      86%   
1  San Diego, CA  32.73361  117.18306      13        57°F      67%   
2     Duluth, MN     46.72      92.04     607        34°F      93%   

   wind_speed              barometer    dewpoint vis_miles   wind_chill  \
0     N 0 MPH  30.05 in (1017.61 mb)  32°F (0°C)  10.00 mi           NA   
1  WSW 16 MPH   30.0 in (1015.92 mb)  46°F (8°C)  10.00 mi           NA   
2   NE 10 mph               29.46 in  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update  
0  14 Feb 08:20 AM PST  
1  14 Feb 08:20 AM PST  
2  14 Feb 11:55 am CST  


In [108]:
df.dtypes

location       object
lat            object
lon            object
elev_ft        object
temperature    object
humidity       object
wind_speed     object
barometer      object
dewpoint       object
vis_miles      object
wind_chill     object
last_update    object
dtype: object

In [109]:
# Split the 'location' column into separate 'city' and 'state' columns
df[['city', 'state']] = df['location'].str.split(', ', expand=True)

print(df)

        location       lat        lon elev_ft temperature humidity  \
0   Portland, OR  45.59578  122.60917      20        36°F      86%   
1  San Diego, CA  32.73361  117.18306      13        57°F      67%   
2     Duluth, MN     46.72      92.04     607        34°F      93%   

   wind_speed              barometer    dewpoint vis_miles   wind_chill  \
0     N 0 MPH  30.05 in (1017.61 mb)  32°F (0°C)  10.00 mi           NA   
1  WSW 16 MPH   30.0 in (1015.92 mb)  46°F (8°C)  10.00 mi           NA   
2   NE 10 mph               29.46 in  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  
0  14 Feb 08:20 AM PST   Portland    OR  
1  14 Feb 08:20 AM PST  San Diego    CA  
2  14 Feb 11:55 am CST     Duluth    MN  


In [110]:
# Convert 'lat' and 'lon' columns to float type
df[['lat', 'lon']] = df[['lat', 'lon']].astype(float)

# Convert 'elev' column to int type
df['elev_ft'] = df['elev_ft'].astype(int)
print(df)

        location       lat        lon  elev_ft temperature humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      86%   
1  San Diego, CA  32.73361  117.18306       13        57°F      67%   
2     Duluth, MN  46.72000   92.04000      607        34°F      93%   

   wind_speed              barometer    dewpoint vis_miles   wind_chill  \
0     N 0 MPH  30.05 in (1017.61 mb)  32°F (0°C)  10.00 mi           NA   
1  WSW 16 MPH   30.0 in (1015.92 mb)  46°F (8°C)  10.00 mi           NA   
2   NE 10 mph               29.46 in  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  
0  14 Feb 08:20 AM PST   Portland    OR  
1  14 Feb 08:20 AM PST  San Diego    CA  
2  14 Feb 11:55 am CST     Duluth    MN  


In [111]:
# Extract the numeric part of the temperature string and convert it to int
df['temp_f'] = df['temperature'].str.extract('(\d+)').astype(int)

# Convert temperature to Celsius and add to new column 'temp_c'
df['temp_c'] = (df['temp_f'] - 32) * 5/9

# Round 'temp_c' to nearest integer and cast to int type
df['temp_c'] = df['temp_c'].round().astype(int)
print(df)

        location       lat        lon  elev_ft temperature humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      86%   
1  San Diego, CA  32.73361  117.18306       13        57°F      67%   
2     Duluth, MN  46.72000   92.04000      607        34°F      93%   

   wind_speed              barometer    dewpoint vis_miles   wind_chill  \
0     N 0 MPH  30.05 in (1017.61 mb)  32°F (0°C)  10.00 mi           NA   
1  WSW 16 MPH   30.0 in (1015.92 mb)  46°F (8°C)  10.00 mi           NA   
2   NE 10 mph               29.46 in  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  
0  14 Feb 08:20 AM PST   Portland    OR      36       2  
1  14 Feb 08:20 AM PST  San Diego    CA      57      14  
2  14 Feb 11:55 am CST     Duluth    MN      34       1  


In [112]:
# Convert 'humidity' column to float type
df['humidity'] = df['humidity'].str.extract('(\d+)', expand=False).astype(float) / 100

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed              barometer    dewpoint vis_miles   wind_chill  \
0     N 0 MPH  30.05 in (1017.61 mb)  32°F (0°C)  10.00 mi           NA   
1  WSW 16 MPH   30.0 in (1015.92 mb)  46°F (8°C)  10.00 mi           NA   
2   NE 10 mph               29.46 in  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  
0  14 Feb 08:20 AM PST   Portland    OR      36       2  
1  14 Feb 08:20 AM PST  San Diego    CA      57      14  
2  14 Feb 11:55 am CST     Duluth    MN      34       1  


In [113]:
# Split wind speed values into components and convert speed to int type
df['wind_speed'] = df['wind_speed'].str.extract('(\d+)', expand=False).fillna(0).astype(int)

# Set any missing or non-numeric wind speed values to 0
df['wind_speed'] = df['wind_speed'].replace('Calm', 0)

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed              barometer    dewpoint vis_miles   wind_chill  \
0           0  30.05 in (1017.61 mb)  32°F (0°C)  10.00 mi           NA   
1          16   30.0 in (1015.92 mb)  46°F (8°C)  10.00 mi           NA   
2          10               29.46 in  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  
0  14 Feb 08:20 AM PST   Portland    OR      36       2  
1  14 Feb 08:20 AM PST  San Diego    CA      57      14  
2  14 Feb 11:55 am CST     Duluth    MN      34       1  


In [114]:
# Convert 'barometer' column to float type, and convert inches to millibars
df['barometer'] = df['barometer'].apply(lambda x: float(x.split()[0]) * 33.8639 if 'in' in x else float(x.split()[0]))

# Round 'barometer' to two decimal places
df['barometer'] = df['barometer'].round(2)

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed  barometer    dewpoint vis_miles   wind_chill  \
0           0    1017.61  32°F (0°C)  10.00 mi           NA   
1          16    1015.92  46°F (8°C)  10.00 mi           NA   
2          10     997.63  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  
0  14 Feb 08:20 AM PST   Portland    OR      36       2  
1  14 Feb 08:20 AM PST  San Diego    CA      57      14  
2  14 Feb 11:55 am CST     Duluth    MN      34       1  


In [115]:
# Split 'dewpoint' column into separate 'dewpoint_f' and 'dewpoint_c' columns
df[['dewpoint_f', 'dewpoint_c']] = df['dewpoint'].str.extract('(\d+).*?(\d+)', expand=True).astype(int)

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed  barometer    dewpoint vis_miles   wind_chill  \
0           0    1017.61  32°F (0°C)  10.00 mi           NA   
1          16    1015.92  46°F (8°C)  10.00 mi           NA   
2          10     997.63  32°F (0°C)  10.00 mi  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  dewpoint_f  \
0  14 Feb 08:20 AM PST   Portland    OR      36       2          32   
1  14 Feb 08:20 AM PST  San Diego    CA      57      14          46   
2  14 Feb 11:55 am CST     Duluth    MN      34       1          32   

   dewpoint_c  
0           0  
1           8  
2           0  


In [116]:
# Convert 'vis_miles' column to float type
df['vis_miles'] = df['vis_miles'].str.extract('(\d+\.\d+|\d+)', expand=False).astype(float).round(2)

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed  barometer    dewpoint  vis_miles   wind_chill  \
0           0    1017.61  32°F (0°C)       10.0           NA   
1          16    1015.92  46°F (8°C)       10.0           NA   
2          10     997.63  32°F (0°C)       10.0  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  dewpoint_f  \
0  14 Feb 08:20 AM PST   Portland    OR      36       2          32   
1  14 Feb 08:20 AM PST  San Diego    CA      57      14          46   
2  14 Feb 11:55 am CST     Duluth    MN      34       1          32   

   dewpoint_c  
0           0  
1           8  
2           0  


In [117]:
# Split 'wind_chill' column into separate 'wind_chill_f' and 'wind_chill_c' columns
df[['wind_chill_f', 'wind_chill_c']] = df['wind_chill'].str.extract('(\d+).*?(\d+)', expand=True).astype(float)

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed  barometer    dewpoint  vis_miles   wind_chill  \
0           0    1017.61  32°F (0°C)       10.0           NA   
1          16    1015.92  46°F (8°C)       10.0           NA   
2          10     997.63  32°F (0°C)       10.0  26°F (-3°C)   

           last_update       city state  temp_f  temp_c  dewpoint_f  \
0  14 Feb 08:20 AM PST   Portland    OR      36       2          32   
1  14 Feb 08:20 AM PST  San Diego    CA      57      14          46   
2  14 Feb 11:55 am CST     Duluth    MN      34       1          32   

   dewpoint_c  wind_chill_f  wind_chill_c  
0           0           NaN           NaN  
1           8           NaN           NaN  
2           0          26.0          

In [118]:
# Convert 'last_update' column to datetime type with the desired format and time zone
df['last_update'] = df['last_update'].apply(lambda x: dateutil.parser.parse(x, tzinfos={'CST': dateutil.tz.tzoffset(None, -21600)}))

# Convert 'last_update' column to UTC
df['last_update'] = df['last_update'].apply(lambda x: x.astimezone(dateutil.tz.tzutc()))

print(df)

        location       lat        lon  elev_ft temperature  humidity  \
0   Portland, OR  45.59578  122.60917       20        36°F      0.86   
1  San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2     Duluth, MN  46.72000   92.04000      607        34°F      0.93   

   wind_speed  barometer    dewpoint  vis_miles   wind_chill  \
0           0    1017.61  32°F (0°C)       10.0           NA   
1          16    1015.92  46°F (8°C)       10.0           NA   
2          10     997.63  32°F (0°C)       10.0  26°F (-3°C)   

                last_update       city state  temp_f  temp_c  dewpoint_f  \
0 2023-02-14 16:20:00+00:00   Portland    OR      36       2          32   
1 2023-02-14 16:20:00+00:00  San Diego    CA      57      14          46   
2 2023-02-14 17:55:00+00:00     Duluth    MN      34       1          32   

   dewpoint_c  wind_chill_f  wind_chill_c  
0           0           NaN           NaN  
1           8           NaN           NaN  
2           0    

In [119]:
# Drop columns that were split into two values
df = df.drop(['temperature', 'dewpoint', 'wind_chill'], axis=1)

print(df)

        location       lat        lon  elev_ft  humidity  wind_speed  \
0   Portland, OR  45.59578  122.60917       20      0.86           0   
1  San Diego, CA  32.73361  117.18306       13      0.67          16   
2     Duluth, MN  46.72000   92.04000      607      0.93          10   

   barometer  vis_miles               last_update       city state  temp_f  \
0    1017.61       10.0 2023-02-14 16:20:00+00:00   Portland    OR      36   
1    1015.92       10.0 2023-02-14 16:20:00+00:00  San Diego    CA      57   
2     997.63       10.0 2023-02-14 17:55:00+00:00     Duluth    MN      34   

   temp_c  dewpoint_f  dewpoint_c  wind_chill_f  wind_chill_c  
0       2          32           0           NaN           NaN  
1      14          46           8           NaN           NaN  
2       1          32           0          26.0           3.0  


In [123]:
# Move 'city' and 'state' columns after 'location' column
city = df.pop('city')
state = df.pop('state')
temp_f = df.pop('temp_f')
temp_c = df.pop('temp_c')
df.insert(1, 'city', city)
df.insert(2, 'state', state)
df.insert(6, 'temp_f', temp_f)
df.insert(7, 'temp_c', temp_c)

# Move 'last_update' column to last position
last_update = df.pop('last_update')
df['last_update'] = last_update


print(df)

        location       city state       lat        lon  elev_ft  temp_f  \
0   Portland, OR   Portland    OR  45.59578  122.60917       20      36   
1  San Diego, CA  San Diego    CA  32.73361  117.18306       13      57   
2     Duluth, MN     Duluth    MN  46.72000   92.04000      607      34   

   temp_c  humidity  wind_speed  barometer  vis_miles  dewpoint_f  dewpoint_c  \
0       2      0.86           0    1017.61       10.0          32           0   
1      14      0.67          16    1015.92       10.0          46           8   
2       1      0.93          10     997.63       10.0          32           0   

   wind_chill_f  wind_chill_c               last_update  
0           NaN           NaN 2023-02-14 16:20:00+00:00  
1           NaN           NaN 2023-02-14 16:20:00+00:00  
2          26.0           3.0 2023-02-14 17:55:00+00:00  
