In [125]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import dateutil

with open('cities.json') as f:
    cities = json.load(f)

data = []

for city in cities:
    response = requests.get(city['NWS_URL'])
    soup = BeautifulSoup(response.content, 'html.parser')

    location = soup.find('h2', {'class': 'panel-title'})
    lat_lon_elev = soup.find('span', {'class': 'smallTxt'}).text.strip()
    lat, lon, elev = re.findall(r'[-+]?\d*\.\d+|\d+', lat_lon_elev)
    temperature = soup.find('p', {'class': 'myforecast-current-lrg'})
    humidity_elem = soup.find('td', text='Humidity')
    humidity = humidity_elem.find_next('td').text.strip() if humidity_elem else 'NA'
    wind_speed_elem = soup.find('td', text='Wind Speed')
    wind_speed = wind_speed_elem.find_next('td').text.strip() if wind_speed_elem else 'NA'
    barometer_elem = soup.find('td', text='Barometer')
    barometer = barometer_elem.find_next('td').text.strip() if barometer_elem else 'NA'
    dewpoint_elem = soup.find('td', text='Dewpoint')
    dewpoint = dewpoint_elem.find_next('td').text.strip() if dewpoint_elem else 'NA'
    visibility_elem = soup.find('td', text='Visibility')
    visibility = visibility_elem.find_next('td').text.strip() if visibility_elem else 'NA'
    wind_chill_elem = soup.find('td', text='Wind Chill')
    wind_chill = wind_chill_elem.find_next('td').text.strip() if wind_chill_elem else 'NA'
    last_update_elem = soup.find('td', text='Last update')
    last_update = last_update_elem.find_next('td').text.strip() if last_update_elem else 'NA'

    data.append({
        'location': city['Name'],
        'lat': lat,
        'lon': lon,
        'elev_ft': elev,
        'temperature': temperature.text if temperature else 'NA',
        'humidity': humidity,
        'wind_speed': wind_speed,
        'barometer': barometer,
        'dewpoint': dewpoint,
        'vis_miles': visibility,
        'wind_chill': wind_chill,
        'last_update': last_update
    })

df = pd.DataFrame(data)
print(df)




              location       lat        lon elev_ft temperature humidity  \
0         Portland, OR  45.59578  122.60917      20        36°F      86%   
1        San Diego, CA  32.73361  117.18306      13        57°F      67%   
2           Duluth, MN     46.72      92.04     607        34°F      93%   
3      Minneapolis, MN     44.88      93.23     840        37°F      86%   
4   Salt Lake City, UT  40.77069  111.96503    4226        30°F      54%   
5           Denver, CO     39.71     104.76    5577        40°F      43%   
6    San Francisco, CA  37.77056  122.42694     150        44°F      44%   
7    New York City, NY     40.78      73.97     154        50°F      30%   
8         Portland, ME     43.64       70.3      72        43°F      53%   
9          Seattle, WA  47.54548   122.3147      20        36°F      80%   
10       Baltimore, MD     39.28      76.62      20        57°F      28%   

       wind_speed              barometer     dewpoint vis_miles   wind_chill  \
0      

In [126]:
df.dtypes

location       object
lat            object
lon            object
elev_ft        object
temperature    object
humidity       object
wind_speed     object
barometer      object
dewpoint       object
vis_miles      object
wind_chill     object
last_update    object
dtype: object

In [127]:
# Split the 'location' column into separate 'city' and 'state' columns
df[['city', 'state']] = df['location'].str.split(', ', expand=True)

print(df)

              location       lat        lon elev_ft temperature humidity  \
0         Portland, OR  45.59578  122.60917      20        36°F      86%   
1        San Diego, CA  32.73361  117.18306      13        57°F      67%   
2           Duluth, MN     46.72      92.04     607        34°F      93%   
3      Minneapolis, MN     44.88      93.23     840        37°F      86%   
4   Salt Lake City, UT  40.77069  111.96503    4226        30°F      54%   
5           Denver, CO     39.71     104.76    5577        40°F      43%   
6    San Francisco, CA  37.77056  122.42694     150        44°F      44%   
7    New York City, NY     40.78      73.97     154        50°F      30%   
8         Portland, ME     43.64       70.3      72        43°F      53%   
9          Seattle, WA  47.54548   122.3147      20        36°F      80%   
10       Baltimore, MD     39.28      76.62      20        57°F      28%   

       wind_speed              barometer     dewpoint vis_miles   wind_chill  \
0      

In [128]:
# Convert 'lat' and 'lon' columns to float type
df[['lat', 'lon']] = df[['lat', 'lon']].astype(float)

# Convert 'elev' column to int type
df['elev_ft'] = df['elev_ft'].astype(int)
print(df)

              location       lat        lon  elev_ft temperature humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      86%   
1        San Diego, CA  32.73361  117.18306       13        57°F      67%   
2           Duluth, MN  46.72000   92.04000      607        34°F      93%   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      86%   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      54%   
5           Denver, CO  39.71000  104.76000     5577        40°F      43%   
6    San Francisco, CA  37.77056  122.42694      150        44°F      44%   
7    New York City, NY  40.78000   73.97000      154        50°F      30%   
8         Portland, ME  43.64000   70.30000       72        43°F      53%   
9          Seattle, WA  47.54548  122.31470       20        36°F      80%   
10       Baltimore, MD  39.28000   76.62000       20        57°F      28%   

       wind_speed              barometer     dewpoint vis_miles   wind_chil

In [129]:
# Extract the numeric part of the temperature string and convert it to int
df['temp_f'] = df['temperature'].str.extract('(\d+)').astype(int)

# Convert temperature to Celsius and add to new column 'temp_c'
df['temp_c'] = (df['temp_f'] - 32) * 5/9

# Round 'temp_c' to nearest integer and cast to int type
df['temp_c'] = df['temp_c'].round().astype(int)
print(df)

              location       lat        lon  elev_ft temperature humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      86%   
1        San Diego, CA  32.73361  117.18306       13        57°F      67%   
2           Duluth, MN  46.72000   92.04000      607        34°F      93%   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      86%   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      54%   
5           Denver, CO  39.71000  104.76000     5577        40°F      43%   
6    San Francisco, CA  37.77056  122.42694      150        44°F      44%   
7    New York City, NY  40.78000   73.97000      154        50°F      30%   
8         Portland, ME  43.64000   70.30000       72        43°F      53%   
9          Seattle, WA  47.54548  122.31470       20        36°F      80%   
10       Baltimore, MD  39.28000   76.62000       20        57°F      28%   

       wind_speed              barometer     dewpoint vis_miles   wind_chil

In [130]:
# Convert 'humidity' column to float type
df['humidity'] = df['humidity'].str.extract('(\d+)', expand=False).astype(float) / 100

print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

       wind_speed              barometer     dewpoint vis_miles

In [131]:
# Split wind speed values into components and convert speed to int type
df['wind_speed'] = df['wind_speed'].str.extract('(\d+)', expand=False).fillna(0).astype(int)

# Set any missing or non-numeric wind speed values to 0
df['wind_speed'] = df['wind_speed'].replace('Calm', 0)

print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

    wind_speed              barometer     dewpoint vis_miles   

In [135]:
# Convert 'barometer' column to float type, and convert inches to millibars
df['barometer'] = df['barometer'].apply(lambda x: float(x.split()[0]) * 33.8639 if 'in' in x and x != 'NA' else None)

# Round 'barometer' to two decimal places
df['barometer'] = df['barometer'].round(2)



print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

    wind_speed  barometer     dewpoint vis_miles   wind_chill  

In [136]:
# Split 'dewpoint' column into separate 'dewpoint_f' and 'dewpoint_c' columns
df[['dewpoint_f', 'dewpoint_c']] = df['dewpoint'].str.extract('(\d+).*?(\d+)', expand=True).astype(int)

print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

    wind_speed  barometer     dewpoint vis_miles   wind_chill  

In [137]:
# Convert 'vis_miles' column to float type
df['vis_miles'] = df['vis_miles'].str.extract('(\d+\.\d+|\d+)', expand=False).astype(float).round(2)

print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

    wind_speed  barometer     dewpoint  vis_miles   wind_chill 

In [138]:
# Split 'wind_chill' column into separate 'wind_chill_f' and 'wind_chill_c' columns
df[['wind_chill_f', 'wind_chill_c']] = df['wind_chill'].str.extract('(\d+).*?(\d+)', expand=True).astype(float)

print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

    wind_speed  barometer     dewpoint  vis_miles   wind_chill 

In [139]:
# Convert 'last_update' column to datetime type with the desired format and time zone
df['last_update'] = df['last_update'].apply(lambda x: dateutil.parser.parse(x, tzinfos={'CST': dateutil.tz.tzoffset(None, -21600)}))

# Convert 'last_update' column to UTC
df['last_update'] = df['last_update'].apply(lambda x: x.astimezone(dateutil.tz.tzutc()))

print(df)

              location       lat        lon  elev_ft temperature  humidity  \
0         Portland, OR  45.59578  122.60917       20        36°F      0.86   
1        San Diego, CA  32.73361  117.18306       13        57°F      0.67   
2           Duluth, MN  46.72000   92.04000      607        34°F      0.93   
3      Minneapolis, MN  44.88000   93.23000      840        37°F      0.86   
4   Salt Lake City, UT  40.77069  111.96503     4226        30°F      0.54   
5           Denver, CO  39.71000  104.76000     5577        40°F      0.43   
6    San Francisco, CA  37.77056  122.42694      150        44°F      0.44   
7    New York City, NY  40.78000   73.97000      154        50°F      0.30   
8         Portland, ME  43.64000   70.30000       72        43°F      0.53   
9          Seattle, WA  47.54548  122.31470       20        36°F      0.80   
10       Baltimore, MD  39.28000   76.62000       20        57°F      0.28   

    wind_speed  barometer     dewpoint  vis_miles   wind_chill 



In [140]:
# Drop columns that were split into two values
df = df.drop(['temperature', 'dewpoint', 'wind_chill'], axis=1)

print(df)

              location       lat        lon  elev_ft  humidity  wind_speed  \
0         Portland, OR  45.59578  122.60917       20      0.86           0   
1        San Diego, CA  32.73361  117.18306       13      0.67          16   
2           Duluth, MN  46.72000   92.04000      607      0.93           3   
3      Minneapolis, MN  44.88000   93.23000      840      0.86          13   
4   Salt Lake City, UT  40.77069  111.96503     4226      0.54           0   
5           Denver, CO  39.71000  104.76000     5577      0.43          14   
6    San Francisco, CA  37.77056  122.42694      150      0.44           0   
7    New York City, NY  40.78000   73.97000      154      0.30           3   
8         Portland, ME  43.64000   70.30000       72      0.53          20   
9          Seattle, WA  47.54548  122.31470       20      0.80           3   
10       Baltimore, MD  39.28000   76.62000       20      0.28           0   

    barometer  vis_miles               last_update            c

In [141]:
# Move 'city' and 'state' columns after 'location' column
city = df.pop('city')
state = df.pop('state')
temp_f = df.pop('temp_f')
temp_c = df.pop('temp_c')
df.insert(1, 'city', city)
df.insert(2, 'state', state)
df.insert(6, 'temp_f', temp_f)
df.insert(7, 'temp_c', temp_c)

# Move 'last_update' column to last position
last_update = df.pop('last_update')
df['last_update'] = last_update


print(df)

              location            city state       lat        lon  elev_ft  \
0         Portland, OR        Portland    OR  45.59578  122.60917       20   
1        San Diego, CA       San Diego    CA  32.73361  117.18306       13   
2           Duluth, MN          Duluth    MN  46.72000   92.04000      607   
3      Minneapolis, MN     Minneapolis    MN  44.88000   93.23000      840   
4   Salt Lake City, UT  Salt Lake City    UT  40.77069  111.96503     4226   
5           Denver, CO          Denver    CO  39.71000  104.76000     5577   
6    San Francisco, CA   San Francisco    CA  37.77056  122.42694      150   
7    New York City, NY   New York City    NY  40.78000   73.97000      154   
8         Portland, ME        Portland    ME  43.64000   70.30000       72   
9          Seattle, WA         Seattle    WA  47.54548  122.31470       20   
10       Baltimore, MD       Baltimore    MD  39.28000   76.62000       20   

    temp_f  temp_c  humidity  wind_speed  barometer  vis_miles 

In [142]:
df.dtypes

location                         object
city                             object
state                            object
lat                             float64
lon                             float64
elev_ft                           int64
temp_f                            int64
temp_c                            int64
humidity                        float64
wind_speed                        int64
barometer                       float64
vis_miles                       float64
dewpoint_f                        int64
dewpoint_c                        int64
wind_chill_f                    float64
wind_chill_c                    float64
last_update     datetime64[ns, tzutc()]
dtype: object