In [1]:
# Dependencies

import pandas as pd
from pathlib import Path

from calendar import monthrange

In [2]:
# Relative path to the Chicago weather CSV export (file name spelling kept as on disk)

weather_filepath = Path('../data/chicago_weather_data_2020_2023_Celcius.csv')

# Load the CSV into a DataFrame, then preview its first five rows

weather_data = pd.read_csv(filepath_or_buffer=weather_filepath)

weather_data.head(n=5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1577836800,2020-01-01 00:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.08,10000.0,-5.63,-9.08,...,0.0,,,,,95,804,Clouds,overcast clouds,04n
1,1577840400,2020-01-01 01:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-1.99,10000.0,-5.23,-8.99,...,0.0,,,,,75,803,Clouds,broken clouds,04n
2,1577844000,2020-01-01 02:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-1.87,10000.0,-4.96,-8.87,...,0.0,,,,,100,804,Clouds,overcast clouds,04n
3,1577847600,2020-01-01 03:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-1.9,10000.0,-4.55,-8.9,...,0.0,,,,,40,802,Clouds,scattered clouds,03n
4,1577851200,2020-01-01 04:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.18,10000.0,-4.67,-9.18,...,0.0,,,,,75,803,Clouds,broken clouds,04n


In [6]:
# Data Quality Testing
#
# One display call renders four summaries of the raw frame, in this order:
#   1. non-null value count per column
#   2. dtype pandas inferred for each column
#   3. missing-value count per column
#   4. tally of rows flagged as duplicates (True) versus not flagged (False)
#
# NOTE(review): duplicated() compares entire rows, so hours that appear more
# than once with different weather_id / weather_main values are not flagged.

display(
    weather_data.count(),
    weather_data.dtypes,
    weather_data.isna().sum(),
    weather_data.duplicated().value_counts(),
)

dt                     37243
dt_iso                 37243
timezone               37243
city_name              37243
lat                    37243
lon                    37243
temp                   37243
visibility             33916
dew_point              37243
feels_like             37243
temp_min               37243
temp_max               37243
pressure               37243
sea_level                  0
grnd_level                 0
humidity               37243
wind_speed             37243
wind_deg               37243
wind_gust              24627
rain_1h                 6344
rain_3h                    0
snow_1h                 1289
snow_3h                    0
clouds_all             37243
weather_id             37243
weather_main           37243
weather_description    37243
weather_icon           37243
dtype: int64

dt                       int64
dt_iso                  object
timezone                 int64
city_name               object
lat                    float64
lon                    float64
temp                   float64
visibility             float64
dew_point              float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
wind_gust              float64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

dt                         0
dt_iso                     0
timezone                   0
city_name                  0
lat                        0
lon                        0
temp                       0
visibility              3327
dew_point                  0
feels_like                 0
temp_min                   0
temp_max                   0
pressure                   0
sea_level              37243
grnd_level             37243
humidity                   0
wind_speed                 0
wind_deg                   0
wind_gust              12616
rain_1h                30899
rain_3h                37243
snow_1h                35954
snow_3h                37243
clouds_all                 0
weather_id                 0
weather_main               0
weather_description        0
weather_icon               0
dtype: int64

False    37243
Name: count, dtype: int64

In [10]:
# Build a narrower copy of the raw frame by dropping columns not used further.
# NOTE(review): despite its name, weather_data_sorted is not sorted here;
# only columns are removed and the row order of weather_data is kept.

weather_data_sorted = weather_data.drop(
    labels=[
        # identifier / location metadata (dt_iso is kept as the timestamp column)
        'dt', 'timezone', 'city_name',
        # partially populated measurements (nulls reported by the quality check)
        'visibility', 'wind_gust', 'rain_1h', 'snow_1h',
        # columns with no non-null values at all in this dataset
        'sea_level', 'grnd_level', 'rain_3h', 'snow_3h',
        # icon code for the weather condition
        'weather_icon',
    ],
    axis='columns',
)

weather_data_sorted


Unnamed: 0,dt_iso,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,weather_description
0,2020-01-01 00:00:00 +0000 UTC,41.878114,-87.629798,-2.08,-5.63,-9.08,-2.31,-1.56,1010,74,12.90,260,95,804,Clouds,overcast clouds
1,2020-01-01 01:00:00 +0000 UTC,41.878114,-87.629798,-1.99,-5.23,-8.99,-2.23,-1.54,1010,76,10.30,270,75,803,Clouds,broken clouds
2,2020-01-01 02:00:00 +0000 UTC,41.878114,-87.629798,-1.87,-4.96,-8.87,-2.23,-1.36,1011,77,9.80,270,100,804,Clouds,overcast clouds
3,2020-01-01 03:00:00 +0000 UTC,41.878114,-87.629798,-1.90,-4.55,-8.90,-2.79,-1.14,1011,80,9.30,270,40,802,Clouds,scattered clouds
4,2020-01-01 04:00:00 +0000 UTC,41.878114,-87.629798,-2.18,-4.67,-9.18,-2.78,-1.36,1011,81,8.80,260,75,803,Clouds,broken clouds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37238,2023-12-31 22:00:00 +0000 UTC,41.878114,-87.629798,0.77,-1.15,-4.38,-0.03,2.22,1016,86,5.81,287,100,600,Snow,light snow
37239,2023-12-31 22:00:00 +0000 UTC,41.878114,-87.629798,0.77,-1.15,-4.38,-0.03,2.22,1016,86,5.81,287,100,701,Mist,mist
37240,2023-12-31 22:00:00 +0000 UTC,41.878114,-87.629798,0.77,-1.15,-4.38,-0.03,2.22,1016,86,5.81,287,100,500,Rain,light rain
37241,2023-12-31 23:00:00 +0000 UTC,41.878114,-87.629798,0.43,-1.17,-4.81,-0.03,1.66,1017,88,5.81,311,100,600,Snow,light snow
