# Exploratory analysis of weather data

| Column | Description                               | Type     |
| ------ | ----------------------------------------- | -------- |
| date   | The date of the measurement               | Datetime |
| tavg   | The average air temperature in °C         | Float    |
| tmin   | The minimum air temperature in °C         | Float    |
| tmax   | The maximum air temperature in °C         | Float    |
| prcp   | The daily precipitation total in mm       | Float    |
| snow   | The snow depth in mm                      | Float    |
| wdir   | The average wind direction in degrees (°) | Float    |
| wspd   | The average wind speed in km/h            | Float    |
| wpgt   | The peak wind gust in km/h                | Float    |
| pres   | The average sea-level air pressure in hPa | Float    |
| tsun   | The daily sunshine total in minutes (min) | Float    |

In [1]:
# Relative path to the raw daily weather CSV that the next cell loads with pandas.
WEATHER_DATA_FILE = "./../data/weather_2021.01.01-2022.10.31.csv"

In [2]:
import pandas as pd

# Read the CSV and convert the `date` strings (YYYY-MM-DD) to datetimes in a single chain,
# then preview the first five rows.
df = pd.read_csv(WEATHER_DATA_FILE).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d")
)
df.head()

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2021-01-01,2.7,1.5,5.7,15.2,,335.0,15.5,31.0,1008.6,
1,2021-01-02,1.3,0.5,2.7,0.8,,336.0,24.7,44.0,1010.1,
2,2021-01-03,0.7,-0.3,1.6,0.0,,327.0,17.1,38.9,1012.1,
3,2021-01-04,0.0,-1.0,1.1,0.0,,329.0,10.6,30.0,1011.4,
4,2021-01-05,0.8,-0.9,2.1,0.0,,338.0,6.2,30.0,1012.1,


In [3]:
# Print the per-column dtype and non-null count to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669 entries, 0 to 668
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    669 non-null    datetime64[ns]
 1   tavg    669 non-null    float64       
 2   tmin    669 non-null    float64       
 3   tmax    669 non-null    float64       
 4   prcp    669 non-null    float64       
 5   snow    15 non-null     float64       
 6   wdir    667 non-null    float64       
 7   wspd    667 non-null    float64       
 8   wpgt    655 non-null    float64       
 9   pres    667 non-null    float64       
 10  tsun    0 non-null      float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 57.6 KB


In [16]:
# Build an interactive profiling report of the dataframe
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import warnings

# Silence the FutureWarning coming from seaborn.matrix and the UserWarning coming
# from ydata_profiling.model.missing; each entry is (warning category, module pattern).
_silenced_warnings = (
    (FutureWarning, "seaborn.matrix"),
    (UserWarning, "ydata_profiling.model.missing"),
)
for _category, _module in _silenced_warnings:
    warnings.filterwarnings("ignore", category=_category, module=_module)

profile = ProfileReport(df, title="Weather data")
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

#### Process `snow` column

In [5]:
# A missing snow depth is treated as "no snow": replace every NaN in `snow` with 0
snow_is_missing = df['snow'].isna()
df['snow'] = df['snow'].mask(snow_is_missing, 0)

#### Process `tsun` column

In [6]:
# Report how many `tsun` values are missing, then remove the column from the dataframe
tsun_missing_count = df['tsun'].isna().sum()
total_records = len(df)
print(f"`tsun` column contains {tsun_missing_count} missing values out of {total_records} records")
df.drop(columns=['tsun'], inplace=True)

`tsun` column contains 669 missing values out of 669 records


#### Process `date` column

In [7]:
# Summarise the `date` column (count, mean, min, quartiles, max) to check the covered period.
df['date'].describe()

count                    669
mean     2021-12-01 00:00:00
min      2021-01-01 00:00:00
25%      2021-06-17 00:00:00
50%      2021-12-01 00:00:00
75%      2022-05-17 00:00:00
max      2022-10-31 00:00:00
Name: date, dtype: object

#### Impute missing `wpgt` values

In [9]:
# List every record whose peak wind gust (`wpgt`) is missing
df.loc[df['wpgt'].isna()]

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres
86,2021-03-28,9.5,1.6,17.2,0.0,0.0,,,,
450,2022-03-27,13.6,5.5,21.2,0.0,0.0,,,,
517,2022-06-02,20.5,15.7,26.9,1.3,0.0,114.0,11.0,,1015.2
518,2022-06-03,23.6,17.0,30.7,0.0,0.0,149.0,11.4,,1013.8
519,2022-06-04,25.8,16.9,33.1,0.0,10.0,192.0,13.3,,1014.0
520,2022-06-05,21.8,17.4,31.3,9.1,10.0,326.0,11.5,,1015.9
521,2022-06-06,20.7,15.6,25.4,4.1,10.0,2.0,8.3,,1018.2
522,2022-06-07,20.0,15.1,24.6,0.2,0.0,356.0,10.4,,1017.4
523,2022-06-08,17.5,14.7,23.5,14.6,0.0,146.0,10.3,,1013.2
524,2022-06-09,16.8,14.4,21.2,15.7,0.0,335.0,17.2,,1018.3


In [10]:
# Drop the rows that have no wind speed information (labels 86 and 450 in this
# dataset): `wspd` is the only feature used to impute `wpgt` in the next cell.
# The labels are derived from the data rather than hard-coded, so the cell stays
# correct if the input file changes and can be re-run safely (a second run finds
# nothing to drop instead of raising KeyError on labels that are already gone).
indices_to_drop = df.index[df['wspd'].isna()].tolist()
df = df.drop(indices_to_drop)

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Impute missing peak wind gust (`wpgt`) values with a simple linear regression
# on the average wind speed (`wspd`).

# separate the records with and without an observed `wpgt`;
# training rows must also have a `wspd` value, since it is the only feature
df_missing = df[df['wpgt'].isnull()]
df_not_missing = df.dropna(subset=['wpgt', 'wspd'])

# prep features for regression
X_all = df_not_missing[['wspd']]
y_all = df_not_missing['wpgt']

# Split the data into training and testing sets (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

# fit simple regression model
model = LinearRegression()
model.fit(X_train, y_train)

# use model for prediction on the test set
predicted_values_test = model.predict(X_test)

# calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, predicted_values_test)
print(f"Mean Squared Error (MSE) on the test set: {mse}")

# Rows that can be imputed: `wpgt` is missing AND `wspd` is available.
# The same mask selects both the prediction inputs and the assignment target, so
# the number and order of predictions always match the rows being filled.
imputable = df['wpgt'].isnull() & df['wspd'].notnull()

if imputable.any():
    # use 'wspd' to predict missing values
    predicted_values = model.predict(df.loc[imputable, ['wspd']])

    # Fill in the missing values in 'wpgt'
    df.loc[imputable, 'wpgt'] = predicted_values
else:
    # Nothing left to impute (e.g. the cell was re-run); calling predict on zero
    # rows would raise, so skip it and leave an empty result instead.
    predicted_values = np.empty(0)


Mean Squared Error (MSE) on the test set: 64.06520887871775


In [12]:
print(f"Variance of `wpgt` is {np.var(df['wpgt'])}")
# The variance of `wpgt` is much larger than the test-set MSE, so the model appears to
# capture a significant portion of the variation in the target.
# NOTE(review): this variance is computed over the whole column after imputation (so it
# includes the predicted values), while the MSE was measured on the held-out split only;
# treat the comparison as a rough sanity check.

Variance of `wpgt` is 206.39397153550095


In [13]:
# Spot-check the rows with index labels 517 through 528, which include the records
# whose `wpgt` was missing before imputation.
first_label, last_label = 517, 528
selected_indices = list(range(first_label, last_label + 1))
df.loc[selected_indices]

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres
517,2022-06-02,20.5,15.7,26.9,1.3,0.0,114.0,11.0,35.968264,1015.2
518,2022-06-03,23.6,17.0,30.7,0.0,0.0,149.0,11.4,36.736437,1013.8
519,2022-06-04,25.8,16.9,33.1,0.0,10.0,192.0,13.3,40.38526,1014.0
520,2022-06-05,21.8,17.4,31.3,9.1,10.0,326.0,11.5,36.928481,1015.9
521,2022-06-06,20.7,15.6,25.4,4.1,10.0,2.0,8.3,30.783096,1018.2
522,2022-06-07,20.0,15.1,24.6,0.2,0.0,356.0,10.4,34.816005,1017.4
523,2022-06-08,17.5,14.7,23.5,14.6,0.0,146.0,10.3,34.623961,1013.2
524,2022-06-09,16.8,14.4,21.2,15.7,0.0,335.0,17.2,47.874947,1018.3
525,2022-06-10,18.7,11.0,25.0,0.5,0.0,358.0,14.5,42.689779,1022.2
526,2022-06-11,22.0,13.7,29.1,0.0,0.0,355.0,9.3,32.703529,1021.6


#### Rerun profiling

In [14]:
# Build a new profiling report from the current state of `df` and render it as widgets.
profile = ProfileReport(df, title="Weather data - cleaned & imputed")
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Save cleaned weather dataframe

In [15]:
# Write the cleaned dataframe to disk in both CSV and Excel form, without the index column
cleaned_csv_path = '../data/weather_cleaned.csv'
cleaned_xlsx_path = '../data/weather_cleaned.xlsx'
df.to_csv(cleaned_csv_path, index=False)
df.to_excel(cleaned_xlsx_path, index=False)