In [1]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Make the parent directory importable so that the project's `src` package
# resolves when the notebook is run from its own folder. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate
# entries to sys.path.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)
from src.preprocessing import load_data, clean_column_names, parse_dates

In [3]:
# Read the raw site-1 training CSV and pass it through the project's
# preprocessing helpers in order: load -> clean column names -> parse dates.
# NOTE(review): the helpers live in src/preprocessing; presumably parse_dates
# is what produces the `timestamp` column used below -- confirm there.
df = parse_dates(
    clean_column_names(
        load_data('../data/raw/site_1_train_data.csv')
    )
)
df

Unnamed: 0,O3_forecast,NO2_forecast,T_forecast,q_forecast,u_forecast,v_forecast,w_forecast,NO2_satellite,HCHO_satellite,ratio_satellite,O3_target,NO2_target,timestamp
15985,0.12,59.19,21.78,20.14,1.56,-0.07,0.80,,,,5.23,19.40,2019-07-14 00:00:00
15986,0.41,62.09,23.02,20.23,1.39,-0.09,0.79,,,,7.50,17.10,2019-07-14 01:00:00
15987,0.69,64.99,24.61,20.32,1.23,-0.12,0.77,,,,8.93,15.43,2019-07-14 02:00:00
15988,0.98,67.89,24.00,20.40,1.06,-0.15,0.75,,,,11.03,12.62,2019-07-14 03:00:00
15989,1.88,71.59,23.56,21.00,0.94,0.42,0.86,,,,15.57,14.95,2019-07-14 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15284,58.71,19.79,26.66,10.16,3.95,0.93,-0.29,,,,40.20,64.50,2024-06-28 20:00:00
15285,59.50,19.57,27.69,10.62,3.85,1.04,-0.38,,,,116.60,66.00,2024-06-28 21:00:00
15286,60.04,19.92,26.20,10.50,3.78,0.92,-0.24,,,,90.23,66.17,2024-06-28 22:00:00
15287,60.58,20.27,24.00,10.37,3.72,0.81,-0.10,,,,116.50,62.23,2024-06-28 23:00:00


In [4]:
# Quick overview of the frame: covered time span, row count and column count.
# Note that the column count includes the `timestamp` column itself.
timestamps = df['timestamp']
n_rows, n_cols = df.shape
print(f"Time Range: {timestamps.min()} to {timestamps.max()}")
print(f"Total Rows: {n_rows}")
print(f"Variables: {n_cols}")

Time Range: 2019-07-14 00:00:00 to 2024-06-30 00:00:00
Total Rows: 25081
Variables: 13


In [5]:
# Per-column missing-value report: absolute count and share of all rows (%).
missing = df.isna().sum()
missing_pct = (missing / len(df)) * 100
missing_table = pd.DataFrame(
    {
        'Missing Count': missing,
        'Missing %': missing_pct,
    }
)
# Show only the columns that actually contain missing values.
has_missing = missing_table['Missing Count'] > 0
missing_table.loc[has_missing]

Unnamed: 0,Missing Count,Missing %
NO2_satellite,24254,96.702683
HCHO_satellite,24180,96.407639
ratio_satellite,24287,96.834257


In [6]:
# Index the frame by timestamp. The guard keeps the cell idempotent: once
# `timestamp` has been moved into the index it is no longer a column, and an
# unconditional set_index would raise KeyError on a second run.
# Note: from here on `timestamp` is the index of `df`, not a column.
if 'timestamp' in df.columns:
    df = df.set_index('timestamp')

# diff() compares each timestamp with the row before it, so the values must be
# in chronological order for the deltas to be real gaps. Sorting here makes
# the check independent of the row order of `df` (and is a no-op if the data
# is already sorted); `df` itself is left in its existing order.
hourly_diff = df.index.to_series().sort_values().diff()

# Any step larger than one hour is reported as a gap; print the most frequent
# gap lengths.
gaps = hourly_diff[hourly_diff > pd.Timedelta(hours=1)]
print("Time Gaps Found:\n", gaps.value_counts().head())

Time Gaps Found:
 timestamp
1 days 01:00:00    214
2 days 01:00:00     98
3 days 01:00:00     43
4 days 01:00:00     20
5 days 01:00:00      5
Name: count, dtype: int64


In [7]:
# Descriptive statistics for the two target columns and the forecast
# columns listed below, one variable per row.
numeric_cols = [
    'O3_target',
    'NO2_target',
    'T_forecast',
    'q_forecast',
    'u_forecast',
    'v_forecast',
    'w_forecast',
]
df[numeric_cols].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
O3_target,25081.0,26.685846,31.532385,0.2,5.33,14.18,34.95,199.35
NO2_target,25081.0,35.846456,28.289381,0.38,16.47,27.73,47.25,288.57
T_forecast,25081.0,23.90396,8.21207,2.85,17.61,25.6,29.88,44.85
q_forecast,25081.0,12.175782,5.455621,2.71,7.6,10.15,17.51,24.09
u_forecast,25081.0,0.377179,2.11899,-8.52,-1.17,0.67,1.89,8.17
v_forecast,25081.0,-0.484007,1.307774,-6.69,-1.46,-0.61,0.48,5.26
w_forecast,25081.0,-0.054889,1.464262,-9.16,-1.09,-0.08,1.06,5.65
