# FEATURES CREATING

# 0)

In [57]:
import pandas as pd
import numpy as np


In [58]:
df=pd.read_csv('data/data3_encoded.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date                    2922 non-null   object 
 1   preciptype              2922 non-null   int64  
 2   uvindex                 2922 non-null   int64  
 3   icon_clear-day          2922 non-null   int64  
 4   icon_partly-cloudy-day  2922 non-null   int64  
 5   icon_rain               2922 non-null   int64  
 6   tempmax                 2922 non-null   float64
 7   tempmin                 2922 non-null   float64
 8   temp                    2922 non-null   float64
 9   dew                     2922 non-null   float64
 10  humidity                2922 non-null   float64
 11  precip                  2922 non-null   float64
 12  precipcover             2922 non-null   float64
 13  windgust                2922 non-null   float64
 14  windspeed               2922 non-null   

In [59]:
colnames=df.columns
colnames

Index(['date', 'preciptype', 'uvindex', 'icon_clear-day',
       'icon_partly-cloudy-day', 'icon_rain', 'tempmax', 'tempmin', 'temp',
       'dew', 'humidity', 'precip', 'precipcover', 'windgust', 'windspeed',
       'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
       'solarradiation', 'solarenergy', 'moonphase', 'daylight_minutes',
       'aqipm25'],
      dtype='object')

In [60]:
for col in colnames:
    print(df[col].value_counts())
    print("-"*70)

date
2025-12-31    1
2016-01-01    1
2016-01-02    1
2016-01-03    1
2016-01-04    1
             ..
2016-01-15    1
2016-01-14    1
2016-01-13    1
2016-01-12    1
2016-01-11    1
Name: count, Length: 2922, dtype: int64
----------------------------------------------------------------------
preciptype
1    1740
0    1182
Name: count, dtype: int64
----------------------------------------------------------------------
uvindex
9     779
8     559
10    530
7     432
6     335
5     164
4      66
3      33
2      18
1       4
0       2
Name: count, dtype: int64
----------------------------------------------------------------------
icon_clear-day
0    2544
1     378
Name: count, dtype: int64
----------------------------------------------------------------------
icon_partly-cloudy-day
0    2051
1     871
Name: count, dtype: int64
----------------------------------------------------------------------
icon_rain
1    1662
0    1260
Name: count, dtype: int64
-------------------------------------

In [61]:
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)


In [62]:
df["date_diff"] = df["date"].diff().dt.days
df["segment_id"] = (df["date_diff"] > 1).cumsum()


In [63]:
df['segment_id'].value_counts()
    

segment_id
0    1461
1    1461
Name: count, dtype: int64

In [64]:
df['date_diff'].value_counts()

date_diff
1.0      2920
732.0       1
Name: count, dtype: int64

In [65]:
length=len(df)
length

2922

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date                    2922 non-null   datetime64[ns]
 1   preciptype              2922 non-null   int64         
 2   uvindex                 2922 non-null   int64         
 3   icon_clear-day          2922 non-null   int64         
 4   icon_partly-cloudy-day  2922 non-null   int64         
 5   icon_rain               2922 non-null   int64         
 6   tempmax                 2922 non-null   float64       
 7   tempmin                 2922 non-null   float64       
 8   temp                    2922 non-null   float64       
 9   dew                     2922 non-null   float64       
 10  humidity                2922 non-null   float64       
 11  precip                  2922 non-null   float64       
 12  precipcover             2922 non-null   float64 

# 1)Pure Time Features (Calendar / Cyclical)

In [67]:
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["dayofweek"] = df["date"].dt.dayofweek
df["dayofyear"] = df["date"].dt.dayofyear
df["weekofyear"] = df["date"].dt.isocalendar().week.astype(int)
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

# Cyclical encoding
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

df["doy_sin"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
df["doy_cos"] = np.cos(2 * np.pi * df["dayofyear"] / 365)


# 2)AQI Lag Features

In [68]:
aqi_lags = [1, 2, 3, 5, 7, 14, 21, 30]

for lag in aqi_lags:
    df[f"aqi_lag_{lag}"] = (
        df.groupby("segment_id")["aqipm25"]
        .shift(lag)
    )


# 3)AQI Rolling Statistics

In [69]:
for w in [3, 7, 14, 30]:
    df[f"aqi_roll_mean_{w}"] = (
        df.groupby("segment_id")["aqipm25"]
        .shift(1)
        .rolling(w)
        .mean()
    )
    
    df[f"aqi_roll_std_{w}"] = (
        df.groupby("segment_id")["aqipm25"]
        .shift(1)
        .rolling(w)
        .std()
    )


In [70]:
df["aqi_roll_min_7"] = (
    df.groupby("segment_id")["aqipm25"]
    .shift(1).rolling(7).min()
)

df["aqi_roll_max_7"] = (
    df.groupby("segment_id")["aqipm25"]
    .shift(1).rolling(7).max()
)


# 4) Weather Lag Features

In [71]:
weather_cols = [
    "temp", "humidity", "dew",
    "windspeed", "winddir", "windgust",
    "precip", "precipcover",
    "cloudcover", "visibility",
    "sealevelpressure",
    "solarradiation", "solarenergy",
    "uvindex", "daylight_minutes",
    "preciptype",
    "icon_clear-day","icon_partly-cloudy-day",
    "icon_rain","tempmax","tempmin",
    "moonphase"
]

In [72]:
[i for i in df.columns if i not in weather_cols]


['date',
 'aqipm25',
 'date_diff',
 'segment_id',
 'year',
 'month',
 'dayofweek',
 'dayofyear',
 'weekofyear',
 'is_weekend',
 'month_sin',
 'month_cos',
 'doy_sin',
 'doy_cos',
 'aqi_lag_1',
 'aqi_lag_2',
 'aqi_lag_3',
 'aqi_lag_5',
 'aqi_lag_7',
 'aqi_lag_14',
 'aqi_lag_21',
 'aqi_lag_30',
 'aqi_roll_mean_3',
 'aqi_roll_std_3',
 'aqi_roll_mean_7',
 'aqi_roll_std_7',
 'aqi_roll_mean_14',
 'aqi_roll_std_14',
 'aqi_roll_mean_30',
 'aqi_roll_std_30',
 'aqi_roll_min_7',
 'aqi_roll_max_7']

In [73]:
for col in weather_cols:
    for lag in [1, 3, 7]:
        df[f"{col}_lag_{lag}"] = (
            df.groupby("segment_id")[col]
            .shift(lag)
        )


# 5)Rolling Weather Trends

In [74]:
df[weather_cols].dtypes

temp                      float64
humidity                  float64
dew                       float64
windspeed                 float64
winddir                   float64
windgust                  float64
precip                    float64
precipcover               float64
cloudcover                float64
visibility                float64
sealevelpressure          float64
solarradiation            float64
solarenergy               float64
uvindex                     int64
daylight_minutes          float64
preciptype                  int64
icon_clear-day              int64
icon_partly-cloudy-day      int64
icon_rain                   int64
tempmax                   float64
tempmin                   float64
moonphase                 float64
dtype: object

In [None]:
for col in weather_cols:

    df[f"{col}_roll_mean_3"] = (
        df.groupby("segment_id")[col]
        .shift(1).rolling(3).mean()
    )


    df[f"{col}_roll_mean_7"] = (
        df.groupby("segment_id")[col]
        .shift(1).rolling(7).mean()
    )
    
    df[f"{col}_roll_mean_14"] = (
        df.groupby("segment_id")[col]
        .shift(1).rolling(14).mean()
    )


  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] = (
  df[f"{col}_roll_mean_3"] = (
  df[f"{col}_roll_mean_7"] = (
  df[f"{col}_roll_mean_14"] =

In [76]:
df["rain_days_last_3"] = (
    df.groupby("segment_id")["preciptype"]
    .shift(1).rolling(3).sum()
)

df["rain_days_last_7"] = (
    df.groupby("segment_id")["preciptype"]
    .shift(1).rolling(7).sum()
)


  df["rain_days_last_3"] = (
  df["rain_days_last_7"] = (


# 6) Physics-Aware Interaction Features

In [77]:
df["temp_humidity_interaction"] = (
    df["temp_lag_1"] * df["humidity_lag_1"]
)

df["wind_dispersion_index"] = (
    df["windspeed_lag_1"] * df["visibility_lag_1"]
)

df["stagnation_index"] = (
    df["sealevelpressure_lag_1"] /
    (df["windspeed_lag_1"] + 1)
)


  df["temp_humidity_interaction"] = (
  df["wind_dispersion_index"] = (
  df["stagnation_index"] = (


# dropiing day d's columns

In [78]:
# df.to_csv('data/data4_featured.csv',index=False)

In [79]:
df.info(max_cols=200)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 191 columns):
 #    Column                               Non-Null Count  Dtype         
---   ------                               --------------  -----         
 0    date                                 2922 non-null   datetime64[ns]
 1    preciptype                           2922 non-null   int64         
 2    uvindex                              2922 non-null   int64         
 3    icon_clear-day                       2922 non-null   int64         
 4    icon_partly-cloudy-day               2922 non-null   int64         
 5    icon_rain                            2922 non-null   int64         
 6    tempmax                              2922 non-null   float64       
 7    tempmin                              2922 non-null   float64       
 8    temp                                 2922 non-null   float64       
 9    dew                                  2922 non-null   float64       
 10 

In [80]:
len(weather_cols)

22

In [30]:
helper_cols=['date_diff','segment_id']

In [32]:
df = df.drop(columns=weather_cols)

In [33]:
df = df.drop(columns=helper_cols)

In [38]:
df.info(max_cols=200)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 167 columns):
 #    Column                               Non-Null Count  Dtype         
---   ------                               --------------  -----         
 0    date                                 2922 non-null   datetime64[ns]
 1    aqipm25                              2922 non-null   float64       
 2    year                                 2922 non-null   int32         
 3    month                                2922 non-null   int32         
 4    dayofweek                            2922 non-null   int32         
 5    dayofyear                            2922 non-null   int32         
 6    weekofyear                           2922 non-null   int64         
 7    is_weekend                           2922 non-null   int64         
 8    month_sin                            2922 non-null   float64       
 9    month_cos                            2922 non-null   float64       
 10 

In [37]:
191-24

167

In [None]:
# df.to_csv('data/data5_only_lag.csv',index=False)

In [49]:
for i,j in dict(df.isna().sum()).items():
    print(f"{i}-> {j}")

date-> 0
aqipm25-> 0
year-> 0
month-> 0
dayofweek-> 0
dayofyear-> 0
weekofyear-> 0
is_weekend-> 0
month_sin-> 0
month_cos-> 0
doy_sin-> 0
doy_cos-> 0
aqi_lag_1-> 2
aqi_lag_2-> 4
aqi_lag_3-> 6
aqi_lag_5-> 10
aqi_lag_7-> 14
aqi_lag_14-> 28
aqi_lag_21-> 42
aqi_lag_30-> 60
aqi_roll_mean_3-> 6
aqi_roll_std_3-> 6
aqi_roll_mean_7-> 14
aqi_roll_std_7-> 14
aqi_roll_mean_14-> 28
aqi_roll_std_14-> 28
aqi_roll_mean_30-> 60
aqi_roll_std_30-> 60
aqi_roll_min_7-> 14
aqi_roll_max_7-> 14
temp_lag_1-> 2
temp_lag_3-> 6
temp_lag_7-> 14
humidity_lag_1-> 2
humidity_lag_3-> 6
humidity_lag_7-> 14
dew_lag_1-> 2
dew_lag_3-> 6
dew_lag_7-> 14
windspeed_lag_1-> 2
windspeed_lag_3-> 6
windspeed_lag_7-> 14
winddir_lag_1-> 2
winddir_lag_3-> 6
winddir_lag_7-> 14
windgust_lag_1-> 2
windgust_lag_3-> 6
windgust_lag_7-> 14
precip_lag_1-> 2
precip_lag_3-> 6
precip_lag_7-> 14
precipcover_lag_1-> 2
precipcover_lag_3-> 6
precipcover_lag_7-> 14
cloudcover_lag_1-> 2
cloudcover_lag_3-> 6
cloudcover_lag_7-> 14
visibility_lag_1-> 2

In [51]:
print("max na column:  ",max(dict(df.isna().sum()).values()))

max na column:   60


In [55]:
length-60

2862

In [52]:
df = df.dropna().reset_index(drop=True)


In [53]:
len(df)

2862

In [None]:
# df.to_csv('data/data6_only_lag_nona.csv',index=False)