# Check for zero and missing values, drop mostly-zero columns, and impute the rest

In [1]:
# Colab-only step: `google.colab.files` is available only inside a Google
# Colab runtime, so this cell will fail with ImportError elsewhere.
from google.colab import files
# Prompts for a file upload; the recorded output shows the CSV being saved as
# "model_features.csv" in the working directory. The return value is kept in
# `uploaded` but is not used by the cells visible here.
uploaded = files.upload()

Saving model_features.csv to model_features.csv


In [2]:
import pandas as pd
# Read the uploaded feature table into `df`.
# NOTE(review): the following cell re-imports pandas and re-reads the same
# file, so this cell only serves as a quick "does the file load" check.
df = pd.read_csv("model_features.csv")
print("Data loaded successfully!")

Data loaded successfully!


In [3]:
import pandas as pd
import numpy as np

# ---------------- CONFIG ----------------
# Everything a reader might want to tune lives here instead of being buried
# in the body of the cell.
FEATURES_PATH = "model_features.csv"
OUTPUT_PATH = "model_features_v1_final.csv"
TARGET_COL = "aqi"
# Columns whose share of exact zeros exceeds this percentage are dropped.
ZERO_DROP_THRESHOLD_PCT = 80
# Columns in which 0 is a legitimate value and must NOT be treated as missing.
# Empty by default, which reproduces the historical behaviour (every numeric
# column has its zeros replaced).
# NOTE(review): zeros are often valid in time/flag/difference style features
# (the columns dropped in the recorded run are aqi differences/changes); after
# replacement + forward fill such zeros are silently overwritten by a
# neighbouring row's value. List those columns here once confirmed.
ZERO_IS_VALID_COLS = []

# ---------------- LOAD FEATURES ----------------
# Re-reading the file keeps this cell idempotent: it can be re-run without
# depending on (or compounding) earlier mutations of `df`.
df = pd.read_csv(FEATURES_PATH)
print(f"‚úÖ Loaded feature dataset: {df.shape}")

# ---------------- ZERO VALUE ANALYSIS ----------------
exclude_cols = ['datetime']
# Despite the historical name this may include non-numeric columns; only
# 'datetime' is excluded from the zero scan.
analysis_cols = [c for c in df.columns if c not in exclude_cols]

n_rows = len(df)
# Correlation is only defined when the target exists and is numeric. Checking
# this explicitly replaces a bare `except:` that hid every error (including a
# missing target column and KeyboardInterrupt).
has_numeric_target = (
    TARGET_COL in df.columns
    and pd.api.types.is_numeric_dtype(df[TARGET_COL])
)

results = []
for col in analysis_cols:
    zero_count = (df[col] == 0).sum()
    # Guard the empty-frame case so the percentage is well defined.
    zero_percent = (zero_count / n_rows) * 100 if n_rows else 0.0
    if has_numeric_target and pd.api.types.is_numeric_dtype(df[col]):
        corr_with_aqi = df[col].corr(df[TARGET_COL])
    else:
        corr_with_aqi = np.nan
    results.append({
        'column': col,
        'zero_count': zero_count,
        'zero_percent': round(zero_percent, 2),
        'corr_with_aqi': round(corr_with_aqi, 3)
    })

# Explicit columns keep sort_values from raising KeyError when `results` is
# empty (a frame with no columns to analyse).
zero_analysis = pd.DataFrame(
    results,
    columns=['column', 'zero_count', 'zero_percent', 'corr_with_aqi'],
).sort_values(by='zero_percent', ascending=False)

# ---------------- REMOVE HIGH ZERO FEATURES ----------------
is_mostly_zero = zero_analysis['zero_percent'] > ZERO_DROP_THRESHOLD_PCT
# Never drop the target itself, even if it is mostly zeros.
cols_to_drop = [
    c for c in zero_analysis.loc[is_mostly_zero, 'column'].tolist()
    if c != TARGET_COL
]
# Reassign instead of inplace=True so the operation is explicit.
df = df.drop(columns=cols_to_drop, errors='ignore')
print(f"üßπ Dropped {len(cols_to_drop)} mostly-zero columns: {cols_to_drop}")

# Replace zeros with NaN for imputation
numeric_cols = df.select_dtypes(include=[np.number]).columns
zero_as_missing_cols = [c for c in numeric_cols if c not in ZERO_IS_VALID_COLS]
if zero_as_missing_cols:
    df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

# ---------------- FILL MISSING VALUES ----------------
# Recompute month_sin (just in case)
# The zero replacement above also touches this derived column, so it is
# rebuilt from 'month' before filling.
if "month" in df.columns:
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)

# Forward + backward fill
# NOTE(review): this relies on row order being meaningful (presumably
# chronological - confirm), and the backward fill copies values from later
# rows into earlier ones, which can leak future information into features.
df = df.ffill().bfill()

print("üîç Remaining missing values:", df.isna().sum().sum())

# ---------------- SAVE CLEAN DATA ----------------
df.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Final cleaned dataset saved ({df.shape[0]} rows, {df.shape[1]} cols)")


‚úÖ Loaded feature dataset: (6577, 158)
üßπ Dropped 3 mostly-zero columns: ['aqi_pct_change', 'aqi_diff_1', 'aqi_roll_std_3']
üîç Remaining missing values: 0
‚úÖ Final cleaned dataset saved (6577 rows, 155 cols)
