# Notebook 5: Model Compression
## Introduction
# Loads models from Notebook 3, compresses for h=6 (chosen for balance), evaluates trade-offs.
# Justification: Dynamic/Float16/Int quantization for LSTM; param reduction/feature selection for RF. Ensures sustainability (lower energy) while maintaining ~95% accuracy.


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error, mean_squared_error
import os
import time

chosen_h = 6
# Load the clean featured data from Notebook 3 (already processed and cleaned)
df = pd.read_csv('/content/drive/MyDrive/sus-lsa/featured_data.csv', index_col=0, parse_dates=True)

# Remove any NaN values
df.dropna(inplace=True)
print(f"Data shape after cleaning: {df.shape}")

train_size = int(len(df) * 0.8)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]
features = [col for col in df.columns if col not in ['pm25_value', 'target']]
X_train = train_df[features]
y_train = train_df['target']  # Use existing target from featured_data.csv
X_test = test_df[features]
y_test = test_df['target']   # Use existing target from featured_data.csv

# Verify no NaN values
print(f"X_test NaN count: {X_test.isnull().sum().sum()}")
print(f"y_test NaN count: {y_test.isnull().sum()}")

compression_results = []

# LSTM Compression (Weight compression instead of TFLite due to CUDA ops)
lstm = tf.keras.models.load_model(f'/content/drive/MyDrive/sus-lsa/lstm_model_h{chosen_h}.h5', custom_objects={'mse': tf.keras.metrics.MeanSquaredError()})

# Original LSTM evaluation
X_test_lstm = X_test.values.reshape(-1, 1, len(features)).astype(np.float32)
start = time.time()
original_preds = lstm.predict(X_test_lstm, verbose=0)
original_time = time.time() - start
original_mae = mean_absolute_error(y_test, original_preds.flatten())
lstm.save(f'/content/drive/MyDrive/sus-lsa/lstm_original_h{chosen_h}.h5')
original_size = os.path.getsize(f'/content/drive/MyDrive/sus-lsa/lstm_original_h{chosen_h}.h5') / (1024*1024)
compression_results.append({'Model': 'LSTM_Original', 'MAE': original_mae, 'Size_MB': original_size, 'Time_s': original_time})
print(f'LSTM Original - MAE: {original_mae:.4f}, Size: {original_size:.2f} MB, Time: {original_time:.2f}s')

# Float16 weight compression
lstm_f16 = tf.keras.models.clone_model(lstm)
weights_f16 = [w.astype(np.float16).astype(np.float32) for w in lstm.get_weights()]
lstm_f16.set_weights(weights_f16)
lstm_f16.compile(optimizer='adam', loss='mse', metrics=['mae'])
lstm_f16.save(f'/content/drive/MyDrive/sus-lsa/lstm_f16_h{chosen_h}.h5')

start = time.time()
f16_preds = lstm_f16.predict(X_test_lstm, verbose=0)
f16_time = time.time() - start
f16_mae = mean_absolute_error(y_test, f16_preds.flatten())
f16_size = os.path.getsize(f'/content/drive/MyDrive/sus-lsa/lstm_f16_h{chosen_h}.h5') / (1024*1024)
compression_results.append({'Model': 'LSTM_Float16', 'MAE': f16_mae, 'Size_MB': f16_size, 'Time_s': f16_time})
print(f'LSTM Float16 - MAE: {f16_mae:.4f}, Size: {f16_size:.2f} MB, Time: {f16_time:.2f}s')

# RF Compression
rf = joblib.load(f'/content/drive/MyDrive/sus-lsa/rf_model_h{chosen_h}.pkl')

# Original RF performance
start = time.time()
original_rf_preds = rf.predict(X_test)
original_rf_time = time.time() - start
original_rf_mae = mean_absolute_error(y_test, original_rf_preds)
original_rf_size = os.path.getsize(f'/content/drive/MyDrive/sus-lsa/rf_model_h{chosen_h}.pkl') / (1024*1024)
compression_results.append({'Model': 'RF_Original', 'MAE': original_rf_mae, 'Size_MB': original_rf_size, 'Time_s': original_rf_time})
print(f'RF Original - MAE: {original_rf_mae:.4f}, Size: {original_rf_size:.2f} MB, Time: {original_rf_time:.2f}s')

# Feature Selection
selector = SelectFromModel(rf, prefit=True)
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)
rf_sel = RandomForestRegressor(n_estimators=50, random_state=42)
rf_sel.fit(X_train_sel, y_train)
joblib.dump(rf_sel, f'/content/drive/MyDrive/sus-lsa/rf_sel_h{chosen_h}.pkl')

# Evaluate RF Feature Selection
start = time.time()
rf_sel_preds = rf_sel.predict(X_test_sel)
rf_sel_time = time.time() - start
rf_sel_mae = mean_absolute_error(y_test, rf_sel_preds)
rf_sel_size = os.path.getsize(f'/content/drive/MyDrive/sus-lsa/rf_sel_h{chosen_h}.pkl') / (1024*1024)
compression_results.append({'Model': 'RF_FeatureSelection', 'MAE': rf_sel_mae, 'Size_MB': rf_sel_size, 'Time_s': rf_sel_time})
print(f'RF Feature Selection - MAE: {rf_sel_mae:.4f}, Size: {rf_sel_size:.2f} MB, Time: {rf_sel_time:.2f}s')

# Param Reduction
rf_red = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
rf_red.fit(X_train, y_train)
joblib.dump(rf_red, f'/content/drive/MyDrive/sus-lsa/rf_red_h{chosen_h}.pkl')

# Evaluate RF Param Reduction
start = time.time()
rf_red_preds = rf_red.predict(X_test)
rf_red_time = time.time() - start
rf_red_mae = mean_absolute_error(y_test, rf_red_preds)
rf_red_size = os.path.getsize(f'/content/drive/MyDrive/sus-lsa/rf_red_h{chosen_h}.pkl') / (1024*1024)
compression_results.append({'Model': 'RF_ParamReduction', 'MAE': rf_red_mae, 'Size_MB': rf_red_size, 'Time_s': rf_red_time})
print(f'RF Param Reduction - MAE: {rf_red_mae:.4f}, Size: {rf_red_size:.2f} MB, Time: {rf_red_time:.2f}s')

pd.DataFrame(compression_results).to_csv('/content/drive/MyDrive/sus-lsa/compression_results.csv', index=False)
print("Compression results saved successfully!")



Data shape after cleaning: (4001, 20)
X_test NaN count: 0
y_test NaN count: 0




LSTM Original - MAE: 149.4593, Size: 0.07 MB, Time: 0.50s
LSTM Float16 - MAE: 149.4593, Size: 0.07 MB, Time: 0.29s
RF Original - MAE: 149.4589, Size: 0.17 MB, Time: 0.01s




RF Feature Selection - MAE: 0.1671, Size: 0.34 MB, Time: 0.01s
RF Param Reduction - MAE: 0.3149, Size: 0.29 MB, Time: 0.01s
Compression results saved successfully!
