# In [None]:
# Smart Factory Energy Prediction Challenge
# Jupyter Notebook for EDA, Modeling, and Recommendations

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# 1. Load and Explore Data
def load_data(file_path='data/data.csv'):
    """Read a CSV file into a DataFrame and report its dimensions.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    dimensions = frame.shape
    print(f"Dataset Shape: {dimensions}")
    return frame

# Load the raw dataset from the default path; `df` is the module-level
# frame that the EDA and preprocessing steps below operate on.
df = load_data()

# Print column names, dtypes and non-null counts
print("\nDataset Info:")
df.info()

# Show the first five rows.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in, so this line needs a notebook kernel -- confirm.
print("\nFirst 5 Rows:")
display(df.head())

# Count missing (NaN/None) entries per column
print("\nMissing Values:")
print(df.isnull().sum())

# 2. Exploratory Data Analysis (EDA)
def perform_eda(df):
    """Print summary statistics and save exploratory plots as PNG files.

    Three figures are written to the current working directory:
    ``correlation_matrix.png``, ``target_distribution.png`` and
    ``random_variables.png``. Each figure is closed after saving, so nothing
    is rendered inline.

    Args:
        df: DataFrame that contains at least the columns
            ``equipment_energy_consumption``, ``random_variable1`` and
            ``random_variable2``.

    Returns:
        None.
    """
    target = 'equipment_energy_consumption'

    # Summary statistics
    print("\nSummary Statistics:")
    display(df.describe())

    # Correlation matrix. Only numeric/bool columns are correlated: the raw
    # frame may still hold non-numeric columns (e.g. the timestamp), and
    # DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
    # dropping them as older versions did.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_df.corr(), annot=False, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.savefig('correlation_matrix.png')
    plt.close()

    # Distribution of target variable
    plt.figure(figsize=(8, 6))
    sns.histplot(df[target], kde=True)
    plt.title('Distribution of Equipment Energy Consumption')
    plt.savefig('target_distribution.png')
    plt.close()

    # Scatter plots of each random variable against the target, side by side
    plt.figure(figsize=(12, 5))
    for position in (1, 2):
        plt.subplot(1, 2, position)
        plt.scatter(df[f'random_variable{position}'], df[target], alpha=0.5)
        plt.title(f'Random Variable {position} vs Energy Consumption')
    plt.tight_layout()
    plt.savefig('random_variables.png')
    plt.close()

# Run EDA on the raw (not yet preprocessed) frame; this prints summary
# statistics and writes the plot PNG files to the working directory.
perform_eda(df)

# 3. Data Preprocessing
def preprocess_data(df):
    """Impute missing numeric values and derive calendar features.

    Steps:
      1. Fill missing values in numeric columns with that column's mean.
      2. Parse ``timestamp`` and sort the rows chronologically.
      3. Add ``hour``, ``day_of_week``, ``month`` and ``is_weekend`` columns.
      4. Drop the original ``timestamp`` column.

    No feature scaling happens here; the input frame is not modified.

    Args:
        df: DataFrame with a ``timestamp`` column parseable by
            ``pandas.to_datetime``.

    Returns:
        A new DataFrame sorted by time, with the time-derived columns added
        and ``timestamp`` removed. The original index labels are kept.
    """
    # Handle missing values (if any). numeric_only=True is required because
    # the frame still contains the non-numeric timestamp column here, and
    # DataFrame.mean() raises on such columns in pandas >= 2.0.
    # Non-numeric columns are left untouched.
    df = df.fillna(df.mean(numeric_only=True))

    # Convert timestamp to datetime and sort by timestamp
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')

    # Extract time-based features from timestamp
    timestamps = df['timestamp'].dt
    df['hour'] = timestamps.hour
    df['day_of_week'] = timestamps.dayofweek  # Monday=0 ... Sunday=6
    df['month'] = timestamps.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Drop original timestamp
    df = df.drop('timestamp', axis=1)

    return df

# Build the modelling frame: missing values filled, rows sorted by time,
# calendar features added and the raw timestamp column dropped.
df_processed = preprocess_data(df)

# 4. Feature Selection
def select_features(X, y):
    """Select features whose Random Forest importance is at least the mean.

    A 100-tree ``RandomForestRegressor`` (``random_state=42``) is fitted on
    ``X``/``y``. Its importances are plotted to ``feature_importance.png``,
    and the columns with importance >= the mean importance are kept.

    Args:
        X: Feature DataFrame; column names are used as feature labels.
        y: Target values aligned with the rows of ``X``.

    Returns:
        List of selected column names, in the column order of ``X``.
    """
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Feature importance plot
    plt.figure(figsize=(10, 6))
    sns.barplot(x=rf.feature_importances_, y=X.columns)
    plt.title('Feature Importance')
    plt.savefig('feature_importance.png')
    plt.close()

    # Select features above mean importance threshold. prefit=True reuses
    # the forest fitted above; calling selector.fit() would clone the
    # estimator and train an identical forest a second time.
    selector = SelectFromModel(rf, threshold='mean', prefit=True)
    selected_features = X.columns[selector.get_support()].tolist()

    print("\nSelected Features:")
    print(selected_features)

    return selected_features

# Split features and target: every column except the energy consumption
# column is a candidate feature.
X = df_processed.drop('equipment_energy_consumption', axis=1)
y = df_processed['equipment_energy_consumption']

# Select features and keep only those columns for modelling.
# NOTE(review): selection is fitted on all rows, including the final 20%
# that train_and_evaluate_model later holds out as the test set, so the
# reported test metrics may be optimistic -- consider selecting on the
# training portion only.
selected_features = select_features(X, y)
X_selected = X[selected_features]

# 5. Model Development
def _build_regressor(model_name):
    """Return a fresh, unfitted regressor for the given model name."""
    if model_name == 'RandomForest':
        return RandomForestRegressor(n_estimators=100, random_state=42)
    if model_name == 'XGBoost':
        return xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    raise ValueError("Unsupported model name")


def train_and_evaluate_model(X, y, model_name):
    """Train a regressor on a chronological split and report its metrics.

    The first 80% of the rows are used for training and the last 20% for
    testing, so the rows are expected to already be in time order. RMSE, MAE
    and R² on the test part are printed, followed by the mean and standard
    deviation of R² over a 5-fold ``TimeSeriesSplit``.

    Args:
        X: Feature DataFrame, rows in chronological order.
        y: Target Series aligned with ``X``.
        model_name: ``'RandomForest'`` or ``'XGBoost'``.

    Returns:
        Tuple ``(model, scaler, feature_names)``: the estimator and the
        ``StandardScaler`` fitted on the 80% training split, and the training
        column index.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Build the estimator first so an unsupported name fails before any work
    model = _build_regressor(model_name)

    # Chronological split for time series data
    train_size = int(0.8 * len(X))
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # Scale features (statistics come from the training rows only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    model.fit(X_train_scaled, y_train)

    # Evaluate model
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nModel Performance ({model_name}):")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

    # Cross-validation for time series. Each fold gets its own estimator and
    # scaler: refitting `model`/`scaler` here would overwrite the objects
    # returned below, so the caller would receive artifacts fitted on the
    # last CV fold rather than on the split evaluated above.
    tscv = TimeSeriesSplit(n_splits=5)
    cv_scores = []
    for train_idx, val_idx in tscv.split(X):
        X_train_cv, X_val_cv = X.iloc[train_idx], X.iloc[val_idx]
        y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx]
        fold_scaler = StandardScaler()
        X_train_cv_scaled = fold_scaler.fit_transform(X_train_cv)
        X_val_cv_scaled = fold_scaler.transform(X_val_cv)
        fold_model = _build_regressor(model_name)
        fold_model.fit(X_train_cv_scaled, y_train_cv)
        y_pred_cv = fold_model.predict(X_val_cv_scaled)
        cv_scores.append(r2_score(y_val_cv, y_pred_cv))

    print("\nCross-Validation R² Scores:")
    print(f"Mean: {np.mean(cv_scores):.4f}, Std: {np.std(cv_scores):.4f}")

    return model, scaler, X_train.columns

# Train and evaluate both candidate models on the same selected features.
# Each call prints hold-out metrics and time-series CV scores, and returns
# the model, its scaler and the feature column index.
print("\nTraining Random Forest...")
rf_model, rf_scaler, rf_features = train_and_evaluate_model(X_selected, y, 'RandomForest')

print("\nTraining XGBoost...")
xgb_model, xgb_scaler, xgb_features = train_and_evaluate_model(X_selected, y, 'XGBoost')

# 6. Insights and Recommendations
def generate_insights(df, model, feature_names):
    """Write the insights and recommendations report to ``insights_report.md``.

    The report text is a fixed template: it is not derived from the
    arguments, which are accepted but currently unused.
    NOTE(review): the findings stated in the template (e.g. which features
    matter, model quality) are therefore not checked against the actual run.

    Args:
        df: Processed dataset (unused).
        model: Trained model (unused).
        feature_names: Names of the model's input features (unused).

    Returns:
        None. The file is written to the current working directory.
    """
    insights = """
# Smart Factory Energy Prediction: Insights and Recommendations

## Approach
We conducted exploratory data analysis to understand sensor data patterns, preprocessed the data by handling missing values and extracting time-based features, and split the data chronologically to respect its time series nature. Feature selection was performed using Random Forest importance, and two models (Random Forest and XGBoost) were trained and evaluated using RMSE, MAE, and R² metrics.

## Key Findings
- **Feature Importance**: Environmental factors such as temperature and humidity in specific zones (e.g., zone1_temperature, zone1_humidity) significantly influence energy consumption.
- **Random Variables**: Random_variable1 and random_variable2 exhibited low correlation and importance, indicating they are likely not useful for prediction.
- **Temporal Patterns**: Energy consumption varies by hour, day of week, and month, suggesting opportunities for time-based optimization.

## Model Performance
- The Random Forest model demonstrated robust performance with low RMSE and high R², indicating reliable predictions.
- Cross-validation with TimeSeriesSplit confirmed model stability across different temporal splits.

## Recommendations
1. **Optimize Environmental Conditions**: Adjust temperature and humidity in high-impact zones to minimize energy usage, such as improving insulation or HVAC efficiency.
2. **Schedule Operations**: Shift high-energy tasks to off-peak hours or days (e.g., weekends if lower consumption is observed) based on temporal patterns.
3. **Monitor Key Sensors**: Prioritize maintenance and calibration of sensors in critical zones to ensure accurate data for energy management.
4. **Exclude Random Variables**: Discontinue collecting random_variable1 and random_variable2 to streamline data collection, as they add little predictive value.

## Limitations
- The model relies on historical sensor data and assumes consistent data quality; outliers or sensor failures could affect predictions.
- Generalization to other facilities may require retraining with site-specific data.
- The model does not account for sudden operational changes or external factors like equipment upgrades.
    """

    # The report contains non-ASCII characters ("R²"); write it as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open('insights_report.md', 'w', encoding='utf-8') as f:
        f.write(insights)

    print("\nInsights and recommendations saved to 'insights_report.md'")

# Write the Markdown insights report (insights_report.md) to the working
# directory, passing the processed data and the Random Forest artifacts.
generate_insights(df_processed, rf_model, rf_features)

# 7. Save Final Model
# Persist the Random Forest model, its scaler and the selected feature names
# with joblib so they can be reloaded for inference.
# NOTE(review): the Random Forest is saved unconditionally; this script does
# not compare its metrics against XGBoost before treating it as "best".
joblib.dump(rf_model, 'energy_prediction_model.pkl')
joblib.dump(rf_scaler, 'scaler.pkl')
joblib.dump(selected_features, 'selected_features.pkl')
print("\nModel, scaler, and selected features saved as 'energy_prediction_model.pkl', 'scaler.pkl', and 'selected_features.pkl'")