In [7]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
import warnings
warnings.filterwarnings("ignore")


In [9]:
# Simulate data (from Modelv5.ipynb)
def simulate_data_with_interactions(n_samples=10000, noise_level=0.1, balance_factor=-1.735):
    np.random.seed(42)
    voltage = np.random.normal(230, 10, n_samples)
    current = np.random.normal(50, 5, n_samples)
    temperature = np.random.normal(40, 5, n_samples)
    load = np.random.normal(70, 10, n_samples)
    time_since_maintenance = np.random.exponential(100, n_samples)
    moisture_level = np.random.normal(30, 5, n_samples)
    lightning_surge = np.random.binomial(1, 0.1, n_samples)
    voltage += np.random.normal(0, noise_level * 10, n_samples)
    current += np.random.normal(0, noise_level * 5, n_samples)
    temperature += np.random.normal(0, noise_level * 5, n_samples)
    load += np.random.normal(0, noise_level * 10, n_samples)
    time_since_maintenance += np.random.normal(0, noise_level * 10, n_samples)
    moisture_level += np.random.normal(0, noise_level * 5, n_samples)
    coef_voltage = 0.05
    coef_current = 0.1
    coef_temperature = 0.4
    coef_load = 0.05
    coef_time = 0.02
    coef_moisture = 0.3
    coef_surge = 0.5
    linear_comb = (
        coef_voltage * (voltage - 230) +
        coef_current * (current - 50) +
        coef_temperature * (temperature - 40) +
        coef_load * (load - 70) +
        coef_time * (time_since_maintenance - 100) +
        coef_moisture * (moisture_level - 30) +
        coef_surge * lightning_surge + balance_factor
    )
    prob_failure = 1 / (1 + np.exp(-linear_comb))
    failure = np.random.binomial(1, prob_failure)
    df = pd.DataFrame({
        'voltage': voltage,
        'current': current,
        'temperature': temperature,
        'load': load,
        'time_since_maintenance': time_since_maintenance,
        'moisture_level': moisture_level,
        'lightning_surge': lightning_surge,
        'failure': failure
    })
    return df


In [11]:
def main():
    # Paths
    MODEL_PATH = 'grid_maintenance_ensemble_model.pkl'
    POLY_PATH = 'poly_transformer.pkl'
    SCALER_PATH = 'scaler.pkl'
    FEATURES_PATH = 'important_features.txt'

    # Generate data
    df = simulate_data_with_interactions()
    print(f"Simulated failure rate: {df['failure'].mean():.2%}")

    # Prepare features and target
    X = df.drop('failure', axis=1)
    y = df['failure']

    # Feature engineering
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_poly = poly.fit_transform(X)
    poly_feature_names = poly.get_feature_names_out(X.columns)
    X = pd.DataFrame(X_poly, columns=poly_feature_names)

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X = pd.DataFrame(X_scaled, columns=poly_feature_names)

    # Save transformers
    joblib.dump(poly, POLY_PATH)
    joblib.dump(scaler, SCALER_PATH)

    # Feature selection
    rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_temp.fit(X, y)
    important_features = pd.DataFrame({
        'Feature': poly_feature_names,
        'Importance': rf_temp.feature_importances_
    }).sort_values(by='Importance', ascending=False).head(10)['Feature'].values
    X = X[important_features]

    # Save important features
    with open(FEATURES_PATH, 'w') as f:
        f.write('\n'.join(important_features))
    print(f"Important features saved to {FEATURES_PATH}: {important_features}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

    # Apply SMOTETomek
    smotetomek = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smotetomek.fit_resample(X_train, y_train)
    print("\nClass distribution after SMOTETomek:")
    print(pd.Series(y_train_res).value_counts(normalize=True))

    # Hyperparameter tuning
    # Random Forest
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [2, 5],
        'class_weight': ['balanced', {0: 1, 1: 5}]
    }
    rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, n_iter=10, cv=5, scoring='f1', random_state=42)
    rf_search.fit(X_train_res, y_train_res)
    rf_model = rf_search.best_estimator_
    print(f"Random Forest best params: {rf_search.best_params_}")

    # XGBoost
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [5, 7, 10],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8],
        'scale_pos_weight': [1, 5]
    }
    xgb_search = RandomizedSearchCV(XGBClassifier(random_state=42), xgb_param_grid, n_iter=10, cv=5, scoring='f1', random_state=42)
    xgb_search.fit(X_train_res, y_train_res)
    xgb_model = xgb_search.best_estimator_
    print(f"XGBoost best params: {xgb_search.best_params_}")

    # SVM (optimized for faster tuning)
    svm_param_grid = {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.1],
        'class_weight': ['balanced', {0: 1, 1: 5}]
    }
    svm_search = RandomizedSearchCV(SVC(probability=True, random_state=42), svm_param_grid, n_iter=5, cv=3, scoring='f1', random_state=42)
    svm_search.fit(X_train_res, y_train_res)
    svm_model = svm_search.best_estimator_
    print(f"SVM best params: {svm_search.best_params_}")

    # Ensemble
    ensemble_model = VotingClassifier(
        estimators=[
            ('rf', rf_model),
            ('xgb', xgb_model),
            ('svm', svm_model)
        ],
        voting='soft',
        weights=[1, 1, 1.2]
    )
    ensemble_model.fit(X_train_res, y_train_res)

    # Save model
    joblib.dump(ensemble_model, MODEL_PATH)
    print(f"Ensemble model saved to {MODEL_PATH}")

    # Test inference
    new_data = pd.DataFrame({
        'voltage': [230, 235, 228],
        'current': [50, 52, 49],
        'temperature': [40, 45, 42],
        'load': [70, 75, 68],
        'time_since_maintenance': [100, 150, 120],
        'moisture_level': [30, 35, 32],
        'lightning_surge': [0, 1, 0]
    })
    try:
        # Apply polynomial features
        X_poly = poly.transform(new_data)
        poly_feature_names = poly.get_feature_names_out(new_data.columns)
        X = pd.DataFrame(X_poly, columns=poly_feature_names)
        # Scale all polynomial features
        X_scaled = scaler.transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=poly_feature_names)
        # Select important features
        X = X_scaled[important_features]
        # Predict
        probabilities = ensemble_model.predict_proba(X)[:, 1]
        status = ["Failure Present" if prob >= 0.2 else "Failure Not Present" for prob in probabilities]
        print("\nTransformer Failure Predictions:")
        for i, stat in enumerate(status):
            print(f"Transformer {i+1} - Status: {stat}")
    except Exception as e:
        print(f"Error during prediction: {e}")

if __name__ == "__main__":
    main()

Simulated failure rate: 31.07%
Important features saved to important_features.txt: ['temperature moisture_level' 'time_since_maintenance moisture_level'
 'voltage temperature' 'temperature' 'temperature time_since_maintenance'
 'current time_since_maintenance' 'current temperature'
 'voltage time_since_maintenance' 'load time_since_maintenance'
 'voltage moisture_level']

Class distribution after SMOTETomek:
failure
0    0.5
1    0.5
Name: proportion, dtype: float64
Random Forest best params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 15, 'class_weight': 'balanced'}
XGBoost best params: {'subsample': 0.6, 'scale_pos_weight': 5, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
SVM best params: {'gamma': 0.1, 'class_weight': 'balanced', 'C': 1}
Ensemble model saved to grid_maintenance_ensemble_model.pkl

Transformer Failure Predictions:
Transformer 1 - Status: Failure Not Present
Transformer 2 - Status: Failure Pr

In [13]:
!pip show joblib

Name: joblib
Version: 1.4.2
Summary: Lightweight pipelining with Python functions
Home-page: 
Author: 
Author-email: Gael Varoquaux <gael.varoquaux@normalesup.org>
License: BSD 3-Clause
Location: C:\Users\AK\AppData\Roaming\Python\Python312\site-packages
Requires: 
Required-by: imbalanced-learn, nltk, scikit-learn
