In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

# Load your dataset (replace 'your_dataset.csv' with your actual data)
data = pd.read_csv("C:/Users/PC/Downloads/challenge_22/challenge_22_data/train.csv", delimiter=";", decimal=",", na_values=["#VALEUR!"], index_col="time")
data.index = pd.to_datetime(data.index, format='%d/%m/%Y %H:%M')
data.drop(columns=['Network Frequency (Hz)' , 'CTRL anti givrage' ], inplace=True)
data.dropna(inplace=True)
test = pd.read_csv("C:/Users/PC/Downloads/challenge_22/challenge_22_data/test.csv", delimiter=";", decimal=",", na_values=["#VALEUR!"], index_col="time")
test.index = pd.to_datetime(test.index, format='%d/%m/%Y %H:%M')
test.drop(columns=['Network Frequency (Hz)' , 'CTRL anti givrage' ], inplace=True)

# Assuming your dataset has columns 'Feature1', 'Feature2', ..., 'Feature11', and 'Energy_Output'
# Modify these feature names accordingly to match your dataset

# Separate features (X) and target (y)
X = data.drop(columns="Net Power (MW)")
y = data["Net Power (MW)"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remove outliers from the target variable (optional)
q1 = np.percentile(y_train, 25)
q3 = np.percentile(y_train, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
y_train = np.clip(y_train, lower_bound, upper_bound)

# Initialize the models
lgb_model = lgb.LGBMRegressor(random_state=42)
catboost_model = CatBoostRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning using RandomizedSearchCV
param_distributions_lgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'num_leaves': [20, 30, 40],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

param_distributions_catboost = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'reg_lambda': [0, 0.1, 0.5]
}

param_distributions_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

models = {
    'lgb': lgb_model,
    'catboost': catboost_model,
    'xgb': xgb_model
}

best_models = {}

for model_name, model in models.items():
    if model_name == 'lgb':
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions_lgb, n_iter=50, cv=3, scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1)
    elif model_name == 'catboost':
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions_catboost, n_iter=50, cv=3, scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1)
    else:
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions_xgb, n_iter=50, cv=3, scoring='neg_mean_absolute_error', random_state=42, n_jobs=-1)
    
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    best_models[model_name] = best_model

# Evaluate models on training and test data
for model_name, model in best_models.items():
    y_train_pred = model.predict(X_train)
    mae_test = mean_absolute_error(y_train, y_train_pred)
    print(f"MAE on training data for {model_name}: {mae_test:.2f}")
    y_test_pred = model.predict(X_test)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    print(f"MAE on test data for {model_name}: {mae_test:.2f}")

# Rest of the code for predictions and saving to CSV remains the same
selected_model = best_models['lgb']
a_test = test.drop(columns="Net Power (MW)")

df_NetWPower = pd.DataFrame({
    'time': test.index,
    'Net Power (MW)': selected_model.predict(a_test),
})

df_NetWPower.to_csv('data/NetWPower.csv', date_format='%d/%m/%Y %H:%M', index=False, sep=';')
df_NetWPower.head()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 293076, number of used features: 9
[LightGBM] [Info] Start training from score 117.679737
0:	learn: 152.8061054	total: 733ms	remaining: 3m 39s
1:	learn: 137.7112882	total: 1.32s	remaining: 3m 16s
2:	learn: 124.0999966	total: 1.83s	remaining: 3m 1s
3:	learn: 111.8863346	total: 2.28s	remaining: 2m 48s
4:	learn: 100.9443232	total: 2.61s	remaining: 2m 33s
5:	learn: 91.0351237	total: 3.07s	remaining: 2m 30s
6:	learn: 82.1284492	total: 3.74s	remaining: 2m 36s
7:	learn: 74.1154725	total: 4.31s	remaining: 2m 37s
8:	learn: 66.9512809	total: 4.9s	remaining: 2m 38s
9:	learn: 60.4969341	total: 5.44s	remaining: 2m 37s
10:	learn: 54.7052550	total: 5.99s	remaining: 2m 37s
11:	learn: 49.4942620	total: 6.56s	remaining: 2m 37s
12:	learn: 44.7925639	total: 7.05s	remaining: 2m 35s
13:	learn: 40.5737249	total: 7.54s	remaining: 2m 33s
14:	learn: 36.8205600	tot

1 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\PC\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PC\anaconda3\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\PC\anaconda3\Lib\site-packages\xgboost\sklearn.py", line 1025, in fit
    self._Booster = train(
                    ^^^^^^
  File "C:\Users\PC\anaconda3\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
 

MAE on training data for lgb: 1.36
MAE on test data for lgb: 1.36
MAE on training data for catboost: 1.48
MAE on test data for catboost: 1.50
MAE on training data for xgb: 0.78
MAE on test data for xgb: 0.93


Unnamed: 0,time,Net Power (MW)
0,2022-04-01 00:00:00,8.738863
1,2022-04-01 00:01:00,8.738863
2,2022-04-01 00:02:00,8.738863
3,2022-04-01 00:03:00,8.738863
4,2022-04-01 00:04:00,8.738863


In [3]:
selected_model = best_models['xgb']
a_test = test.drop(columns="Net Power (MW)")

df_NetWPower = pd.DataFrame({
    'time': test.index,
    'Net Power (MW)': selected_model.predict(a_test),
})

df_NetWPower.to_csv('data/NetWPower.csv', date_format='%d/%m/%Y %H:%M', index=False, sep=';')
df_NetWPower.head()

Unnamed: 0,time,Net Power (MW)
0,2022-04-01 00:00:00,8.7446
1,2022-04-01 00:01:00,8.75024
2,2022-04-01 00:02:00,8.7446
3,2022-04-01 00:03:00,8.7446
4,2022-04-01 00:04:00,8.75024


In [4]:
selected_model = best_models['catboost']
a_test = test.drop(columns="Net Power (MW)")

df_NetWPower = pd.DataFrame({
    'time': test.index,
    'Net Power (MW)': selected_model.predict(a_test),
})

df_NetWPower.to_csv('data/NetWPower.csv', date_format='%d/%m/%Y %H:%M', index=False, sep=';')
df_NetWPower.head()

Unnamed: 0,time,Net Power (MW)
0,2022-04-01 00:00:00,8.680746
1,2022-04-01 00:01:00,8.680746
2,2022-04-01 00:02:00,8.680746
3,2022-04-01 00:03:00,8.680746
4,2022-04-01 00:04:00,8.680746


In [6]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, tpe, fmin

# Load and preprocess data as before
data = pd.read_csv("C:/Users/PC/Downloads/challenge_22/challenge_22_data/train.csv", delimiter=";", decimal=",", na_values=["#VALEUR!"], index_col="time")
data.index = pd.to_datetime(data.index, format='%d/%m/%Y %H:%M')
data.drop(columns=['Network Frequency (Hz)', 'CTRL anti givrage'], inplace=True)
data.dropna(inplace=True)
test_data = pd.read_csv("C:/Users/PC/Downloads/challenge_22/challenge_22_data/test.csv", delimiter=";", decimal=",", na_values=["#VALEUR!"], index_col="time")
test_data.index = pd.to_datetime(test_data.index, format='%d/%m/%Y %H:%M')
test_data.drop(columns=['Network Frequency (Hz)', 'CTRL anti givrage'], inplace=True)
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_data)
X = data.drop(columns="Net Power (MW)")
y = data["Net Power (MW)"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Hyperparameter Tuning using Bayesian Optimization
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
    'num_leaves': hp.choice('num_leaves', np.arange(30, 150, dtype=int)),
    # Add more hyperparameters here
}

def objective(params):
    model = lgb.LGBMRegressor(**params, random_state=42)
    model.fit(X_scaled, y_train)
    y_val_pred = model.predict(X_val_scaled)
    return mean_absolute_error(y_val, y_val_pred)

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, verbose=1)

# Train the final model with best hyperparameters on the entire training data
final_model = lgb.LGBMRegressor(**best, random_state=42)
final_model.fit(X_scaled, y)

# Make predictions on the test data
y_test_pred = final_model.predict(test_data_scaled)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({
    'time': test.index,
    'Net Power (MW)': y_test_pred,})

# Save predictions to a CSV file
predicti_df.to_csv("predicti.csv", index=False , date_format='%d/%m/%Y %H:%M', index=False, sep=';')


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Net Power (MW)
