In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import lightgbm as lgb
import seaborn as sns
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('public_data.csv')

# Create more sophisticated features
def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)

    # Rolling statistics for solar and load
    df['rolling_solar_mean'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
        lambda x: x.rolling(window=12, min_periods=1).mean())
    df['rolling_load_mean'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
        lambda x: x.rolling(window=12, min_periods=1).mean())

    # Lag features
    df['solar_lag1'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(1)
    df['load_lag1'] = df.groupby('periodID')['rendszerterheles_terv'].shift(1)

    # Additional aggregations
    stat_day = df.groupby(['season', 'weekday', 'hour', 'day_in_period'], as_index=False).agg({
        'target_flag': ['mean', 'std']
    })
    stat_day.columns = ['season', 'weekday', 'hour', 'day_in_period', 'target_day_mean', 'target_day_std']
    df = df.merge(stat_day, on=['season', 'weekday', 'hour', 'day_in_period'], how='left')

    # Interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)

    return df

# Create features
df = create_features(df)

# Original groupby operation
stat = df.groupby(['season','weekday','hour'],as_index=False).agg({
    'target_flag':'mean'
})
stat = stat.rename(columns={"target_flag":"alike_target_avg"})
df = df.merge(stat,on=['season','weekday','hour'],how='left')

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')

# Define features
features = [
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'holiday', 'weekday', 'weekend',
    'solar_becsult_dayahead', 'rendszerterheles_terv',
    'alike_target_avg', 'target_day_mean', 'target_day_std',
    'rolling_solar_mean', 'rolling_load_mean',
    'solar_lag1', 'load_lag1',
    'solar_load_ratio'
]

# Get period IDs and create folds
period_ids = df_filtered['periodID'].unique()
np.random.seed(42)
period_ids = np.random.permutation(period_ids)
fold_size = len(period_ids) // 3
folds = [
    period_ids[:fold_size],
    period_ids[fold_size:2*fold_size],
    period_ids[2*fold_size:]
]

# Perform cross validation
results = []
for fold_idx in range(3):
    test_periods = folds[fold_idx]
    train_periods = np.concatenate([folds[i] for i in range(3) if i != fold_idx])

    # Split data
    X_train = df_filtered[df_filtered['periodID'].isin(train_periods)][features]
    y_train = df_filtered[df_filtered['periodID'].isin(train_periods)]['target_flag']

    X_test = df_filtered[df_filtered['periodID'].isin(test_periods)][features]
    y_test = df_filtered[df_filtered['periodID'].isin(test_periods)]['target_flag']

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model - try both RandomForest and LightGBM
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )

    lgb_model = LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        n_jobs=-1
    )

    # Train both models
    rf_model.fit(X_train_scaled, y_train)
    lgb_model.fit(X_train_scaled, y_train)

    # Get predictions
    rf_pred = rf_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_rf = roc_auc_score(y_test, rf_pred)
    lgb_pred = lgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, lgb_pred)

    # Ensemble predictions (average of both models)
    y_pred_proba = (rf_pred + lgb_pred) / 2

    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'fold': fold_idx + 1,
        'roc_auc': roc_auc,
        'roc_auc_rf': roc_auc_rf,
        'roc_auc_lgb': roc_auc_lgb,
        'test_periods': len(test_periods),
        'train_periods': len(train_periods)
    })

# Display results
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': features,
    'rf_importance': rf_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('rf_importance', ascending=False).head(10))

test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

# Get predictions from both models
rf_pred = rf_model.predict_proba(X_submission_scaled)[:, 1] #0.63567
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1] #0.60441

# Ensemble predictions
final_predictions = (rf_pred + lgb_pred) / 2 #0.64951

# Create submission DataFrame
submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

# Save predictions
submission.to_csv('submission.csv', index=False)


In [None]:
df = pd.read_csv("_data/public_data.csv")
print(df.head())

In [3]:
# 0.64093------------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('public_data.csv')

# Create more sophisticated features
def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['weekday']/7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['weekday']/7)

    # Rolling statistics for solar and load
    for window in [6, 12, 24]:
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

    # Lag features
    for lag in [1, 3, 6]:
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)

    # Additional aggregations
    stat_day = df.groupby(['season', 'weekday', 'hour'], as_index=False).agg({
        'target_flag': ['mean', 'std']
    })
    stat_day.columns = ['season', 'weekday', 'hour', 'target_day_mean', 'target_day_std']
    df = df.merge(stat_day, on=['season', 'weekday', 'hour'], how='left')

    # Interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)

    return df

# Create features
df = create_features(df)

# Original groupby operation
stat = df.groupby(['season','weekday','hour'],as_index=False).agg({
    'target_flag':'mean'
})
stat = stat.rename(columns={"target_flag":"alike_target_avg"})
df = df.merge(stat,on=['season','weekday','hour'],how='left')

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.ffill().bfill()

# Define features
features = [
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'day_of_week_sin', 'day_of_week_cos',
    'holyday', 'weekday', 'weekend',
    'solar_becsult_dayahead', 'rendszerterheles_terv',
    'alike_target_avg', 'target_day_mean', 'target_day_std',
    'rolling_solar_mean_6', 'rolling_solar_mean_12', 'rolling_solar_mean_24',
    'rolling_load_mean_6', 'rolling_load_mean_12', 'rolling_load_mean_24',
    'solar_lag_1', 'solar_lag_3', 'solar_lag_6',
    'load_lag_1', 'load_lag_3', 'load_lag_6',
    'solar_load_ratio'
]

# Use GroupKFold for time series data
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# Perform cross validation
results = []
for fold, (train_index, val_index) in enumerate(gkf.split(df_filtered, groups=df_filtered['periodID'])):
    X_train = df_filtered.iloc[train_index][features]
    y_train = df_filtered.iloc[train_index]['target_flag']

    X_val = df_filtered.iloc[val_index][features]
    y_val = df_filtered.iloc[val_index]['target_flag']

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Apply SMOTE for handling class imbalance
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Train models with fixed parameters
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    rf_model.fit(X_train_resampled, y_train_resampled)

    lgb_model = LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    lgb_model.fit(X_train_resampled, y_train_resampled)

    # Get predictions
    rf_pred = rf_model.predict_proba(X_val_scaled)[:, 1]
    roc_auc_rf = roc_auc_score(y_val, rf_pred)
    lgb_pred = lgb_model.predict_proba(X_val_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_val, lgb_pred)

    # Weighted ensemble predictions
    y_pred_proba = 0.6 * rf_pred + 0.4 * lgb_pred

    roc_auc = roc_auc_score(y_val, y_pred_proba)

    results.append({
        'fold': fold + 1,
        'roc_auc': roc_auc,
        'roc_auc_rf': roc_auc_rf,
        'roc_auc_lgb': roc_auc_lgb,
        'val_samples': len(val_index),
        'train_samples': len(train_index)
    })

# Display results
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': features,
    'rf_importance': rf_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('rf_importance', ascending=False).head(10))

# Prepare submission data
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

# Get predictions from both models
rf_pred = rf_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]

# Ensemble predictions
final_predictions = 0.6 * rf_pred + 0.4 * lgb_pred

# Create submission DataFrame
submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

# Save predictions
submission.to_csv('bruh5.csv', index=False)

[LightGBM] [Info] Number of positive: 38094, number of negative: 38094
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6349
[LightGBM] [Info] Number of data points in the train set: 76188, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 37207, number of negative: 37207
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6094
[LightGBM] [Info] Number of data points in the train set: 74414, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 37476, number of neg

In [11]:
# 0.64014 --------------------------------------------------------------------------------------
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from bayes_opt import BayesianOptimization
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('public_data.csv')

# Convert periodID to datetime
df['periodID'] = pd.to_datetime(df['periodID'])

# Create more sophisticated features
def create_features(df):
    # Existing features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['weekday']/7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['weekday']/7)

    # Rolling statistics for solar and load
    for window in [12, 24, 48]:
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_solar_std_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df[f'rolling_load_std_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())

    # Lag features
    for lag in [1, 12, 24]:
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)

    # Additional aggregations
    stat_day = df.groupby(['season', 'weekday', 'hour'], as_index=False).agg({
        'target_flag': ['mean', 'std']
    })
    stat_day.columns = ['season', 'weekday', 'hour', 'target_day_mean', 'target_day_std']
    df = df.merge(stat_day, on=['season', 'weekday', 'hour'], how='left')

    # Interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['solar_load_interaction'] = df['solar_becsult_dayahead'] * df['rendszerterheles_terv']

    # Time-based features
    df['month'] = df['periodID'].dt.month
    df['day_of_month'] = df['periodID'].dt.day

    return df

# Create features
df = create_features(df)

# Original groupby operation
stat = df.groupby(['season','weekday','hour'],as_index=False).agg({
    'target_flag':'mean'
})
stat = stat.rename(columns={"target_flag":"alike_target_avg"})
df = df.merge(stat,on=['season','weekday','hour'],how='left')

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.ffill().bfill()

# Define features
features = [
    'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos',
    'holyday', 'weekend', 'solar_becsult_dayahead', 'rendszerterheles_terv',
    'alike_target_avg', 'target_day_mean', 'target_day_std',
    'rolling_solar_mean_24', 'rolling_load_mean_24',
    'rolling_solar_std_24', 'rolling_load_std_24',
    'solar_lag_1', 'load_lag_1', 'solar_load_ratio',
    'rolling_solar_mean_12', 'rolling_load_mean_12',
    'rolling_solar_mean_48', 'rolling_load_mean_48',
    'solar_load_interaction', 'month', 'day_of_month'
]

# Use TimeSeriesSplit for time series data
tscv = TimeSeriesSplit(n_splits=5)

def optimize_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_child_samples, min_child_weight):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': int(num_leaves),
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'bagging_fraction': max(min(bagging_fraction, 1), 0),
        'max_depth': int(max_depth),
        'min_child_samples': int(min_child_samples),
        'min_child_weight': min_child_weight
    }
    cv_scores = []
    for train_index, val_index in tscv.split(df_filtered):
        X_train, X_val = df_filtered.iloc[train_index][features], df_filtered.iloc[val_index][features]
        y_train, y_val = df_filtered.iloc[train_index]['target_flag'], df_filtered.iloc[val_index]['target_flag']

        model = LGBMClassifier(**params)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]
        cv_scores.append(roc_auc_score(y_val, preds))
    return np.mean(cv_scores)

# Bayesian optimization for LightGBM
lgbm_bo = BayesianOptimization(
    optimize_lgbm,
    {
        'num_leaves': (20, 100),
        'feature_fraction': (0.1, 1.0),
        'bagging_fraction': (0.1, 1.0),
        'max_depth': (5, 15),
        'min_child_samples': (1, 50),
        'min_child_weight': (1e-5, 1e-1)
    }
)
lgbm_bo.maximize(n_iter=20)

# Get best parameters
best_lgbm_params = {
    'num_leaves': int(lgbm_bo.max['params']['num_leaves']),
    'feature_fraction': max(min(lgbm_bo.max['params']['feature_fraction'], 1), 0),
    'bagging_fraction': max(min(lgbm_bo.max['params']['bagging_fraction'], 1), 0),
    'max_depth': int(lgbm_bo.max['params']['max_depth']),
    'min_child_samples': int(lgbm_bo.max['params']['min_child_samples']),
    'min_child_weight': lgbm_bo.max['params']['min_child_weight']
}

results = []
for fold, (train_index, val_index) in enumerate(tscv.split(df_filtered)):
    X_train = df_filtered.iloc[train_index][features]
    y_train = df_filtered.iloc[train_index]['target_flag']

    X_val = df_filtered.iloc[val_index][features]
    y_val = df_filtered.iloc[val_index]['target_flag']

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Calculate class weights
    class_weights = dict(zip(np.unique(y_train),
                             len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train))))

    # Train models
    cat_model = CatBoostClassifier(iterations=300, learning_rate=0.05, depth=6, l2_leaf_reg=3,
                                   random_seed=42, verbose=0, class_weights=class_weights)
    cat_model.fit(X_train_scaled, y_train)

    lgb_model = LGBMClassifier(**best_lgbm_params, n_estimators=300, random_state=42, n_jobs=-1)
    lgb_model.set_params(**{'class_weight': class_weights})
    lgb_model.fit(X_train_scaled, y_train)

    xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42,
                              n_jobs=-1, scale_pos_weight=class_weights[1]/class_weights[0])
    xgb_model.fit(X_train_scaled, y_train)

    # Get predictions
    cat_pred = cat_model.predict_proba(X_val_scaled)[:, 1]
    lgb_pred = lgb_model.predict_proba(X_val_scaled)[:, 1]
    xgb_pred = xgb_model.predict_proba(X_val_scaled)[:, 1]

    # Stack predictions
    stacked_features = np.column_stack((cat_pred, lgb_pred, xgb_pred))
    meta_model = LogisticRegression(class_weight=class_weights)
    meta_model.fit(stacked_features, y_val)

    final_pred = meta_model.predict_proba(stacked_features)[:, 1]

    roc_auc = roc_auc_score(y_val, final_pred)

    results.append({
        'fold': fold + 1,
        'roc_auc': roc_auc,
        'val_samples': len(val_index),
        'train_samples': len(train_index)
    })

# Display results
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Prepare submission data
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

# Get predictions from all models
cat_pred = cat_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]
xgb_pred = xgb_model.predict_proba(X_submission_scaled)[:, 1]

# Stack predictions
stacked_features = np.column_stack((cat_pred, lgb_pred, xgb_pred))
final_predictions = meta_model.predict_proba(stacked_features)[:, 1]

# Create submission DataFrame
submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

# Save predictions
submission.to_csv('bruh8.csv', index=False)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 6343, number of negative: 5625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3860
[LightGBM] [Info] Number of data points in the train set: 11968, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529997 -> initscore=0.120131
[LightGBM] [Info] Start training from score 0.120131
[LightGBM] [Info] Number of positive: 12611, number of negative: 11325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3895
[LightGBM] [Info] Number of data points in the 

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Load the data
df = pd.read_csv('public_data.csv')

# Convert periodID to datetime
df['periodID'] = pd.to_datetime(df['periodID'])

def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['weekday']/7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['weekday']/7)
    df['month_sin'] = np.sin(2 * np.pi * df['periodID'].dt.month/12)
    df['month_cos'] = np.cos(2 * np.pi * df['periodID'].dt.month/12)

    # Rolling statistics for solar and load
    for window in [12, 24, 48, 72, 168]:
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.ewm(span=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.ewm(span=window, min_periods=1).mean())
        df[f'rolling_solar_std_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df[f'rolling_load_std_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())

    # Lag features and differences
    for lag in [1, 24, 48, 168]:
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)
        df[f'solar_diff_{lag}'] = df['solar_becsult_dayahead'] - df[f'solar_lag_{lag}']
        df[f'load_diff_{lag}'] = df['rendszerterheles_terv'] - df[f'load_lag_{lag}']

    # Additional aggregations
    stat_day = df.groupby(['season', 'weekday', 'hour'], as_index=False).agg({
        'target_flag': ['mean', 'std']
    })
    stat_day.columns = ['season', 'weekday', 'hour', 'target_day_mean', 'target_day_std']
    df = df.merge(stat_day, on=['season', 'weekday', 'hour'], how='left')

    # Interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['solar_load_interaction'] = df['solar_becsult_dayahead'] * df['rendszerterheles_terv']

    # Time-based features
    df['day_of_year'] = df['periodID'].dt.dayofyear
    df['week_of_year'] = df['periodID'].dt.isocalendar().week
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

    return df

# Create features
df = create_features(df)

# Original groupby operation
stat = df.groupby(['season','weekday','hour'],as_index=False).agg({
    'target_flag':'mean'
})
stat = stat.rename(columns={"target_flag":"alike_target_avg"})
df = df.merge(stat,on=['season','weekday','hour'],how='left')

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.ffill().bfill()

# Define features
features = [
    'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos',
    'holyday', 'weekend', 'is_weekend', 'solar_becsult_dayahead', 'rendszerterheles_terv',
    'alike_target_avg', 'target_day_mean', 'target_day_std',
    'rolling_solar_mean_24', 'rolling_load_mean_24',
    'rolling_solar_std_24', 'rolling_load_std_24',
    'solar_lag_1', 'load_lag_1', 'solar_load_ratio',
    'rolling_solar_mean_12', 'rolling_load_mean_12',
    'rolling_solar_mean_48', 'rolling_load_mean_48',
    'rolling_solar_mean_72', 'rolling_load_mean_72',
    'rolling_solar_mean_168', 'rolling_load_mean_168',
    'solar_load_interaction', 'day_of_year', 'week_of_year',
    'solar_diff_24', 'load_diff_24', 'solar_diff_168', 'load_diff_168'
]

# Use TimeSeriesSplit for time series data
tscv = TimeSeriesSplit(n_splits=5)

# Perform cross validation
results = []
feature_importance_list = []

for fold, (train_index, val_index) in enumerate(tscv.split(df_filtered)):
    X_train = df_filtered.iloc[train_index][features]
    y_train = df_filtered.iloc[train_index]['target_flag']

    X_val = df_filtered.iloc[val_index][features]
    y_val = df_filtered.iloc[val_index]['target_flag']

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Feature selection
    selector = SelectFromModel(estimator=LGBMClassifier(n_estimators=100, random_state=42), max_features=30)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_val_selected = selector.transform(X_val_scaled)

    # Calculate class weights
    class_weights = dict(zip(np.unique(y_train),
                             len(y_train) / (len(np.unique(y_train)) * np.bincount(y_train))))

    # Define base models
    cat_model = CatBoostClassifier(iterations=500, learning_rate=0.03, depth=6, l2_leaf_reg=3,
                                   random_seed=42, verbose=0, class_weights=class_weights)
    lgb_model = LGBMClassifier(n_estimators=500, learning_rate=0.03, num_leaves=31,
                               random_state=42, class_weight=class_weights)
    xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.03, max_depth=6, random_state=42,
                              scale_pos_weight=class_weights[1]/class_weights[0])
    rf_model = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42,
                                      class_weight=class_weights)

    # Create stacking ensemble
    stacking_model = StackingClassifier(
        estimators=[('cat', cat_model), ('lgb', lgb_model), ('xgb', xgb_model), ('rf', rf_model)],
        final_estimator=LogisticRegression(class_weight=class_weights),
        cv=3
    )

    # Fit stacking model
    stacking_model.fit(X_train_selected, y_train)

    # Make predictions
    y_pred = stacking_model.predict_proba(X_val_selected)[:, 1]

    roc_auc = roc_auc_score(y_val, y_pred)

    results.append({
        'fold': fold + 1,
        'roc_auc': roc_auc,
        'val_samples': len(val_index),
        'train_samples': len(train_index)
    })

    # Collect feature importance (using LightGBM as an example)
    feature_importance_list.append(pd.DataFrame({
        'feature': selector.get_feature_names_out(features),
        'importance': stacking_model.named_estimators_['lgb'].feature_importances_
    }))

# Display results
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Display feature importance
feature_importance = pd.concat(feature_importance_list, ignore_index=True)
mean_feature_importance = feature_importance.groupby('feature').mean().sort_values('importance', ascending=False)
print("\nTop 15 most important features:")
print(mean_feature_importance.head(15))

# Prepare submission data
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)
X_submission_selected = selector.transform(X_submission_scaled)

final_predictions = stacking_model.predict_proba(X_submission_selected)[:, 1]

submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

submission.to_csv('submission.csv', index=False)

[LightGBM] [Info] Number of positive: 6343, number of negative: 5625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5970
[LightGBM] [Info] Number of data points in the train set: 11968, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529997 -> initscore=0.120131
[LightGBM] [Info] Start training from score 0.120131




[LightGBM] [Info] Number of positive: 6343, number of negative: 5625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3655
[LightGBM] [Info] Number of data points in the train set: 11968, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 4228, number of negative: 3750
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000411 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3655
[LightGBM] [Info] Number of data points in the train set: 7978, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499961 -> initscore=-0.000158
[LightGBM] [Info] Start training from score -0.000158
[LightGBM] [Info]

ValueError: input_features is not equal to feature_names_in_

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import xgboost as xgb

# Load the data
df = pd.read_csv('public_data.csv')

def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)

    # Enhanced rolling statistics with multiple windows
    for window in [4, 8, 12]:
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

    # Multiple lag features
    for lag in [1, 2]:
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)

    # Target encoding with smoothing to reduce overfitting
    def smooth_target_mean(group, alpha=5):
        """Smoothed target encoding with regularization"""
        counts = group.size
        means = group.mean()
        global_mean = means.mean()
        return (means * counts + global_mean * alpha) / (counts + alpha)

    # Target encoding by different groups with smoothing
    for group_cols in [
        ['season', 'hour'],
        ['weekday', 'hour'],
        ['season', 'weekday']
    ]:
        name = '_'.join(group_cols)
        agg = df.groupby(group_cols)['target_flag'].agg(['mean', 'std']).reset_index()
        agg['smoothed_mean'] = smooth_target_mean(agg['mean'])
        df[f'target_mean_{name}'] = df[group_cols].merge(
            agg[group_cols + ['smoothed_mean']],
            on=group_cols,
            how='left'
        )['smoothed_mean']

        if agg['std'].notna().any():  # Only add std if it exists
            df[f'target_std_{name}'] = df[group_cols].merge(
                agg[group_cols + ['std']],
                on=group_cols,
                how='left'
            )['std']

    # Core features and interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)

    # New interaction features
    df['solar_hour'] = df['solar_becsult_dayahead'] * np.sin(2 * np.pi * df['hour']/24)
    df['load_hour'] = df['rendszerterheles_terv'] * np.sin(2 * np.pi * df['hour']/24)
    df['solar_season'] = df['solar_becsult_dayahead'] * df['season']
    df['load_season'] = df['rendszerterheles_terv'] * df['season']

    return df

df = create_features(df)

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')

# Define features with new additions
features = [
    # Base time features
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'holyday', 'weekday', 'weekend',

    # Core predictors
    'solar_becsult_dayahead', 'rendszerterheles_terv',

    # Rolling means with different windows
    'rolling_solar_mean_4', 'rolling_solar_mean_8', 'rolling_solar_mean_12',
    'rolling_load_mean_4', 'rolling_load_mean_8', 'rolling_load_mean_12',

    # Lags
    'solar_lag_1', 'solar_lag_2',
    'load_lag_1', 'load_lag_2',

    # Target encodings (smoothed)
    'target_mean_season_hour', 'target_std_season_hour',
    'target_mean_weekday_hour', 'target_std_weekday_hour',
    'target_mean_season_weekday', 'target_std_season_weekday',

    # Interactions
    'solar_load_ratio', 'solar_hour', 'load_hour',
    'solar_season', 'load_season'
]

# Rest of the data preparation
period_ids = df_filtered['periodID'].unique()
np.random.seed(42)
period_ids = np.random.permutation(period_ids)
fold_size = len(period_ids) // 3
folds = [period_ids[:fold_size],
         period_ids[fold_size:2*fold_size],
         period_ids[2*fold_size:]]

results = []
for fold_idx in range(3):
    test_periods = folds[fold_idx]
    train_periods = np.concatenate([folds[i] for i in range(3) if i != fold_idx])

    X_train = df_filtered[df_filtered['periodID'].isin(train_periods)][features]
    y_train = df_filtered[df_filtered['periodID'].isin(train_periods)]['target_flag']
    X_test = df_filtered[df_filtered['periodID'].isin(test_periods)][features]
    y_test = df_filtered[df_filtered['periodID'].isin(test_periods)]['target_flag']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # XGBoost with balanced parameters
    xgb_model = xgb.XGBClassifier(
        n_estimators=150,
        learning_rate=0.07,
        max_depth=5,
        min_child_weight=2,
        gamma=0.1,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,  # Added level-wise column sampling
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # LightGBM with balanced a faszomat
    lgb_model = LGBMClassifier(
        n_estimators=150,
        learning_rate=0.05,
        num_leaves=25,
        max_depth=6,
        min_child_samples=20,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    )

    # Train with early stopping
    eval_set_xgb = [(X_train_scaled, y_train)]
    eval_set_lgb = [(X_train_scaled, y_train)]

    xgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_xgb)

    lgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_lgb)

    # Predictions
    xgb_pred = xgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, xgb_pred)
    lgb_pred = lgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, lgb_pred)

    # Weighted ensemble (slightly favoring LightGBM)
    y_pred_proba = 0.45 * xgb_pred + 0.55 * lgb_pred
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'fold': fold_idx + 1,
        'roc_auc': roc_auc,
        'roc_auc_xgb': roc_auc_xgb,
        'roc_auc_lgb': roc_auc_lgb,
        'test_periods': len(test_periods),
        'train_periods': len(train_periods)
    })

# Results display
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'xgb_importance': xgb_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('xgb_importance', ascending=False).head(10))

# Submission predictions
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

xgb_pred = xgb_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]
final_predictions = 0.45 * xgb_pred + 0.55 * lgb_pred

submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

submission.to_csv('submission.csv', index=False)

  df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')


[0]	validation_0-logloss:0.62765
[1]	validation_0-logloss:0.62224
[2]	validation_0-logloss:0.61715
[3]	validation_0-logloss:0.61262
[4]	validation_0-logloss:0.60872
[5]	validation_0-logloss:0.60524
[6]	validation_0-logloss:0.60194
[7]	validation_0-logloss:0.59898
[8]	validation_0-logloss:0.59653
[9]	validation_0-logloss:0.59403
[10]	validation_0-logloss:0.59163
[11]	validation_0-logloss:0.58959
[12]	validation_0-logloss:0.58755
[13]	validation_0-logloss:0.58576
[14]	validation_0-logloss:0.58411
[15]	validation_0-logloss:0.58274
[16]	validation_0-logloss:0.58137
[17]	validation_0-logloss:0.58015
[18]	validation_0-logloss:0.57889
[19]	validation_0-logloss:0.57782
[20]	validation_0-logloss:0.57665
[21]	validation_0-logloss:0.57543
[22]	validation_0-logloss:0.57441
[23]	validation_0-logloss:0.57337
[24]	validation_0-logloss:0.57266
[25]	validation_0-logloss:0.57194
[26]	validation_0-logloss:0.57105
[27]	validation_0-logloss:0.57009


Parameters: { "use_label_encoder" } are not used.



[28]	validation_0-logloss:0.56909
[29]	validation_0-logloss:0.56849
[30]	validation_0-logloss:0.56783
[31]	validation_0-logloss:0.56714
[32]	validation_0-logloss:0.56640
[33]	validation_0-logloss:0.56566
[34]	validation_0-logloss:0.56513
[35]	validation_0-logloss:0.56454
[36]	validation_0-logloss:0.56363
[37]	validation_0-logloss:0.56300
[38]	validation_0-logloss:0.56240
[39]	validation_0-logloss:0.56189
[40]	validation_0-logloss:0.56141
[41]	validation_0-logloss:0.56099
[42]	validation_0-logloss:0.56012
[43]	validation_0-logloss:0.55962
[44]	validation_0-logloss:0.55884
[45]	validation_0-logloss:0.55831
[46]	validation_0-logloss:0.55771
[47]	validation_0-logloss:0.55687
[48]	validation_0-logloss:0.55652
[49]	validation_0-logloss:0.55604
[50]	validation_0-logloss:0.55547
[51]	validation_0-logloss:0.55499
[52]	validation_0-logloss:0.55443
[53]	validation_0-logloss:0.55406
[54]	validation_0-logloss:0.55372
[55]	validation_0-logloss:0.55325
[56]	validation_0-logloss:0.55265
[57]	validatio

Parameters: { "use_label_encoder" } are not used.



[26]	validation_0-logloss:0.58380
[27]	validation_0-logloss:0.58312
[28]	validation_0-logloss:0.58256
[29]	validation_0-logloss:0.58197
[30]	validation_0-logloss:0.58146
[31]	validation_0-logloss:0.58019
[32]	validation_0-logloss:0.57905
[33]	validation_0-logloss:0.57839
[34]	validation_0-logloss:0.57765
[35]	validation_0-logloss:0.57694
[36]	validation_0-logloss:0.57646
[37]	validation_0-logloss:0.57587
[38]	validation_0-logloss:0.57480
[39]	validation_0-logloss:0.57430
[40]	validation_0-logloss:0.57362
[41]	validation_0-logloss:0.57255
[42]	validation_0-logloss:0.57191
[43]	validation_0-logloss:0.57121
[44]	validation_0-logloss:0.57065
[45]	validation_0-logloss:0.57004
[46]	validation_0-logloss:0.56969
[47]	validation_0-logloss:0.56901
[48]	validation_0-logloss:0.56869
[49]	validation_0-logloss:0.56811
[50]	validation_0-logloss:0.56782
[51]	validation_0-logloss:0.56727
[52]	validation_0-logloss:0.56679
[53]	validation_0-logloss:0.56647
[54]	validation_0-logloss:0.56629
[55]	validatio

Parameters: { "use_label_encoder" } are not used.



[25]	validation_0-logloss:0.58582
[26]	validation_0-logloss:0.58489
[27]	validation_0-logloss:0.58389
[28]	validation_0-logloss:0.58291
[29]	validation_0-logloss:0.58234
[30]	validation_0-logloss:0.58187
[31]	validation_0-logloss:0.58116
[32]	validation_0-logloss:0.58009
[33]	validation_0-logloss:0.57948
[34]	validation_0-logloss:0.57871
[35]	validation_0-logloss:0.57775
[36]	validation_0-logloss:0.57713
[37]	validation_0-logloss:0.57651
[38]	validation_0-logloss:0.57613
[39]	validation_0-logloss:0.57547
[40]	validation_0-logloss:0.57510
[41]	validation_0-logloss:0.57435
[42]	validation_0-logloss:0.57401
[43]	validation_0-logloss:0.57364
[44]	validation_0-logloss:0.57320
[45]	validation_0-logloss:0.57268
[46]	validation_0-logloss:0.57211
[47]	validation_0-logloss:0.57179
[48]	validation_0-logloss:0.57096
[49]	validation_0-logloss:0.57040
[50]	validation_0-logloss:0.57006
[51]	validation_0-logloss:0.56934
[52]	validation_0-logloss:0.56889
[53]	validation_0-logloss:0.56861
[54]	validatio

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import xgboost as xgb

# Load the data
df = pd.read_csv('public_data.csv')

def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)

    # Enhanced rolling statistics with multiple windows
    for window in [4, 8, 12]:
        # Mean
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

        # New: Standard deviation
        df[f'rolling_solar_std_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df[f'rolling_load_std_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())

    # Multiple lag features
    for lag in [1, 2]:
        # Basic lags
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)

        # New: Lag differences and ratios
        df[f'solar_lag_diff_{lag}'] = df['solar_becsult_dayahead'] - df[f'solar_lag_{lag}']
        df[f'load_lag_diff_{lag}'] = df['rendszerterheles_terv'] - df[f'load_lag_{lag}']
        df[f'solar_lag_ratio_{lag}'] = df['solar_becsult_dayahead'] / (df[f'solar_lag_{lag}'] + 1)
        df[f'load_lag_ratio_{lag}'] = df['rendszerterheles_terv'] / (df[f'load_lag_{lag}'] + 1)

    # Target encoding with smoothing to reduce overfitting
    def smooth_target_mean(group, alpha=5):
        counts = group.size
        means = group.mean()
        global_mean = means.mean()
        return (means * counts + global_mean * alpha) / (counts + alpha)

    # Target encoding by different groups with smoothing
    for group_cols in [
        ['season', 'hour'],
        ['weekday', 'hour'],
        ['season', 'weekday']
    ]:
        name = '_'.join(group_cols)
        agg = df.groupby(group_cols)['target_flag'].agg(['mean', 'std']).reset_index()
        agg['smoothed_mean'] = smooth_target_mean(agg['mean'])
        df[f'target_mean_{name}'] = df[group_cols].merge(
            agg[group_cols + ['smoothed_mean']],
            on=group_cols,
            how='left'
        )['smoothed_mean']

        if agg['std'].notna().any():
            df[f'target_std_{name}'] = df[group_cols].merge(
                agg[group_cols + ['std']],
                on=group_cols,
                how='left'
            )['std']

    # Core features and interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)

    # Interaction features
    df['solar_hour'] = df['solar_becsult_dayahead'] * np.sin(2 * np.pi * df['hour']/24)
    df['load_hour'] = df['rendszerterheles_terv'] * np.sin(2 * np.pi * df['hour']/24)
    df['solar_season'] = df['solar_becsult_dayahead'] * df['season']
    df['load_season'] = df['rendszerterheles_terv'] * df['season']

    return df

df = create_features(df)

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')

# Define features with new additions
features = [
    # Base time features
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'holyday', 'weekday', 'weekend',

    # Core predictors
    'solar_becsult_dayahead', 'rendszerterheles_terv',

    # Rolling means with different windows
    'rolling_solar_mean_4', 'rolling_solar_mean_8', 'rolling_solar_mean_12',
    'rolling_load_mean_4', 'rolling_load_mean_8', 'rolling_load_mean_12',

    # New: Rolling standard deviations
    'rolling_solar_std_4', 'rolling_solar_std_8', 'rolling_solar_std_12',
    'rolling_load_std_4', 'rolling_load_std_8', 'rolling_load_std_12',

    # Lags and new lag features
    'solar_lag_1', 'solar_lag_2',
    'load_lag_1', 'load_lag_2',
    'solar_lag_diff_1', 'solar_lag_diff_2',
    'load_lag_diff_1', 'load_lag_diff_2',
    'solar_lag_ratio_1', 'solar_lag_ratio_2',
    'load_lag_ratio_1', 'load_lag_ratio_2',

    # Target encodings (smoothed)
    'target_mean_season_hour', 'target_std_season_hour',
    'target_mean_weekday_hour', 'target_std_weekday_hour',
    'target_mean_season_weekday', 'target_std_season_weekday',

    # Interactions
    'solar_load_ratio', 'solar_hour', 'load_hour',
    'solar_season', 'load_season'
]

# Rest of the data preparation
period_ids = df_filtered['periodID'].unique()
np.random.seed(42)
period_ids = np.random.permutation(period_ids)
fold_size = len(period_ids) // 3
folds = [period_ids[:fold_size],
         period_ids[fold_size:2*fold_size],
         period_ids[2*fold_size:]]

results = []
for fold_idx in range(3):
    test_periods = folds[fold_idx]
    train_periods = np.concatenate([folds[i] for i in range(3) if i != fold_idx])

    X_train = df_filtered[df_filtered['periodID'].isin(train_periods)][features]
    y_train = df_filtered[df_filtered['periodID'].isin(train_periods)]['target_flag']
    X_test = df_filtered[df_filtered['periodID'].isin(test_periods)][features]
    y_test = df_filtered[df_filtered['periodID'].isin(test_periods)]['target_flag']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Using our best performing parameters
    xgb_model = xgb.XGBClassifier(
        n_estimators=150,
        learning_rate=0.07,
        max_depth=5,
        min_child_weight=2,
        gamma=0.1,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    lgb_model = LGBMClassifier(
        n_estimators=150,
        learning_rate=0.05,
        num_leaves=25,
        max_depth=6,
        min_child_samples=20,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    )

    # Train with early stopping
    eval_set_xgb = [(X_train_scaled, y_train)]
    eval_set_lgb = [(X_train_scaled, y_train)]

    xgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_xgb)

    lgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_lgb)

    # Predictions
    xgb_pred = xgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, xgb_pred)
    lgb_pred = lgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, lgb_pred)

    # Using our best ensemble weights
    y_pred_proba = 0.45 * xgb_pred + 0.55 * lgb_pred
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'fold': fold_idx + 1,
        'roc_auc': roc_auc,
        'roc_auc_xgb': roc_auc_xgb,
        'roc_auc_lgb': roc_auc_lgb,
        'test_periods': len(test_periods),
        'train_periods': len(train_periods)
    })

# Results display
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'xgb_importance': xgb_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('xgb_importance', ascending=False).head(10))

# Submission predictions
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

xgb_pred = xgb_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]
final_predictions = 0.45 * xgb_pred + 0.55 * lgb_pred

submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

submission.to_csv('brh14.csv', index=False)

  df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')


[0]	validation_0-logloss:0.62752
[1]	validation_0-logloss:0.62204
[2]	validation_0-logloss:0.61702
[3]	validation_0-logloss:0.61263
[4]	validation_0-logloss:0.60848
[5]	validation_0-logloss:0.60494
[6]	validation_0-logloss:0.60177
[7]	validation_0-logloss:0.59876
[8]	validation_0-logloss:0.59607
[9]	validation_0-logloss:0.59362
[10]	validation_0-logloss:0.59143
[11]	validation_0-logloss:0.58941
[12]	validation_0-logloss:0.58740
[13]	validation_0-logloss:0.58568
[14]	validation_0-logloss:0.58409
[15]	validation_0-logloss:0.58238
[16]	validation_0-logloss:0.58086
[17]	validation_0-logloss:0.57957
[18]	validation_0-logloss:0.57824
[19]	validation_0-logloss:0.57692
[20]	validation_0-logloss:0.57577
[21]	validation_0-logloss:0.57469
[22]	validation_0-logloss:0.57320
[23]	validation_0-logloss:0.57229
[24]	validation_0-logloss:0.57141


Parameters: { "use_label_encoder" } are not used.



[25]	validation_0-logloss:0.57034
[26]	validation_0-logloss:0.56950
[27]	validation_0-logloss:0.56882
[28]	validation_0-logloss:0.56815
[29]	validation_0-logloss:0.56720
[30]	validation_0-logloss:0.56605
[31]	validation_0-logloss:0.56520
[32]	validation_0-logloss:0.56436
[33]	validation_0-logloss:0.56346
[34]	validation_0-logloss:0.56288
[35]	validation_0-logloss:0.56222
[36]	validation_0-logloss:0.56159
[37]	validation_0-logloss:0.56095
[38]	validation_0-logloss:0.56030
[39]	validation_0-logloss:0.55928
[40]	validation_0-logloss:0.55872
[41]	validation_0-logloss:0.55808
[42]	validation_0-logloss:0.55770
[43]	validation_0-logloss:0.55705
[44]	validation_0-logloss:0.55661
[45]	validation_0-logloss:0.55572
[46]	validation_0-logloss:0.55516
[47]	validation_0-logloss:0.55453
[48]	validation_0-logloss:0.55421
[49]	validation_0-logloss:0.55389
[50]	validation_0-logloss:0.55312
[51]	validation_0-logloss:0.55266
[52]	validation_0-logloss:0.55196
[53]	validation_0-logloss:0.55146
[54]	validatio

Parameters: { "use_label_encoder" } are not used.



[24]	validation_0-logloss:0.58291
[25]	validation_0-logloss:0.58211
[26]	validation_0-logloss:0.58140
[27]	validation_0-logloss:0.58063
[28]	validation_0-logloss:0.57975
[29]	validation_0-logloss:0.57881
[30]	validation_0-logloss:0.57818
[31]	validation_0-logloss:0.57753
[32]	validation_0-logloss:0.57684
[33]	validation_0-logloss:0.57610
[34]	validation_0-logloss:0.57555
[35]	validation_0-logloss:0.57486
[36]	validation_0-logloss:0.57393
[37]	validation_0-logloss:0.57335
[38]	validation_0-logloss:0.57291
[39]	validation_0-logloss:0.57224
[40]	validation_0-logloss:0.57170
[41]	validation_0-logloss:0.57125
[42]	validation_0-logloss:0.57043
[43]	validation_0-logloss:0.56941
[44]	validation_0-logloss:0.56856
[45]	validation_0-logloss:0.56799
[46]	validation_0-logloss:0.56765
[47]	validation_0-logloss:0.56697
[48]	validation_0-logloss:0.56646
[49]	validation_0-logloss:0.56596
[50]	validation_0-logloss:0.56570
[51]	validation_0-logloss:0.56515
[52]	validation_0-logloss:0.56484
[53]	validatio

Parameters: { "use_label_encoder" } are not used.



[20]	validation_0-logloss:0.58870
[21]	validation_0-logloss:0.58751
[22]	validation_0-logloss:0.58624
[23]	validation_0-logloss:0.58506
[24]	validation_0-logloss:0.58401
[25]	validation_0-logloss:0.58315
[26]	validation_0-logloss:0.58233
[27]	validation_0-logloss:0.58158
[28]	validation_0-logloss:0.58056
[29]	validation_0-logloss:0.57975
[30]	validation_0-logloss:0.57918
[31]	validation_0-logloss:0.57848
[32]	validation_0-logloss:0.57774
[33]	validation_0-logloss:0.57693
[34]	validation_0-logloss:0.57642
[35]	validation_0-logloss:0.57580
[36]	validation_0-logloss:0.57527
[37]	validation_0-logloss:0.57471
[38]	validation_0-logloss:0.57429
[39]	validation_0-logloss:0.57353
[40]	validation_0-logloss:0.57300
[41]	validation_0-logloss:0.57257
[42]	validation_0-logloss:0.57187
[43]	validation_0-logloss:0.57130
[44]	validation_0-logloss:0.57078
[45]	validation_0-logloss:0.57033
[46]	validation_0-logloss:0.56981
[47]	validation_0-logloss:0.56933
[48]	validation_0-logloss:0.56883
[49]	validatio

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import xgboost as xgb

# Load the data
df = pd.read_csv('public_data.csv')

def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)

    # Enhanced rolling statistics with multiple windows
    for window in [4, 8, 12]:
        # Mean
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

        # Standard deviation
        df[f'rolling_solar_std_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df[f'rolling_load_std_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())

    # NEW: Exponential moving averages
    for alpha in [0.3, 0.5, 0.7]:
        df[f'solar_ema_{alpha}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.ewm(alpha=alpha, adjust=False).mean())
        df[f'load_ema_{alpha}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.ewm(alpha=alpha, adjust=False).mean())

    # Multiple lag features
    for lag in [1, 2]:
        # Basic lags
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)

        # Lag differences and ratios
        df[f'solar_lag_diff_{lag}'] = df['solar_becsult_dayahead'] - df[f'solar_lag_{lag}']
        df[f'load_lag_diff_{lag}'] = df['rendszerterheles_terv'] - df[f'load_lag_{lag}']
        df[f'solar_lag_ratio_{lag}'] = df['solar_becsult_dayahead'] / (df[f'solar_lag_{lag}'] + 1)
        df[f'load_lag_ratio_{lag}'] = df['rendszerterheles_terv'] / (df[f'load_lag_{lag}'] + 1)

    # Target encoding with smoothing
    def smooth_target_mean(group, alpha=5):
        counts = group.size
        means = group.mean()
        global_mean = means.mean()
        return (means * counts + global_mean * alpha) / (counts + alpha)

    for group_cols in [
        ['season', 'hour'],
        ['weekday', 'hour'],
        ['season', 'weekday']
    ]:
        name = '_'.join(group_cols)
        agg = df.groupby(group_cols)['target_flag'].agg(['mean', 'std']).reset_index()
        agg['smoothed_mean'] = smooth_target_mean(agg['mean'])
        df[f'target_mean_{name}'] = df[group_cols].merge(
            agg[group_cols + ['smoothed_mean']],
            on=group_cols,
            how='left'
        )['smoothed_mean']

        if agg['std'].notna().any():
            df[f'target_std_{name}'] = df[group_cols].merge(
                agg[group_cols + ['std']],
                on=group_cols,
                how='left'
            )['std']

    # Core features and interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)

    # Basic Interactions
    df['solar_hour'] = df['solar_becsult_dayahead'] * np.sin(2 * np.pi * df['hour']/24)
    df['load_hour'] = df['rendszerterheles_terv'] * np.sin(2 * np.pi * df['hour']/24)
    df['solar_season'] = df['solar_becsult_dayahead'] * df['season']
    df['load_season'] = df['rendszerterheles_terv'] * df['season']

    # NEW: Cross-features between lag ratios and time
    df['load_ratio_hour'] = df['load_lag_ratio_1'] * df['hour_sin']
    df['solar_ratio_hour'] = df['solar_lag_ratio_1'] * df['hour_sin']
    df['load_ratio_season'] = df['load_lag_ratio_1'] * df['season']
    df['solar_ratio_season'] = df['solar_lag_ratio_1'] * df['season']

    return df

df = create_features(df)

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')

# Define features including new ones
features = [
    # Base time features
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'holyday', 'weekday', 'weekend',

    # Core predictors
    'solar_becsult_dayahead', 'rendszerterheles_terv',

    # Rolling means and std
    'rolling_solar_mean_4', 'rolling_solar_mean_8', 'rolling_solar_mean_12',
    'rolling_load_mean_4', 'rolling_load_mean_8', 'rolling_load_mean_12',
    'rolling_solar_std_4', 'rolling_solar_std_8', 'rolling_solar_std_12',
    'rolling_load_std_4', 'rolling_load_std_8', 'rolling_load_std_12',

    # NEW: EMAs
    'solar_ema_0.3', 'solar_ema_0.5', 'solar_ema_0.7',
    'load_ema_0.3', 'load_ema_0.5', 'load_ema_0.7',

    # Lags and lag features
    'solar_lag_1', 'solar_lag_2',
    'load_lag_1', 'load_lag_2',
    'solar_lag_diff_1', 'solar_lag_diff_2',
    'load_lag_diff_1', 'load_lag_diff_2',
    'solar_lag_ratio_1', 'solar_lag_ratio_2',
    'load_lag_ratio_1', 'load_lag_ratio_2',

    # Target encodings
    'target_mean_season_hour', 'target_std_season_hour',
    'target_mean_weekday_hour', 'target_std_weekday_hour',
    'target_mean_season_weekday', 'target_std_season_weekday',

    # Interactions
    'solar_load_ratio', 'solar_hour', 'load_hour',
    'solar_season', 'load_season',

    # NEW: Cross-features
    'load_ratio_hour', 'solar_ratio_hour',
    'load_ratio_season', 'solar_ratio_season'
]

# Data preparation and model parameters stay the same
period_ids = df_filtered['periodID'].unique()
np.random.seed(42)
period_ids = np.random.permutation(period_ids)
fold_size = len(period_ids) // 3
folds = [period_ids[:fold_size],
         period_ids[fold_size:2*fold_size],
         period_ids[2*fold_size:]]

results = []
for fold_idx in range(3):
    test_periods = folds[fold_idx]
    train_periods = np.concatenate([folds[i] for i in range(3) if i != fold_idx])

    X_train = df_filtered[df_filtered['periodID'].isin(train_periods)][features]
    y_train = df_filtered[df_filtered['periodID'].isin(train_periods)]['target_flag']
    X_test = df_filtered[df_filtered['periodID'].isin(test_periods)][features]
    y_test = df_filtered[df_filtered['periodID'].isin(test_periods)]['target_flag']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Using our best performing parameters
    xgb_model = xgb.XGBClassifier(
        n_estimators=150,
        learning_rate=0.07,
        max_depth=5,
        min_child_weight=2,
        gamma=0.1,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    lgb_model = LGBMClassifier(
        n_estimators=150,
        learning_rate=0.05,
        num_leaves=25,
        max_depth=6,
        min_child_samples=20,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    )

    # Train with early stopping
    eval_set_xgb = [(X_train_scaled, y_train)]
    eval_set_lgb = [(X_train_scaled, y_train)]

    xgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_xgb)

    lgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_lgb)

    # Predictions
    xgb_pred = xgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, xgb_pred)
    lgb_pred = lgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, lgb_pred)

    # Using our best ensemble weights
    y_pred_proba = 0.45 * xgb_pred + 0.55 * lgb_pred
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'fold': fold_idx + 1,
        'roc_auc': roc_auc,
        'roc_auc_xgb': roc_auc_xgb,
        'roc_auc_lgb': roc_auc_lgb,
        'test_periods': len(test_periods),
        'train_periods': len(train_periods)
    })

# Results display
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'xgb_importance': xgb_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('xgb_importance', ascending=False).head(10))

# Submission predictions
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

xgb_pred = xgb_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]
final_predictions = 0.45 * xgb_pred + 0.55 * lgb_pred

submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

submission.to_csv('brh15.csv', index=False)

  df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')


[0]	validation_0-logloss:0.62756
[1]	validation_0-logloss:0.62214
[2]	validation_0-logloss:0.61691
[3]	validation_0-logloss:0.61230
[4]	validation_0-logloss:0.60817
[5]	validation_0-logloss:0.60444
[6]	validation_0-logloss:0.60124
[7]	validation_0-logloss:0.59833
[8]	validation_0-logloss:0.59559
[9]	validation_0-logloss:0.59297
[10]	validation_0-logloss:0.59080
[11]	validation_0-logloss:0.58871
[12]	validation_0-logloss:0.58685
[13]	validation_0-logloss:0.58509
[14]	validation_0-logloss:0.58321
[15]	validation_0-logloss:0.58131
[16]	validation_0-logloss:0.57973
[17]	validation_0-logloss:0.57831
[18]	validation_0-logloss:0.57693
[19]	validation_0-logloss:0.57579
[20]	validation_0-logloss:0.57461


Parameters: { "use_label_encoder" } are not used.



[21]	validation_0-logloss:0.57351
[22]	validation_0-logloss:0.57212
[23]	validation_0-logloss:0.57115
[24]	validation_0-logloss:0.57017
[25]	validation_0-logloss:0.56932
[26]	validation_0-logloss:0.56857
[27]	validation_0-logloss:0.56738
[28]	validation_0-logloss:0.56647
[29]	validation_0-logloss:0.56557
[30]	validation_0-logloss:0.56488
[31]	validation_0-logloss:0.56408
[32]	validation_0-logloss:0.56345
[33]	validation_0-logloss:0.56267
[34]	validation_0-logloss:0.56204
[35]	validation_0-logloss:0.56135
[36]	validation_0-logloss:0.55999
[37]	validation_0-logloss:0.55939
[38]	validation_0-logloss:0.55845
[39]	validation_0-logloss:0.55795
[40]	validation_0-logloss:0.55745
[41]	validation_0-logloss:0.55674
[42]	validation_0-logloss:0.55627
[43]	validation_0-logloss:0.55584
[44]	validation_0-logloss:0.55527
[45]	validation_0-logloss:0.55425
[46]	validation_0-logloss:0.55375
[47]	validation_0-logloss:0.55319
[48]	validation_0-logloss:0.55221
[49]	validation_0-logloss:0.55176
[50]	validatio

Parameters: { "use_label_encoder" } are not used.



[24]	validation_0-logloss:0.58241
[25]	validation_0-logloss:0.58160
[26]	validation_0-logloss:0.58079
[27]	validation_0-logloss:0.58005
[28]	validation_0-logloss:0.57951
[29]	validation_0-logloss:0.57855
[30]	validation_0-logloss:0.57782
[31]	validation_0-logloss:0.57662
[32]	validation_0-logloss:0.57585
[33]	validation_0-logloss:0.57532
[34]	validation_0-logloss:0.57478
[35]	validation_0-logloss:0.57396
[36]	validation_0-logloss:0.57338
[37]	validation_0-logloss:0.57274
[38]	validation_0-logloss:0.57206
[39]	validation_0-logloss:0.57151
[40]	validation_0-logloss:0.57109
[41]	validation_0-logloss:0.57052
[42]	validation_0-logloss:0.56991
[43]	validation_0-logloss:0.56885
[44]	validation_0-logloss:0.56817
[45]	validation_0-logloss:0.56734
[46]	validation_0-logloss:0.56696
[47]	validation_0-logloss:0.56587
[48]	validation_0-logloss:0.56512
[49]	validation_0-logloss:0.56457
[50]	validation_0-logloss:0.56423
[51]	validation_0-logloss:0.56376
[52]	validation_0-logloss:0.56304
[53]	validatio

Parameters: { "use_label_encoder" } are not used.



[15]	validation_0-logloss:0.59441
[16]	validation_0-logloss:0.59291
[17]	validation_0-logloss:0.59154
[18]	validation_0-logloss:0.59032
[19]	validation_0-logloss:0.58916
[20]	validation_0-logloss:0.58803
[21]	validation_0-logloss:0.58663
[22]	validation_0-logloss:0.58578
[23]	validation_0-logloss:0.58483
[24]	validation_0-logloss:0.58365
[25]	validation_0-logloss:0.58273
[26]	validation_0-logloss:0.58185
[27]	validation_0-logloss:0.58070
[28]	validation_0-logloss:0.57996
[29]	validation_0-logloss:0.57921
[30]	validation_0-logloss:0.57862
[31]	validation_0-logloss:0.57765
[32]	validation_0-logloss:0.57682
[33]	validation_0-logloss:0.57611
[34]	validation_0-logloss:0.57555
[35]	validation_0-logloss:0.57505
[36]	validation_0-logloss:0.57447
[37]	validation_0-logloss:0.57381
[38]	validation_0-logloss:0.57320
[39]	validation_0-logloss:0.57244
[40]	validation_0-logloss:0.57180
[41]	validation_0-logloss:0.57127
[42]	validation_0-logloss:0.57074
[43]	validation_0-logloss:0.57019
[44]	validatio

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import xgboost as xgb

# Load the data
df = pd.read_csv('public_data.csv')

def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)

    # Rolling statistics
    for window in [4, 8, 12]:
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_solar_std_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df[f'rolling_load_std_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())

    # Enhanced EMAs and EMA-based features
    for alpha in [0.3, 0.5, 0.7]:
        # Basic EMAs
        df[f'solar_ema_{alpha}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.ewm(alpha=alpha, adjust=False).mean())
        df[f'load_ema_{alpha}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.ewm(alpha=alpha, adjust=False).mean())

        # NEW: EMA differences and ratios
        df[f'solar_ema_diff_{alpha}'] = df['solar_becsult_dayahead'] - df[f'solar_ema_{alpha}']
        df[f'load_ema_diff_{alpha}'] = df['rendszerterheles_terv'] - df[f'load_ema_{alpha}']
        df[f'solar_ema_ratio_{alpha}'] = df['solar_becsult_dayahead'] / (df[f'solar_ema_{alpha}'] + 1)
        df[f'load_ema_ratio_{alpha}'] = df['rendszerterheles_terv'] / (df[f'load_ema_{alpha}'] + 1)

    # Lag features
    for lag in [1, 2]:
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)
        df[f'solar_lag_diff_{lag}'] = df['solar_becsult_dayahead'] - df[f'solar_lag_{lag}']
        df[f'load_lag_diff_{lag}'] = df['rendszerterheles_terv'] - df[f'load_lag_{lag}']
        df[f'solar_lag_ratio_{lag}'] = df['solar_becsult_dayahead'] / (df[f'solar_lag_{lag}'] + 1)
        df[f'load_lag_ratio_{lag}'] = df['rendszerterheles_terv'] / (df[f'load_lag_{lag}'] + 1)

    # Target encoding with smoothing
    def smooth_target_mean(group, alpha=5):
        counts = group.size
        means = group.mean()
        global_mean = means.mean()
        return (means * counts + global_mean * alpha) / (counts + alpha)

    for group_cols in [
        ['season', 'hour'],
        ['weekday', 'hour'],
        ['season', 'weekday']
    ]:
        name = '_'.join(group_cols)
        agg = df.groupby(group_cols)['target_flag'].agg(['mean', 'std']).reset_index()
        agg['smoothed_mean'] = smooth_target_mean(agg['mean'])
        df[f'target_mean_{name}'] = df[group_cols].merge(
            agg[group_cols + ['smoothed_mean']],
            on=group_cols,
            how='left'
        )['smoothed_mean']

        if agg['std'].notna().any():
            df[f'target_std_{name}'] = df[group_cols].merge(
                agg[group_cols + ['std']],
                on=group_cols,
                how='left'
            )['std']

    # Core features and basic interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['solar_hour'] = df['solar_becsult_dayahead'] * np.sin(2 * np.pi * df['hour']/24)
    df['load_hour'] = df['rendszerterheles_terv'] * np.sin(2 * np.pi * df['hour']/24)
    df['solar_season'] = df['solar_becsult_dayahead'] * df['season']
    df['load_season'] = df['rendszerterheles_terv'] * df['season']

    # Existing cross-features
    df['load_ratio_hour'] = df['load_lag_ratio_1'] * df['hour_sin']
    df['solar_ratio_hour'] = df['solar_lag_ratio_1'] * df['hour_sin']
    df['load_ratio_season'] = df['load_lag_ratio_1'] * df['season']
    df['solar_ratio_season'] = df['solar_lag_ratio_1'] * df['season']

    # NEW: More granular temporal interactions
    df['load_ratio_hour_season'] = df['load_lag_ratio_1'] * df['hour_sin'] * df['season']
    df['solar_ratio_hour_season'] = df['solar_lag_ratio_1'] * df['hour_sin'] * df['season']

    # NEW: EMA and time interactions
    df['load_ema_hour'] = df['load_ema_0.5'] * df['hour_sin']
    df['solar_ema_hour'] = df['solar_ema_0.5'] * df['hour_sin']

    return df

df = create_features(df)

# Filter first 4 days
df_filtered = df[df['day_in_period'] < 4]

# Fill NaN values
df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')

# Define features including new ones
features = [
    # Base time features
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'holyday', 'weekday', 'weekend',

    # Core predictors
    'solar_becsult_dayahead', 'rendszerterheles_terv',

    # Rolling features
    'rolling_solar_mean_4', 'rolling_solar_mean_8', 'rolling_solar_mean_12',
    'rolling_load_mean_4', 'rolling_load_mean_8', 'rolling_load_mean_12',
    'rolling_solar_std_4', 'rolling_solar_std_8', 'rolling_solar_std_12',
    'rolling_load_std_4', 'rolling_load_std_8', 'rolling_load_std_12',

    # EMAs and EMA-based features
    'solar_ema_0.3', 'solar_ema_0.5', 'solar_ema_0.7',
    'load_ema_0.3', 'load_ema_0.5', 'load_ema_0.7',
    'solar_ema_diff_0.3', 'solar_ema_diff_0.5', 'solar_ema_diff_0.7',
    'load_ema_diff_0.3', 'load_ema_diff_0.5', 'load_ema_diff_0.7',
    'solar_ema_ratio_0.3', 'solar_ema_ratio_0.5', 'solar_ema_ratio_0.7',
    'load_ema_ratio_0.3', 'load_ema_ratio_0.5', 'load_ema_ratio_0.7',

    # Lag features
    'solar_lag_1', 'solar_lag_2',
    'load_lag_1', 'load_lag_2',
    'solar_lag_diff_1', 'solar_lag_diff_2',
    'load_lag_diff_1', 'load_lag_diff_2',
    'solar_lag_ratio_1', 'solar_lag_ratio_2',
    'load_lag_ratio_1', 'load_lag_ratio_2',

    # Target encodings
    'target_mean_season_hour', 'target_std_season_hour',
    'target_mean_weekday_hour', 'target_std_weekday_hour',
    'target_mean_season_weekday', 'target_std_season_weekday',

    # Interactions
    'solar_load_ratio', 'solar_hour', 'load_hour',
    'solar_season', 'load_season',
    'load_ratio_hour', 'solar_ratio_hour',
    'load_ratio_season', 'solar_ratio_season',

    # NEW: Additional interactions
    'load_ratio_hour_season', 'solar_ratio_hour_season',
    'load_ema_hour', 'solar_ema_hour'
]

# Keep rest of the code same (data preparation, model parameters, training loop)
period_ids = df_filtered['periodID'].unique()
np.random.seed(42)
period_ids = np.random.permutation(period_ids)
fold_size = len(period_ids) // 3
folds = [period_ids[:fold_size],
         period_ids[fold_size:2*fold_size],
         period_ids[2*fold_size:]]

results = []
for fold_idx in range(3):
    test_periods = folds[fold_idx]
    train_periods = np.concatenate([folds[i] for i in range(3) if i != fold_idx])

    X_train = df_filtered[df_filtered['periodID'].isin(train_periods)][features]
    y_train = df_filtered[df_filtered['periodID'].isin(train_periods)]['target_flag']
    X_test = df_filtered[df_filtered['periodID'].isin(test_periods)][features]
    y_test = df_filtered[df_filtered['periodID'].isin(test_periods)]['target_flag']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    xgb_model = xgb.XGBClassifier(
        n_estimators=150,
        learning_rate=0.07,
        max_depth=5,
        min_child_weight=2,
        gamma=0.1,
        subsample=0.85,
        colsample_bytree=0.85,
        colsample_bylevel=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    lgb_model = LGBMClassifier(
        n_estimators=150,
        learning_rate=0.05,
        num_leaves=25,
        max_depth=6,
        min_child_samples=20,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    )

    eval_set_xgb = [(X_train_scaled, y_train)]
    eval_set_lgb = [(X_train_scaled, y_train)]

    xgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_xgb,)

    lgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_lgb)

    xgb_pred = xgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, xgb_pred)
    lgb_pred = lgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, lgb_pred)

    y_pred_proba = 0.45 * xgb_pred + 0.55 * lgb_pred
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'fold': fold_idx + 1,
        'roc_auc': roc_auc,
        'roc_auc_xgb': roc_auc_xgb,
        'roc_auc_lgb': roc_auc_lgb,
        'test_periods': len(test_periods),
        'train_periods': len(train_periods)
    })

# Results display
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'xgb_importance': xgb_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('xgb_importance', ascending=False).head(10))

# Submission predictions
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[features]
X_submission_scaled = scaler.transform(X_submission)

xgb_pred = xgb_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]
final_predictions = 0.45 * xgb_pred + 0.55 * lgb_pred

submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

submission.to_csv('brh16.csv', index=False)

  df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')


[0]	validation_0-logloss:0.62745
[1]	validation_0-logloss:0.62204
[2]	validation_0-logloss:0.61681
[3]	validation_0-logloss:0.61240
[4]	validation_0-logloss:0.60821
[5]	validation_0-logloss:0.60462
[6]	validation_0-logloss:0.60139
[7]	validation_0-logloss:0.59844
[8]	validation_0-logloss:0.59568
[9]	validation_0-logloss:0.59319
[10]	validation_0-logloss:0.59107
[11]	validation_0-logloss:0.58907
[12]	validation_0-logloss:0.58692
[13]	validation_0-logloss:0.58517
[14]	validation_0-logloss:0.58357


Parameters: { "use_label_encoder" } are not used.



[15]	validation_0-logloss:0.58182
[16]	validation_0-logloss:0.58001
[17]	validation_0-logloss:0.57866
[18]	validation_0-logloss:0.57731
[19]	validation_0-logloss:0.57606
[20]	validation_0-logloss:0.57494
[21]	validation_0-logloss:0.57382
[22]	validation_0-logloss:0.57270
[23]	validation_0-logloss:0.57155
[24]	validation_0-logloss:0.57072
[25]	validation_0-logloss:0.56997
[26]	validation_0-logloss:0.56894
[27]	validation_0-logloss:0.56822
[28]	validation_0-logloss:0.56750
[29]	validation_0-logloss:0.56665
[30]	validation_0-logloss:0.56576
[31]	validation_0-logloss:0.56474
[32]	validation_0-logloss:0.56393
[33]	validation_0-logloss:0.56284
[34]	validation_0-logloss:0.56206
[35]	validation_0-logloss:0.56070
[36]	validation_0-logloss:0.55958
[37]	validation_0-logloss:0.55874
[38]	validation_0-logloss:0.55827
[39]	validation_0-logloss:0.55785
[40]	validation_0-logloss:0.55736
[41]	validation_0-logloss:0.55660
[42]	validation_0-logloss:0.55586
[43]	validation_0-logloss:0.55529
[44]	validatio

Parameters: { "use_label_encoder" } are not used.



[14]	validation_0-logloss:0.59359
[15]	validation_0-logloss:0.59213
[16]	validation_0-logloss:0.59056
[17]	validation_0-logloss:0.58924
[18]	validation_0-logloss:0.58779
[19]	validation_0-logloss:0.58662
[20]	validation_0-logloss:0.58536
[21]	validation_0-logloss:0.58419
[22]	validation_0-logloss:0.58313
[23]	validation_0-logloss:0.58210
[24]	validation_0-logloss:0.58096
[25]	validation_0-logloss:0.58009
[26]	validation_0-logloss:0.57916
[27]	validation_0-logloss:0.57836
[28]	validation_0-logloss:0.57780
[29]	validation_0-logloss:0.57675
[30]	validation_0-logloss:0.57578
[31]	validation_0-logloss:0.57524
[32]	validation_0-logloss:0.57455
[33]	validation_0-logloss:0.57390
[34]	validation_0-logloss:0.57334
[35]	validation_0-logloss:0.57193
[36]	validation_0-logloss:0.57132
[37]	validation_0-logloss:0.57077
[38]	validation_0-logloss:0.56996
[39]	validation_0-logloss:0.56953
[40]	validation_0-logloss:0.56903
[41]	validation_0-logloss:0.56864
[42]	validation_0-logloss:0.56753
[43]	validatio

Parameters: { "use_label_encoder" } are not used.



[20]	validation_0-logloss:0.58760
[21]	validation_0-logloss:0.58646
[22]	validation_0-logloss:0.58533
[23]	validation_0-logloss:0.58398
[24]	validation_0-logloss:0.58300
[25]	validation_0-logloss:0.58225
[26]	validation_0-logloss:0.58134
[27]	validation_0-logloss:0.58050
[28]	validation_0-logloss:0.57954
[29]	validation_0-logloss:0.57877
[30]	validation_0-logloss:0.57823
[31]	validation_0-logloss:0.57717
[32]	validation_0-logloss:0.57663
[33]	validation_0-logloss:0.57542
[34]	validation_0-logloss:0.57489
[35]	validation_0-logloss:0.57403
[36]	validation_0-logloss:0.57344
[37]	validation_0-logloss:0.57241
[38]	validation_0-logloss:0.57203
[39]	validation_0-logloss:0.57137
[40]	validation_0-logloss:0.57081
[41]	validation_0-logloss:0.57025
[42]	validation_0-logloss:0.56983
[43]	validation_0-logloss:0.56920
[44]	validation_0-logloss:0.56866
[45]	validation_0-logloss:0.56818
[46]	validation_0-logloss:0.56737
[47]	validation_0-logloss:0.56692
[48]	validation_0-logloss:0.56643
[49]	validatio

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import xgboost as xgb

# Load the data
df = pd.read_csv('public_data.csv')

def create_features(df):
    # Time-based features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/60)

    # Enhanced rolling statistics with multiple windows
    for window in [4, 8, 12]:
        # Mean
        df[f'rolling_solar_mean_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())
        df[f'rolling_load_mean_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean())

        # Standard deviation
        df[f'rolling_solar_std_{window}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())
        df[f'rolling_load_std_{window}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std())

    # Exponential moving averages
    for alpha in [0.3, 0.5, 0.7]:
        df[f'solar_ema_{alpha}'] = df.groupby('periodID')['solar_becsult_dayahead'].transform(
            lambda x: x.ewm(alpha=alpha, adjust=False).mean())
        df[f'load_ema_{alpha}'] = df.groupby('periodID')['rendszerterheles_terv'].transform(
            lambda x: x.ewm(alpha=alpha, adjust=False).mean())

    # Multiple lag features
    for lag in [1, 2]:
        # Basic lags
        df[f'solar_lag_{lag}'] = df.groupby('periodID')['solar_becsult_dayahead'].shift(lag)
        df[f'load_lag_{lag}'] = df.groupby('periodID')['rendszerterheles_terv'].shift(lag)

        # Lag differences and ratios
        df[f'solar_lag_diff_{lag}'] = df['solar_becsult_dayahead'] - df[f'solar_lag_{lag}']
        df[f'load_lag_diff_{lag}'] = df['rendszerterheles_terv'] - df[f'load_lag_{lag}']
        df[f'solar_lag_ratio_{lag}'] = df['solar_becsult_dayahead'] / (df[f'solar_lag_{lag}'] + 1)
        df[f'load_lag_ratio_{lag}'] = df['rendszerterheles_terv'] / (df[f'load_lag_{lag}'] + 1)

    # Target encoding with increased smoothing
    def smooth_target_mean(group, alpha=8):  # Increased smoothing
        counts = group.size
        means = group.mean()
        global_mean = means.mean()
        return (means * counts + global_mean * alpha) / (counts + alpha)

    for group_cols in [
        ['season', 'hour'],
        ['weekday', 'hour'],
        ['season', 'weekday']
    ]:
        name = '_'.join(group_cols)
        agg = df.groupby(group_cols)['target_flag'].agg(['mean', 'std']).reset_index()
        agg['smoothed_mean'] = smooth_target_mean(agg['mean'])
        df[f'target_mean_{name}'] = df[group_cols].merge(
            agg[group_cols + ['smoothed_mean']],
            on=group_cols,
            how='left'
        )['smoothed_mean']

        if agg['std'].notna().any():
            df[f'target_std_{name}'] = df[group_cols].merge(
                agg[group_cols + ['std']],
                on=group_cols,
                how='left'
            )['std']

    # Core features and interactions
    df['solar_load_ratio'] = df['solar_becsult_dayahead'] / (df['rendszerterheles_terv'] + 1)
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['solar_hour'] = df['solar_becsult_dayahead'] * np.sin(2 * np.pi * df['hour']/24)
    df['load_hour'] = df['rendszerterheles_terv'] * np.sin(2 * np.pi * df['hour']/24)
    df['solar_season'] = df['solar_becsult_dayahead'] * df['season']
    df['load_season'] = df['rendszerterheles_terv'] * df['season']

    # Cross-features
    df['load_ratio_hour'] = df['load_lag_ratio_1'] * df['hour_sin']
    df['solar_ratio_hour'] = df['solar_lag_ratio_1'] * df['hour_sin']
    df['load_ratio_season'] = df['load_lag_ratio_1'] * df['season']
    df['solar_ratio_season'] = df['solar_lag_ratio_1'] * df['season']

    return df

# Create initial features
df = create_features(df)
df_filtered = df[df['day_in_period'] < 4]
df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')

# Initial feature list
initial_features = [
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
    'holyday', 'weekday', 'weekend',
    'solar_becsult_dayahead', 'rendszerterheles_terv',
    'rolling_solar_mean_4', 'rolling_solar_mean_8', 'rolling_solar_mean_12',
    'rolling_load_mean_4', 'rolling_load_mean_8', 'rolling_load_mean_12',
    'rolling_solar_std_4', 'rolling_solar_std_8', 'rolling_solar_std_12',
    'rolling_load_std_4', 'rolling_load_std_8', 'rolling_load_std_12',
    'solar_ema_0.3', 'solar_ema_0.5', 'solar_ema_0.7',
    'load_ema_0.3', 'load_ema_0.5', 'load_ema_0.7',
    'solar_lag_1', 'solar_lag_2', 'load_lag_1', 'load_lag_2',
    'solar_lag_diff_1', 'solar_lag_diff_2',
    'load_lag_diff_1', 'load_lag_diff_2',
    'solar_lag_ratio_1', 'solar_lag_ratio_2',
    'load_lag_ratio_1', 'load_lag_ratio_2',
    'target_mean_season_hour', 'target_std_season_hour',
    'target_mean_weekday_hour', 'target_std_weekday_hour',
    'target_mean_season_weekday', 'target_std_season_weekday',
    'solar_load_ratio', 'solar_hour', 'load_hour',
    'solar_season', 'load_season',
    'load_ratio_hour', 'solar_ratio_hour',
    'load_ratio_season', 'solar_ratio_season'
]

# Function to get feature importance and select top features
def get_important_features(X, y, feature_names, threshold=0.01):
    # Train initial models
    xgb_init = xgb.XGBClassifier(random_state=42, n_jobs=-1)
    lgb_init = LGBMClassifier(random_state=42, n_jobs=-1)

    xgb_init.fit(X, y)
    lgb_init.fit(X, y)

    # Get feature importance
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'xgb_importance': xgb_init.feature_importances_,
        'lgb_importance': lgb_init.feature_importances_
    })

    # Normalize and combine importances
    importance_df['combined_importance'] = (
        0.5 * importance_df['xgb_importance'] / importance_df['xgb_importance'].max() +
        0.5 * importance_df['lgb_importance'] / importance_df['lgb_importance'].max()
    )

    # Select features above threshold
    selected_features = importance_df[importance_df['combined_importance'] > threshold]['feature'].tolist()
    return selected_features

# Prepare data for feature selection
X_init = df_filtered[initial_features]
y_init = df_filtered['target_flag']
scaler = StandardScaler()
X_init_scaled = scaler.fit_transform(X_init)

# Get important features
selected_features = get_important_features(X_init_scaled, y_init, initial_features)

# Cross validation with selected features
period_ids = df_filtered['periodID'].unique()
np.random.seed(42)
period_ids = np.random.permutation(period_ids)
fold_size = len(period_ids) // 3
folds = [
    period_ids[:fold_size],
    period_ids[fold_size:2*fold_size],
    period_ids[2*fold_size:]
]

results = []
for fold_idx in range(3):
    test_periods = folds[fold_idx]
    train_periods = np.concatenate([folds[i] for i in range(3) if i != fold_idx])

    X_train = df_filtered[df_filtered['periodID'].isin(train_periods)][selected_features]
    y_train = df_filtered[df_filtered['periodID'].isin(train_periods)]['target_flag']
    X_test = df_filtered[df_filtered['periodID'].isin(test_periods)][selected_features]
    y_test = df_filtered[df_filtered['periodID'].isin(test_periods)]['target_flag']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Refined parameters for XGBoost
    xgb_model = xgb.XGBClassifier(
        n_estimators=180,
        learning_rate=0.065,
        max_depth=5,
        min_child_weight=3,
        gamma=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        colsample_bylevel=0.9,
        reg_alpha=0.15,
        reg_lambda=1.2,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Refined parameters for LightGBM
    lgb_model = LGBMClassifier(
        n_estimators=180,
        learning_rate=0.045,
        num_leaves=28,
        max_depth=6,
        min_child_samples=25,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.15,
        reg_lambda=1.2,
        random_state=42,
        n_jobs=-1
    )

    eval_set_xgb = [(X_train_scaled, y_train)]
    eval_set_lgb = [(X_train_scaled, y_train)]

    xgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_xgb)

    lgb_model.fit(X_train_scaled, y_train,
                 eval_set=eval_set_lgb)

    xgb_pred = xgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_xgb = roc_auc_score(y_test, xgb_pred)
    lgb_pred = lgb_model.predict_proba(X_test_scaled)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, lgb_pred)

    y_pred_proba = 0.45 * xgb_pred + 0.55 * lgb_pred
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results.append({
        'fold': fold_idx + 1,
        'roc_auc': roc_auc,
        'roc_auc_xgb': roc_auc_xgb,
        'roc_auc_lgb': roc_auc_lgb,
        'test_periods': len(test_periods),
        'train_periods': len(train_periods)
    })

# Results display
results_df = pd.DataFrame(results)
print("\nResults for each fold:")
print(results_df)
print("\nAverage ROC AUC score:", results_df['roc_auc'].mean())
print("Standard deviation:", results_df['roc_auc'].std())
print("\nSelected features:", len(selected_features))
print("\nTop 10 most important features:")
feature_importance = pd.DataFrame({
    'feature': selected_features,
    'xgb_importance': xgb_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})
print(feature_importance.sort_values('xgb_importance', ascending=False).head(10))

# Submission predictions
test_data = df[df['day_in_period'] == 4]
X_submission = test_data[selected_features]
X_submission_scaled = scaler.transform(X_submission)

xgb_pred = xgb_model.predict_proba(X_submission_scaled)[:, 1]
lgb_pred = lgb_model.predict_proba(X_submission_scaled)[:, 1]
final_predictions = 0.45 * xgb_pred + 0.55 * lgb_pred

submission = pd.DataFrame({
    'rowID': test_data['rowID'],
    'target_flag': final_predictions
})

submission.to_csv('brh17.csv', index=False)

  df_filtered = df_filtered.fillna(method='ffill').fillna(method='bfill')


[LightGBM] [Info] Number of positive: 47277, number of negative: 24531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11004
[LightGBM] [Info] Number of data points in the train set: 71808, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.658381 -> initscore=0.656086
[LightGBM] [Info] Start training from score 0.656086
[0]	validation_0-logloss:0.62778
[1]	validation_0-logloss:0.62260
[2]	validation_0-logloss:0.61768
[3]	validation_0-logloss:0.61346
[4]	validation_0-logloss:0.60963
[5]	validation_0-logloss:0.60626
[6]	validation_0-logloss:0.60317
[7]	validation_0-logloss:0.60049
[8]	validation_0-logloss:0.59795
[9]	validation_0-logloss:0.59558
[10]	validation_0-logloss:0.59352
[11]	validation_0-logloss:0.59156
[12]	validation_0-logloss:0.58954
[13]	valida

Parameters: { "use_label_encoder" } are not used.



[20]	validation_0-logloss:0.57769
[21]	validation_0-logloss:0.57668
[22]	validation_0-logloss:0.57549
[23]	validation_0-logloss:0.57420
[24]	validation_0-logloss:0.57331
[25]	validation_0-logloss:0.57246
[26]	validation_0-logloss:0.57167
[27]	validation_0-logloss:0.57046
[28]	validation_0-logloss:0.56960
[29]	validation_0-logloss:0.56867
[30]	validation_0-logloss:0.56796
[31]	validation_0-logloss:0.56730
[32]	validation_0-logloss:0.56656
[33]	validation_0-logloss:0.56583
[34]	validation_0-logloss:0.56525
[35]	validation_0-logloss:0.56452
[36]	validation_0-logloss:0.56380
[37]	validation_0-logloss:0.56284
[38]	validation_0-logloss:0.56227
[39]	validation_0-logloss:0.56167
[40]	validation_0-logloss:0.56111
[41]	validation_0-logloss:0.56069
[42]	validation_0-logloss:0.55997
[43]	validation_0-logloss:0.55935
[44]	validation_0-logloss:0.55861
[45]	validation_0-logloss:0.55808
[46]	validation_0-logloss:0.55726
[47]	validation_0-logloss:0.55631
[48]	validation_0-logloss:0.55577
[49]	validatio

Parameters: { "use_label_encoder" } are not used.



[21]	validation_0-logloss:0.58697
[22]	validation_0-logloss:0.58590
[23]	validation_0-logloss:0.58464
[24]	validation_0-logloss:0.58376
[25]	validation_0-logloss:0.58297
[26]	validation_0-logloss:0.58204
[27]	validation_0-logloss:0.58124
[28]	validation_0-logloss:0.58049
[29]	validation_0-logloss:0.57953
[30]	validation_0-logloss:0.57876
[31]	validation_0-logloss:0.57806
[32]	validation_0-logloss:0.57747
[33]	validation_0-logloss:0.57646
[34]	validation_0-logloss:0.57602
[35]	validation_0-logloss:0.57549
[36]	validation_0-logloss:0.57471
[37]	validation_0-logloss:0.57413
[38]	validation_0-logloss:0.57369
[39]	validation_0-logloss:0.57296
[40]	validation_0-logloss:0.57240
[41]	validation_0-logloss:0.57179
[42]	validation_0-logloss:0.57088
[43]	validation_0-logloss:0.57043
[44]	validation_0-logloss:0.56940
[45]	validation_0-logloss:0.56883
[46]	validation_0-logloss:0.56767
[47]	validation_0-logloss:0.56703
[48]	validation_0-logloss:0.56646
[49]	validation_0-logloss:0.56586
[50]	validatio

Parameters: { "use_label_encoder" } are not used.



[22]	validation_0-logloss:0.58809
[23]	validation_0-logloss:0.58725
[24]	validation_0-logloss:0.58636
[25]	validation_0-logloss:0.58516
[26]	validation_0-logloss:0.58396
[27]	validation_0-logloss:0.58307
[28]	validation_0-logloss:0.58187
[29]	validation_0-logloss:0.58101
[30]	validation_0-logloss:0.57990
[31]	validation_0-logloss:0.57926
[32]	validation_0-logloss:0.57871
[33]	validation_0-logloss:0.57819
[34]	validation_0-logloss:0.57754
[35]	validation_0-logloss:0.57700
[36]	validation_0-logloss:0.57635
[37]	validation_0-logloss:0.57570
[38]	validation_0-logloss:0.57519
[39]	validation_0-logloss:0.57463
[40]	validation_0-logloss:0.57408
[41]	validation_0-logloss:0.57352
[42]	validation_0-logloss:0.57305
[43]	validation_0-logloss:0.57263
[44]	validation_0-logloss:0.57197
[45]	validation_0-logloss:0.57153
[46]	validation_0-logloss:0.57116
[47]	validation_0-logloss:0.57059
[48]	validation_0-logloss:0.56999
[49]	validation_0-logloss:0.56928
[50]	validation_0-logloss:0.56837
[51]	validatio