In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

2024-12-09 01:34:23.384152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733679263.461460    2500 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733679263.480785    2500 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-09 01:34:23.664351: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
csv_file_path = "data/train.csv"
data = pd.read_csv(csv_file_path, sep=';', encoding='utf-8', index_col=False)

numerical_columns = ['T', 'Po', 'U', 'Ff', 'sinα', 'Ho', 'ALLSKY_SFC_SW_DWN']

for column in numerical_columns:
    data[column] = data[column].astype(str).str.replace(',', '.').astype(float)

data['MO'] = data['MO'].astype(int)
data['DY'] = data['DY'].astype(int)

data['DayOfYear'] = pd.to_datetime(
    data[['YEAR', 'MO', 'DY']].astype(str).agg('-'.join, axis=1), errors='coerce'
).dt.dayofyear.fillna(0).astype(int)

data['sin_month'] = np.sin(2 * np.pi * data['MO'] / 12)
data['cos_month'] = np.cos(2 * np.pi * data['MO'] / 12)

data['sin_hour'] = np.sin(2 * np.pi * data['HR'] / 24)
data['cos_hour'] = np.cos(2 * np.pi * data['HR'] / 24)
data['sin_day_year'] = np.sin(2 * np.pi * data['DayOfYear'] / 365)
data['cos_day_year'] = np.cos(2 * np.pi * data['DayOfYear'] / 365)

features = ['sin_month', 'cos_month', 'sin_hour', 'cos_hour', 'sin_day_year', 'cos_day_year',
            'T', 'Po', 'U', 'Ff', 'sinα', 'Ho']
target = 'ALLSKY_SFC_SW_DWN'

X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Обучение RandomForest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

Training RandomForest...


In [None]:
# Обучение XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_r2 = r2_score(y_test, xgb_pred)



Training XGBoost...


In [None]:
# Обучение LightGBM
lgbm_model = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
lgbm_model.fit(X_train, y_train)
lgbm_pred = lgbm_model.predict(X_test)
lgbm_mae = mean_absolute_error(y_test, lgbm_pred)
lgbm_rmse = np.sqrt(mean_squared_error(y_test, lgbm_pred))
lgbm_r2 = r2_score(y_test, lgbm_pred)


Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 107048, number of used features: 12
[LightGBM] [Info] Start training from score 147.017514


In [21]:
print("\nModel Performance:")
results = {
    'RandomForest': {'MAE': rf_mae, 'RMSE': rf_rmse, 'R²': rf_r2},
    'XGBoost': {'MAE': xgb_mae, 'RMSE': xgb_rmse, 'R²': xgb_r2},
    'LightGBM': {'MAE': lgbm_mae, 'RMSE': lgbm_rmse, 'R²': lgbm_r2},
}

results_df = pd.DataFrame(results).T
print(results_df)


Model Performance:
                    MAE       RMSE        R²
RandomForest  25.127739  53.244435  0.938451
XGBoost       26.023440  52.976058  0.939070
LightGBM      26.062043  52.972918  0.939077


In [None]:
# Загружаем данные
test = "test.csv"
data_test = pd.read_csv(test, sep=';', encoding='utf-8', index_col=False)
print(f"Original number of rows: {len(data_test)}")

# Преобразуем данные в числовой формат, заменяя запятые на точки
num_columns = ['T', 'Po', 'U', 'Ff', 'sinα', 'Ho']
for column in num_columns:
    data_test[column] = data_test[column].astype(str).str.replace(',', '.').astype(float)

# Преобразуем месяц и день в целочисленные значения
data_test['MO'] = data_test['MO'].astype(int)
data_test['DY'] = data_test['DY'].astype(int)

# Создаем новый столбец для дня года
data_test['DayOfYear'] = pd.to_datetime(
    data_test[['YEAR', 'MO', 'DY']].astype(str).agg('-'.join, axis=1), errors='coerce'
).dt.dayofyear.fillna(0).astype(int)

# Добавляем синусоидальные и косинусоидальные признаки для месяца, часа и дня года
data_test['sin_month'] = np.sin(2 * np.pi * data_test['MO'] / 12)
data_test['cos_month'] = np.cos(2 * np.pi * data_test['MO'] / 12)

data_test['sin_hour'] = np.sin(2 * np.pi * data_test['HR'] / 24)
data_test['cos_hour'] = np.cos(2 * np.pi * data_test['HR'] / 24)
data_test['sin_day_year'] = np.sin(2 * np.pi * data_test['DayOfYear'] / 365)
data_test['cos_day_year'] = np.cos(2 * np.pi * data_test['DayOfYear'] / 365)

# Указываем, какие признаки будем использовать
features = ['sin_month', 'cos_month', 'sin_hour', 'cos_hour', 'sin_day_year', 'cos_day_year',
            'T', 'Po', 'U', 'Ff', 'sinα', 'Ho']
print(f"Number of rows after transformations: {len(data_test)}")

# Получаем входные данные (X) для предсказаний
X_test = data_test[features]

# **Здесь должны быть уже обученные модели в вашей среде**:
# Пример:
# rf_model, xgb_model, lgbm_model, nn_model уже обучены и находятся в памяти

# Получаем предсказания от каждой модели
rf_pred = rf_model.predict(X_test)       # RandomForest
xgb_pred = xgb_model.predict(X_test)     # XGBoost
lgbm_pred = lgbm_model.predict(X_test)   # LightGBM    # Neural Network

# Создаем DataFrame для результатов
results = data_test[['YEAR', 'MO', 'DY', 'HR']]  # Добавляем информацию о дате

# Добавляем предсказания каждой модели
results['RandomForest_Prediction'] = rf_pred
results['XGBoost_Prediction'] = xgb_pred
results['LightGBM_Prediction'] = lgbm_pred

# Сохраняем результаты в новый CSV файл
output_file = 'predictions_output.csv'  # Путь для выходного файла
results.to_csv(output_file, index=False, sep=';')

print(f"Predictions saved to {output_file}")


Original number of rows: 8760
Number of rows after transformations: 8760
Predictions saved to predictions_output.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['RandomForest_Prediction'] = rf_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['XGBoost_Prediction'] = xgb_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['LightGBM_Prediction'] = lgbm_pred
