In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Observation archive; fields are separated by semicolons and no column is
# used as the row index.
file = 'data/01.01.2009-30.07.2024.csv'
df = pd.read_csv(
    file,
    delimiter=';',
    encoding='utf-8',
    index_col=False,
)

In [3]:
# Reverse the row order and renumber the index from 0.
df = df.iloc[::-1].reset_index(drop=True)

# Parse the local-time column once and split it into calendar parts.
stamps = pd.to_datetime(df['Местное время в Иркутске'], format='%d.%m.%Y %H:%M')
df = df.assign(
    YEAR=stamps.dt.year,
    MO=stamps.dt.month,
    DY=stamps.dt.day,
    HR=stamps.dt.hour,
)

# The raw timestamp is no longer needed; the listed measurement columns are
# dropped as well (names absent from the frame are skipped silently).
drop_columns = ['P', 'Pa', 'ff10', 'ff3', 'Tn', 'Tx', 'VV', 'Td', 'E', 'Tg', "E'", 'sss', 'WW','W2','RRR','tR', 'DD', 'Cl','H','Cm','Ch']
df = (
    df
    .drop(columns=['Местное время в Иркутске'])
    .drop(columns=drop_columns, errors='ignore')
)

# Put the calendar columns first, keeping every other column in its current order.
calendar_cols = ['YEAR', 'MO', 'DY', 'HR']
cols = calendar_cols + [name for name in df.columns if name not in calendar_cols]
df = df[cols]

In [33]:
# Preview the first rows after the timestamp was split into YEAR/MO/DY/HR.
df.head()

Unnamed: 0,YEAR,MO,DY,HR,T,Po,U,Ff,N,W1
0,2009,2,25,20,-4.5,722.4,39.0,2.0,100%.,
1,2009,2,25,23,-6.6,724.4,53.0,3.0,60%.,
2,2009,2,26,2,-8.7,726.0,86.0,2.0,100%.,Ливень (ливни).
3,2009,2,26,5,-10.2,727.3,85.0,2.0,100%.,Ливень (ливни).
4,2009,2,26,8,-11.8,728.2,90.0,1.0,100%.,Ливень (ливни).


In [4]:
# Load the per-column value mappings (column name -> {old value: new value})
# and apply each mapping to the column it is keyed by.
json_file = 'data/unique.json'
with open(json_file, mode='r', encoding='utf-8') as f:
    replacement_rules = json.load(f)

for column, replacements in replacement_rules.items():
    if column not in df.columns:
        continue  # rule refers to a column this frame does not have
    df[column] = df[column].replace(replacements)

# Missing values in W1 and Nh are filled with 0.0.
for gap_column in ("W1", "Nh"):
    df[gap_column] = df[gap_column].fillna(0.0)


In [20]:
# Preview the first rows after the replacement rules and the W1/Nh fills.
df.head()

Unnamed: 0,YEAR,MO,DY,HR,T,Po,U,Ff,N,W1,Nh
0,2009,2,25,20,-4.5,722.4,39.0,2.0,1.0,0.0,0.25
1,2009,2,25,23,-6.6,724.4,53.0,3.0,0.6,0.0,0.6
2,2009,2,26,2,-8.7,726.0,86.0,2.0,1.0,1.0,1.0
3,2009,2,26,5,-10.2,727.3,85.0,2.0,1.0,1.0,1.0
4,2009,2,26,8,-11.8,728.2,90.0,1.0,1.0,1.0,1.0


In [5]:
# Turn the observation rows into a gap-free hourly table.
#
# Steps: coerce the calendar columns to numbers, drop rows whose calendar
# fields are missing, assemble a timestamp index, upsample to a 1-hour grid
# with linear interpolation, then re-derive YEAR/MO/DY/HR from that grid.
date_parts = ['YEAR', 'MO', 'DY', 'HR']
value_columns = ['T', 'Po', 'U', 'Ff', 'N', 'W1', 'Nh']

df[date_parts] = df[date_parts].apply(pd.to_numeric, errors='coerce')

# .copy() yields an independent frame, so adding a column below cannot raise
# SettingWithCopyWarning or write through to df.
data_cleaned = df.dropna(subset=date_parts).copy()

# Let pandas assemble the timestamp from year/month/day/hour columns instead
# of joining them into an ad-hoc 'Y-M-D-H' string for the generic parser.
# astype(int): the assembler wants whole-number components, and the dropna
# above guarantees there is no NaN left to cast.
data_cleaned['datetime'] = pd.to_datetime(
    data_cleaned[date_parts]
    .astype(int)
    .rename(columns={'YEAR': 'year', 'MO': 'month', 'DY': 'day', 'HR': 'hour'}),
    errors='coerce',
)
data_cleaned = data_cleaned.set_index('datetime')

# One resample for all measurement columns. Lower-case 'h' is the hourly
# alias; the upper-case 'H' spelling is deprecated in recent pandas.
hourly_data = data_cleaned[value_columns].resample('h').interpolate(method='linear')

# Calendar parts now come from the regular hourly index, not the source rows.
hourly_data = hourly_data.assign(
    YEAR=hourly_data.index.year,
    MO=hourly_data.index.month,
    DY=hourly_data.index.day,
    HR=hourly_data.index.hour,
)

df = hourly_data[date_parts + value_columns].reset_index(drop=True)


In [6]:
# Preview the first rows of the hourly, linearly interpolated table.
df.head()

Unnamed: 0,YEAR,MO,DY,HR,T,Po,U,Ff,N,W1,Nh
0,2009,2,25,20,-4.5,722.4,39.0,2.0,1.0,0.0,0.25
1,2009,2,25,21,-5.2,723.066667,43.666667,2.333333,0.866667,0.0,0.366667
2,2009,2,25,22,-5.9,723.733333,48.333333,2.666667,0.733333,0.0,0.483333
3,2009,2,25,23,-6.6,724.4,53.0,3.0,0.6,0.0,0.6
4,2009,2,26,0,-7.3,724.933333,64.0,2.666667,0.733333,0.333333,0.733333


In [7]:
# Attach the SZA column from the sun table. Rows are matched on month, day and
# hour only (no year), and every row of df is kept (left join).
rad = pd.read_csv("data/sun.csv", sep=';', encoding='utf-8', index_col=False)
df = df.merge(
    rad.loc[:, ['MO', 'DY', 'HR', 'SZA']],
    how='left',
    on=['MO', 'DY', 'HR'],
)
df = df.astype({'SZA': float})

In [41]:
# Preview the first rows with the SZA column merged in.
df.head()

Unnamed: 0,YEAR,MO,DY,HR,T,Po,U,Ff,N,W1,Nh,SZA,a,Ho
0,2009,2,25,20,-4.5,722.4,39.0,2.0,1.0,0.0,0.25,90.0,-9.0,0.0
1,2009,2,25,21,-5.2,723.066667,43.666667,2.333333,0.866667,0.0,0.366667,90.0,-9.0,0.0
2,2009,2,25,22,-5.9,723.733333,48.333333,2.666667,0.733333,0.0,0.483333,90.0,-9.0,0.0
3,2009,2,25,23,-6.6,724.4,53.0,3.0,0.6,0.0,0.6,90.0,-9.0,0.0
4,2009,2,26,0,-7.3,724.933333,64.0,2.666667,0.733333,0.333333,0.733333,90.0,-9.0,0.0


In [8]:
# Join the two ALLSKY_SFC_SW_* columns on the full calendar key. The inner
# join keeps only hours present in both tables.
solar = pd.read_csv("data/rad_zvc.csv", sep=',', encoding='utf-8', index_col=False)
df = df.merge(
    solar.loc[:, ['YEAR', 'MO', 'DY', 'HR', 'ALLSKY_SFC_SW_DIFF', 'ALLSKY_SFC_SW_DWN']],
    how='inner',
    on=['YEAR', 'MO', 'DY', 'HR'],
)

In [9]:
# Preview the first rows after joining the ALLSKY_SFC_SW_* columns.
df.head()

Unnamed: 0,YEAR,MO,DY,HR,T,Po,U,Ff,N,W1,Nh,SZA,ALLSKY_SFC_SW_DIFF,ALLSKY_SFC_SW_DWN
0,2009,2,25,20,-4.5,722.4,39.0,2.0,1.0,0.0,0.25,90.0,0.0,0.0
1,2009,2,25,21,-5.2,723.066667,43.666667,2.333333,0.866667,0.0,0.366667,90.0,0.0,0.0
2,2009,2,25,22,-5.9,723.733333,48.333333,2.666667,0.733333,0.0,0.483333,90.0,0.0,0.0
3,2009,2,25,23,-6.6,724.4,53.0,3.0,0.6,0.0,0.6,90.0,0.0,0.0
4,2009,2,26,0,-7.3,724.933333,64.0,2.666667,0.733333,0.333333,0.733333,90.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

In [11]:
# Build the model matrix: numeric clean-up, cyclical time encodings, and a
# random 80/20 train/test split.
data = df.copy()

numerical_columns = ['T' ,'Po', 'U', 'Ff', 'N', 'W1', 'SZA', 'Nh', 'ALLSKY_SFC_SW_DIFF', 'ALLSKY_SFC_SW_DWN']

# Normalise to float, tolerating values that arrive as text with a decimal
# comma. regex=False: the comma is a literal character, not a pattern.
for column in numerical_columns:
    data[column] = data[column].astype(str).str.replace(',', '.', regex=False).astype(float)

data['MO'] = data['MO'].astype(int)
data['DY'] = data['DY'].astype(int)

# Day of year, assembled directly from the numeric calendar columns (no
# row-wise string join). Dates that cannot be built become 0.
data['DayOfYear'] = (
    pd.to_datetime(
        data[['YEAR', 'MO', 'DY']].rename(
            columns={'YEAR': 'year', 'MO': 'month', 'DY': 'day'}
        ),
        errors='coerce',
    )
    .dt.dayofyear
    .fillna(0)
    .astype(int)
)

# Cyclical encodings so that the last month/hour/day sits next to the first.
data['sin_month'] = np.sin(2 * np.pi * data['MO'] / 12)
data['cos_month'] = np.cos(2 * np.pi * data['MO'] / 12)

data['sin_hour'] = np.sin(2 * np.pi * data['HR'] / 24)
data['cos_hour'] = np.cos(2 * np.pi * data['HR'] / 24)
# NOTE(review): the period is fixed at 365, so day 366 of a leap year lands
# slightly past one full cycle.
data['sin_day_year'] = np.sin(2 * np.pi * data['DayOfYear'] / 365)
data['cos_day_year'] = np.cos(2 * np.pi * data['DayOfYear'] / 365)

features = ['sin_month', 'cos_month', 'sin_hour', 'cos_hour', 'sin_day_year', 'cos_day_year',
            'T', 'Po', 'U', 'Ff', 'SZA', 'N', 'W1', 'Nh']
target = ['ALLSKY_SFC_SW_DIFF', 'ALLSKY_SFC_SW_DWN']

X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Downstream metric code indexes y_test as a 2-D array.
y_test = y_test.values if isinstance(y_test, pd.DataFrame) else y_test

# NOTE(review): a shuffled split places neighbouring hours in both train and
# test. If the rows are an interpolated hourly series, test scores will look
# better than true out-of-sample performance. A chronological alternative is
# to train on rows with data['YEAR'] <= 2022 and test on data['YEAR'] > 2022.

In [12]:
# Preview the first feature rows of the test split.
X_test.head()

Unnamed: 0,sin_month,cos_month,sin_hour,cos_hour,sin_day_year,cos_day_year,T,Po,U,Ff,SZA,N,W1,Nh
27187,0.866025,-0.5,-0.707107,-0.7071068,0.99888,-0.047321,6.533333,724.4,38.333333,4.0,63.04,0.6,0.0,0.2
75167,-1.0,-1.83697e-16,-0.965926,0.258819,-0.991114,-0.133015,8.366667,717.966667,68.333333,1.666667,90.0,0.65,0.0,0.333333
106299,0.866025,-0.5,-0.258819,0.9659258,0.982927,-0.183998,-0.1,727.3,67.0,1.0,90.0,0.25,0.0,0.25
86566,0.5,0.8660254,-1.0,-1.83697e-16,0.188227,0.982126,-11.566667,723.366667,71.0,4.0,90.0,1.0,0.333333,1.0
80696,0.5,-0.8660254,0.866025,0.5,0.763889,-0.645348,3.0,714.7,57.333333,2.0,88.0,1.0,0.0,0.166667


## Случайный лес (Random Forest)

In [14]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# 100-tree random forest with a fixed seed, wrapped so that the two target
# columns are handled by the multi-output wrapper.
rf = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
multi_target_rf = MultiOutputRegressor(estimator=rf)
multi_target_rf.fit(X_train, y_train)

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Report MAE / MSE / R^2 per target for the random-forest wrapper, first on
# the held-out split and then on the training split.
# Both target sets are converted to 2-D arrays so they can be sliced by column.
y_test = y_test.values if isinstance(y_test, pd.DataFrame) else y_test
y_train = np.array(y_train)

for split_label, X_split, y_split in (
    ("на валидации", X_test, y_test),
    ("на трейне", X_train, y_train),
):
    y_pred = multi_target_rf.predict(X_split)
    for i, target_name in enumerate(target):
        # A space separates the split label from the target name.
        print(f"Оценка для целевой переменной {split_label} {target_name}:")
        print(f"  MAE: {mean_absolute_error(y_split[:, i], y_pred[:, i]):.4f}")
        print(f"  MSE: {mean_squared_error(y_split[:, i], y_pred[:, i]):.4f}")
        print(f"  R^2: {r2_score(y_split[:, i], y_pred[:, i]):.4f}")
        print("-" * 40)

# Predictions are deliberately not stored in `df`: that name holds the
# prepared dataset copied by the feature-building cell, and overwriting it
# would break re-running that cell. `y_pred` still holds the training-split
# predictions after the loop.

Оценка для целевой переменной на валидацииALLSKY_SFC_SW_DIFF:
  MAE: 15.7128
  MSE: 1060.9791
  R^2: 0.8896
----------------------------------------
Оценка для целевой переменной на валидацииALLSKY_SFC_SW_DWN:
  MAE: 18.6524
  MSE: 1680.9230
  R^2: 0.9633
----------------------------------------
Оценка для целевой переменной на трейне ALLSKY_SFC_SW_DIFF:
  MAE: 5.7744
  MSE: 144.5774
  R^2: 0.9846
----------------------------------------
Оценка для целевой переменной на трейне ALLSKY_SFC_SW_DWN:
  MAE: 6.9047
  MSE: 232.1002
  R^2: 0.9949
----------------------------------------


In [83]:
import joblib

# Serialise the fitted random-forest wrapper to model.pkl (path is relative
# to the current working directory).
joblib.dump(multi_target_rf, filename="model.pkl")

['model.pkl']

## XGBoost

In [18]:
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler

# Degree-2 polynomial expansion, fitted on the training rows only.
# NOTE(review): X_train_poly / X_val_poly (and LabelEncoder) are not used by
# any cell visible in this notebook; confirm before relying on or removing them.
poly = PolynomialFeatures(include_bias=False, degree=2).fit(X_train)
X_train_poly = poly.transform(X_train)
X_val_poly = poly.transform(X_test)

# Standardise the raw features; the scaler statistics come from the training
# rows and are then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb

# Gradient-boosted regressor (squared-error objective, 1000 rounds at a 0.05
# learning rate), wrapped for the two target columns and fitted on the
# standardised training features.
model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    objective='reg:squarederror',
)
multi_target_model = MultiOutputRegressor(estimator=model)
multi_target_model.fit(X_train_scaled, y_train)

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Report MAE / MSE / R^2 per target for the XGBoost wrapper, first on the
# test split and then on the training split (both using scaled features).
# Both target sets are converted to 2-D arrays so they can be sliced by column.
y_test = np.array(y_test)
y_train = np.array(y_train)

for split_label, X_split, y_split in (
    ("на тесте", X_test_scaled, y_test),
    ("на трейне", X_train_scaled, y_train),
):
    y_pred = multi_target_model.predict(X_split)
    for i, target_name in enumerate(target):
        print(f"Оценка для целевой переменной {split_label} {target_name}:")
        print(f"  MAE: {mean_absolute_error(y_split[:, i], y_pred[:, i]):.4f}")
        print(f"  MSE: {mean_squared_error(y_split[:, i], y_pred[:, i]):.4f}")
        print(f"  R^2: {r2_score(y_split[:, i], y_pred[:, i]):.4f}")
        print("-" * 40)

# Predictions are deliberately not stored in `df`: that name holds the
# prepared dataset copied by the feature-building cell, and overwriting it
# would break re-running that cell. `y_pred` still holds the training-split
# predictions after the loop.

Оценка для целевой переменной на тесте ALLSKY_SFC_SW_DIFF:
  MAE: 15.6976
  MSE: 991.8699
  R^2: 0.8968
----------------------------------------
Оценка для целевой переменной на тесте ALLSKY_SFC_SW_DWN:
  MAE: 18.0395
  MSE: 1469.4888
  R^2: 0.9679
----------------------------------------
Оценка для целевой переменной на трейне ALLSKY_SFC_SW_DIFF:
  MAE: 12.5512
  MSE: 596.5092
  R^2: 0.9363
----------------------------------------
Оценка для целевой переменной на трейне ALLSKY_SFC_SW_DWN:
  MAE: 14.5377
  MSE: 881.8279
  R^2: 0.9805
----------------------------------------
