In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


In [37]:
# Load the training set from a semicolon-separated CSV file.
csv_file_path = "data/train.csv"
data = pd.read_csv(csv_file_path, sep=';', encoding='utf-8', index_col=False)

# Cast every numeric column to float, swapping ',' for '.' in its string
# form first so that comma-formatted decimals parse correctly.
numerical_columns = ['T', 'Po', 'U', 'Ff', 'sinα', 'Ho', 'ALLSKY_SFC_SW_DWN']
for name in numerical_columns:
    as_text = data[name].astype(str)
    data[name] = as_text.str.replace(',', '.').astype(float)

# Month and day are needed as integers for the date assembly below.
for part in ('MO', 'DY'):
    data[part] = data[part].astype(int)

# Build "YEAR-MO-DY" strings, parse them, and take the day of the year.
# Dates that fail to parse become NaT (errors='coerce') and end up as day 0.
date_parts = data[['YEAR', 'MO', 'DY']].astype(str)
date_text = date_parts.agg('-'.join, axis=1)
parsed_dates = pd.to_datetime(date_text, errors='coerce')
data['DayOfYear'] = parsed_dates.dt.dayofyear.fillna(0).astype(int)

# Cyclic (sin/cos) encodings: (sin column, cos column, source column, period).
cyclic_specs = (
    ('sin_month', 'cos_month', 'MO', 12),
    ('sin_hour', 'cos_hour', 'HR', 24),
    ('sin_day_year', 'cos_day_year', 'DayOfYear', 365),
)
for sin_name, cos_name, source, period in cyclic_specs:
    angle = 2 * np.pi * data[source] / period
    data[sin_name] = np.sin(angle)
    data[cos_name] = np.cos(angle)

# Model inputs and the regression target.
features = [
    'sin_month', 'cos_month', 'sin_hour', 'cos_hour', 'sin_day_year', 'cos_day_year',
    'T', 'Po', 'U', 'Ff', 'sinα', 'Ho', 'N', 'W1',
]
target = 'ALLSKY_SFC_SW_DWN'

X = data[features]
y = data[target]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [38]:
# Train the RandomForest regressor and score it on the hold-out split.
rf_params = {
    'n_estimators': 500,
    'max_depth': 25,
    'min_samples_split': 7,
    'min_samples_leaf': 3,
    'random_state': 42,
}
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Hold-out predictions and the three reported metrics.
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)  # RMSE is the square root of the MSE
rf_r2 = r2_score(y_test, rf_pred)

In [39]:
# Collect the hold-out metrics per model and show them as a table
# with one row per model and one column per metric.
print("\nModel Performance:")

rf_metrics = {'MAE': rf_mae, 'RMSE': rf_rmse, 'R²': rf_r2}
results = {'RandomForest': rf_metrics}

# The dict-of-dicts constructor puts models in columns; transpose to get rows.
results_df = pd.DataFrame(results).transpose()
print(results_df)


Model Performance:
                    MAE       RMSE        R²
RandomForest  18.780195  39.351607  0.965637


In [40]:
# Score the unlabeled test set with the trained RandomForest and export the
# predictions to a semicolon-separated CSV file.
#
# NOTE(review): this cell reassigns `X_test`, `rf_pred` and `results`, which
# earlier cells define for the hold-out evaluation. Re-running the metric
# computation after this cell would pair these predictions with the hold-out
# `y_test`. The names are kept as-is in case later cells rely on them.
test = "test.csv"
data_test = pd.read_csv(test, sep=';', encoding='utf-8', index_col=False)
print(f"Original number of rows: {len(data_test)}")

# Cast every numeric input column to float, swapping ',' for '.' in its
# string form first (same conversion as applied to the training data; the
# target column is not part of this list).
num_columns = ['T', 'Po', 'U', 'Ff', 'sinα', 'Ho']
for column in num_columns:
    data_test[column] = data_test[column].astype(str).str.replace(',', '.').astype(float)

data_test['MO'] = data_test['MO'].astype(int)
data_test['DY'] = data_test['DY'].astype(int)

# Build "YEAR-MO-DY" strings, parse them, and take the day of the year.
# Dates that fail to parse become NaT (errors='coerce') and end up as day 0.
data_test['DayOfYear'] = pd.to_datetime(
    data_test[['YEAR', 'MO', 'DY']].astype(str).agg('-'.join, axis=1), errors='coerce'
).dt.dayofyear.fillna(0).astype(int)

# Cyclic (sin/cos) encodings of month, hour and day of year.
data_test['sin_month'] = np.sin(2 * np.pi * data_test['MO'] / 12)
data_test['cos_month'] = np.cos(2 * np.pi * data_test['MO'] / 12)

data_test['sin_hour'] = np.sin(2 * np.pi * data_test['HR'] / 24)
data_test['cos_hour'] = np.cos(2 * np.pi * data_test['HR'] / 24)
data_test['sin_day_year'] = np.sin(2 * np.pi * data_test['DayOfYear'] / 365)
data_test['cos_day_year'] = np.cos(2 * np.pi * data_test['DayOfYear'] / 365)

# Same feature list, in the same order, as the one the model was fitted on.
features = ['sin_month', 'cos_month', 'sin_hour', 'cos_hour', 'sin_day_year', 'cos_day_year',
            'T', 'Po', 'U', 'Ff', 'sinα', 'Ho', 'N', 'W1']
print(f"Number of rows after transformations: {len(data_test)}")

X_test = data_test[features]

rf_pred = rf_model.predict(X_test)

# Take an explicit copy of the timestamp columns: adding a column to a bare
# `data_test[[...]]` selection triggers pandas' SettingWithCopyWarning.
results = data_test[['YEAR', 'MO', 'DY', 'HR']].copy()

results['RandomForest_Prediction'] = rf_pred

output_file = 'predictions_output.csv'
results.to_csv(output_file, index=False, sep=';')

print(f"Predictions saved to {output_file}")


Original number of rows: 8760
Number of rows after transformations: 8760
Predictions saved to predictions_output.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['RandomForest_Prediction'] = rf_pred
