In [10]:
import json
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [11]:
# Yearly JSON exports to merge into a single list of records
file_paths = [
    'data/2022.json',
    'data/2023.json',
    'data/2024.json',
]

# Each file is parsed on its own and its records are appended in file order
all_data = []
for file_path in file_paths:
    with open(file_path, "r") as f:
        data = json.load(f)
    all_data += data

In [12]:
# Build one DataFrame from the merged records and preview the first five rows
df = pd.DataFrame(data=all_data)
df.head(n=5)

Unnamed: 0,latitude,longitude,acq_date,acq_time,bright_t31,frp,version,confidence,instrument,type,track,daynight,scan,satellite,brightness
0,-20.0463,143.7023,2022-01-01,28,309.2,68.3,6.03,97,MODIS,0,1.1,D,1.1,Terra,356.2
1,-20.048,143.713,2022-01-01,28,303.1,12.7,6.03,33,MODIS,0,1.1,D,1.1,Terra,328.6
2,-19.0926,138.2415,2022-01-01,28,299.2,68.6,6.03,74,MODIS,0,1.6,D,2.7,Terra,332.3
3,-20.0541,143.6899,2022-01-01,28,311.0,29.9,6.03,85,MODIS,0,1.1,D,1.1,Terra,343.3
4,-20.0798,143.7292,2022-01-01,28,307.4,20.2,6.03,57,MODIS,0,1.1,D,1.1,Terra,332.9


In [13]:
# List the column labels available in the merged frame
column_index = df.columns
print(column_index)

Index(['latitude', 'longitude', 'acq_date', 'acq_time', 'bright_t31', 'frp',
       'version', 'confidence', 'instrument', 'type', 'track', 'daynight',
       'scan', 'satellite', 'brightness'],
      dtype='object')


In [14]:
# Count missing values per column
null_counts = df.isna().sum()
print(null_counts)

latitude            0
longitude           0
acq_date            0
acq_time            0
bright_t31          0
frp                 0
version             0
confidence          0
instrument          0
type          2276662
track               0
daynight            0
scan                0
satellite           0
brightness          0
dtype: int64


In [15]:
# 'type' is the only column reported with missing values; discard it
df = df.drop(columns=["type"])

In [16]:
# Re-check missing values per column after dropping 'type'
null_counts = df.isna().sum()
print(null_counts)

latitude      0
longitude     0
acq_date      0
acq_time      0
bright_t31    0
frp           0
version       0
confidence    0
instrument    0
track         0
daynight      0
scan          0
satellite     0
brightness    0
dtype: int64


In [17]:
# Feature matrix: location, brightness temperature, acquisition date and day/night flag.
# .copy() gives an independent frame so the column assignments below never touch df.
X = df[['latitude', 'longitude', 'bright_t31', 'acq_date', 'daynight']].copy()
y = df['confidence'].astype(float) / 100  # target: confidence divided by 100 (percent -> fraction)

# acq_date is a 'YYYY-MM-DD' string, which the estimators cannot convert to float.
# Replace it with numeric calendar features so the matrix is fully numeric.
acq_dates = pd.to_datetime(X['acq_date'])
X['acq_year'] = acq_dates.dt.year
X['acq_month'] = acq_dates.dt.month
X['acq_dayofyear'] = acq_dates.dt.dayofyear
X = X.drop(columns=['acq_date'])

# One-hot encode the categorical daynight flag and splice the indicator columns
# back in, aligned on the original row index.
encoder = OneHotEncoder(sparse_output=False)
daynight_encoded = encoder.fit_transform(X[['daynight']])
daynight_columns = encoder.get_feature_names_out(['daynight'])
daynight_df = pd.DataFrame(daynight_encoded, columns=daynight_columns, index=X.index)
X = pd.concat([X.drop(columns=['daynight']), daynight_df], axis=1)

In [18]:
# Min-max scale the continuous features in place.
# NOTE(review): the scaler is fit on the full feature matrix, before the
# train/validation/test split performed in the following cell, so held-out rows
# influence the fitted min/max. Consider fitting on the training split only.
scaled_columns = ['latitude', 'longitude', 'bright_t31']
scaler = MinMaxScaler()
X[scaled_columns] = scaler.fit_transform(X[scaled_columns])

In [19]:
# Hold out 20% of the rows, then halve that hold-out into validation and test
# (80% train / 10% validation / 10% test overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, random_state=42, test_size=0.5
)

In [20]:
# Random forest baseline: 100 trees capped at depth 10, seeded for reproducibility
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)


In [21]:
# Per-model metrics, keyed by model name; filled in by the evaluation cells
results = dict()

In [22]:
# Predict with the random forest on both held-out splits
y_val_pred_rf, y_test_pred_rf = model.predict(X_val), model.predict(X_test)

# Validation metrics
mae_rf_val, mse_rf_val = (
    metric(y_val, y_val_pred_rf)
    for metric in (mean_absolute_error, mean_squared_error)
)

# Test metrics
mae_rf_test, mse_rf_test = (
    metric(y_test, y_test_pred_rf)
    for metric in (mean_absolute_error, mean_squared_error)
)

# Store the results
results['RandomForest'] = {
    'MAE_val': mae_rf_val, 'MSE_val': mse_rf_val,
    'MAE_test': mae_rf_test, 'MSE_test': mse_rf_test,
}

In [23]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Mean-impute missing feature values; the means are learned from the training
# split only and then applied to the validation and test splits.
# set_output(transform='pandas') makes the imputer return DataFrames, so the
# splits keep their column names and row index instead of being rebound to bare
# ndarrays. This keeps them compatible with the estimator that was already fit
# on the DataFrame version of X_train.
imputer = SimpleImputer(strategy='mean').set_output(transform='pandas')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Train the GradientBoostingRegressor
model_gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model_gb.fit(X_train, y_train)

# Evaluate on the validation split
y_val_pred_gb = model_gb.predict(X_val)
mae_gb_val = mean_absolute_error(y_val, y_val_pred_gb)
mse_gb_val = mean_squared_error(y_val, y_val_pred_gb)

# Evaluate on the test split
y_test_pred_gb = model_gb.predict(X_test)
mae_gb_test = mean_absolute_error(y_test, y_test_pred_gb)
mse_gb_test = mean_squared_error(y_test, y_test_pred_gb)

# Store the results
results['GradientBoosting'] = {
    'MAE_val': mae_gb_val,
    'MSE_val': mse_gb_val,
    'MAE_test': mae_gb_test,
    'MSE_test': mse_gb_test
}

In [None]:
# LGBMRegressor is provided by the lightgbm package; sklearn.ensemble does not export it
from lightgbm import LGBMRegressor

In [None]:
# Train LightGBM with the same tree count, learning rate and depth as the
# gradient boosting model
lgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'random_state': 42,
}
model_lgb = LGBMRegressor(**lgb_params)
model_lgb.fit(X_train, y_train)

# Predict on both held-out splits
y_val_pred_lgb, y_test_pred_lgb = model_lgb.predict(X_val), model_lgb.predict(X_test)

# Validation metrics
mae_lgb_val, mse_lgb_val = (
    metric(y_val, y_val_pred_lgb)
    for metric in (mean_absolute_error, mean_squared_error)
)

# Test metrics
mae_lgb_test, mse_lgb_test = (
    metric(y_test, y_test_pred_lgb)
    for metric in (mean_absolute_error, mean_squared_error)
)

# Store the results
results['LightGBM'] = {
    'MAE_val': mae_lgb_val, 'MSE_val': mse_lgb_val,
    'MAE_test': mae_lgb_test, 'MSE_test': mse_lgb_test,
}

In [None]:
# Print the validation and test metrics of every evaluated model
for model_name, metrics in results.items():
    print(
        f"{model_name}:",
        f"  Валидационные данные - MAE: {metrics['MAE_val']:.4f}, MSE: {metrics['MSE_val']:.4f}",
        f"  Тестовые данные - MAE: {metrics['MAE_test']:.4f}, MSE: {metrics['MSE_test']:.4f}",
        sep="\n",
    )

# Collect each metric as a list ordered like `models`, for visualizing the results
models = list(results)
mae_val_scores, mse_val_scores, mae_test_scores, mse_test_scores = (
    [results[name][key] for name in models]
    for key in ('MAE_val', 'MSE_val', 'MAE_test', 'MSE_test')
)