## Загрузка необходимых библиотек

In [1]:
import pandas as pd
from pandas import MultiIndex
import numpy as np
import glob
import datetime
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from catboost import Pool, CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

  from pandas import MultiIndex, Int64Index


## 1. Создание набора данных для обучения предиктивной модели

In [2]:
## Создание объединенного датасета
folder = 'INPUT'
sets = []

# чтение файлов с данными и добавленние их в общий df
for f in glob.glob("{}/*fo.csv.zip".format(folder)):
    try:
        sample = pd.DataFrame(pd.read_csv(f, compression='zip', sep=';', header=0, quotechar='"'))
        sets.append(sample)
    except:
        pass
weather = pd.concat(sets, ignore_index=True)
weather.drop_duplicates(keep='first', inplace=True, ignore_index=True) # удаление дубликатов по строкам
weather['date'] = pd.to_datetime(weather['date']) # формат дата-время для столбца
weather['week_num'] = weather['date'].dt.isocalendar().week # добавление столбца с номером недели

In [3]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394322 entries, 0 to 394321
Data columns (total 65 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   date                       394322 non-null  datetime64[ns]
 1   years                      394322 non-null  float64       
 2   min_air_temp               394289 non-null  float64       
 3   max_air_temp               394289 non-null  float64       
 4   avg_air_temp               394289 non-null  float64       
 5   sum_air_temp               394322 non-null  float64       
 6   min_temp                   378162 non-null  float64       
 7   sum_min_temperature        394322 non-null  float64       
 8   max_temp                   382514 non-null  float64       
 9   sum_max_temperature        394322 non-null  float64       
 10  min_dew_point_temp         388880 non-null  float64       
 11  max_dew_point_temp         388880 non-null  float64 

### Заполнение пустых значений

In [4]:
## Средние значения показателей для заполнения значений Nan
means = pd.DataFrame(weather.groupby(['meteoid', 'week_num']).mean()[['min_air_temp', 'max_air_temp', 'avg_air_temp',
       'min_temp', 'max_temp', 'min_dew_point_temp', 'max_dew_point_temp',
       'avg_dew_point_temp', 'min_po_press', 'max_po_press', 'avg_po_press',
       'min_p_press', 'max_p_press', 'avg_p_press', 'min_wind_speed', 'max_wind_speed', 'avg_wind_speed']].reset_index()) 

In [5]:
## Переименование новых столбцов
drop_col = ['m_'+str(x) for x in list(means.columns)]
means.columns = drop_col
means.rename(columns={"m_meteoid": "meteoid", "m_week_num": "week_num"}, inplace=True)
means.head()

Unnamed: 0,meteoid,week_num,m_min_air_temp,m_max_air_temp,m_avg_air_temp,m_min_temp,m_max_temp,m_min_dew_point_temp,m_max_dew_point_temp,m_avg_dew_point_temp,m_min_po_press,m_max_po_press,m_avg_po_press,m_min_p_press,m_max_p_press,m_avg_p_press,m_min_wind_speed,m_max_wind_speed,m_avg_wind_speed
0,23847,1,-26.978261,-8.913043,-17.923995,-27.247826,-9.665217,-29.891304,-10.813043,-20.334382,754.282609,769.652174,762.541746,757.378261,773.178261,765.932988,0.608696,5.043478,2.476455
1,23847,2,-33.63125,-9.68125,-21.093685,-33.4125,-9.8875,-36.84375,-11.45,-23.594809,749.7125,770.325,760.437798,753.0125,773.9125,763.878273,0.125,4.5,1.990422
2,23847,3,-31.375,-10.15625,-19.804777,-31.5,-10.725,-34.34375,-12.00625,-22.331685,753.41875,773.0625,764.594088,756.74375,776.6375,768.02362,0.25,4.6875,2.292005
3,23847,4,-34.5625,-10.6875,-22.5385,-34.74375,-11.13125,-37.7125,-12.975,-25.42291,751.275,775.89375,764.258526,754.575,779.54375,767.72139,0.1875,4.9375,2.076968
4,23847,5,-32.129412,-9.923529,-20.337034,-31.6,-9.982353,-35.288235,-11.794118,-23.015659,754.558824,772.164706,764.848248,757.894118,775.705882,768.313507,0.352941,5.058824,2.109536


In [6]:
## Добавление новых стлбцов
weather = pd.merge(weather, means, how='left', on=['meteoid', 'week_num'])

In [7]:
## Заполнение Nan-значений
# Температура воздуха
# -замена пустых значений средними многолетними значениями по соответствующим неделям
weather.loc[pd.isnull(weather['min_air_temp']), 'min_air_temp'] = weather.loc[pd.isnull(weather['min_air_temp']), 'm_min_air_temp']
weather.loc[pd.isnull(weather['max_air_temp']), 'max_air_temp'] = weather.loc[pd.isnull(weather['max_air_temp']), 'm_max_air_temp']
weather.loc[pd.isnull(weather['avg_air_temp']), 'avg_air_temp'] = weather.loc[pd.isnull(weather['avg_air_temp']), 'm_avg_air_temp']

# -замена через зависимости от других показателей
weather.loc[pd.isnull(weather['min_temp']), 'min_temp'] = weather.loc[pd.isnull(weather['min_temp']), 'min_air_temp']*1.05
weather.loc[pd.isnull(weather['max_temp']), 'max_temp'] = weather.loc[pd.isnull(weather['max_temp']), 'max_air_temp']*1.05

# -приведение показателя из суммы температур по часам к сумме температур по дням
weather.loc[pd.notnull(weather['sum_air_temp']), 'sum_air_temp'] = weather.loc[pd.notnull(weather['sum_air_temp']), 'sum_air_temp']/8 

In [8]:
## Заполнение Nan-значений
# Атмосферное давление
# -замена пустых значений средними многолетними значениями по соответствующим неделям
weather.loc[pd.isnull(weather['min_po_press']), 'min_po_press'] = weather.loc[pd.isnull(weather['min_po_press']), 'm_min_po_press']
weather.loc[pd.isnull(weather['max_po_press']), 'max_po_press'] = weather.loc[pd.isnull(weather['max_po_press']), 'm_max_po_press']
weather.loc[pd.isnull(weather['avg_po_press']), 'avg_po_press'] = weather.loc[pd.isnull(weather['avg_po_press']), 'm_avg_po_press']

weather.loc[pd.isnull(weather['min_p_press']), 'min_p_press'] = weather.loc[pd.isnull(weather['min_p_press']), 'm_min_p_press']
weather.loc[pd.isnull(weather['max_p_press']), 'max_p_press'] = weather.loc[pd.isnull(weather['max_p_press']), 'm_max_p_press']
weather.loc[pd.isnull(weather['avg_p_press']), 'avg_p_press'] = weather.loc[pd.isnull(weather['avg_p_press']), 'm_avg_p_press']

# пустые значения барической тенденции приводятся к 0
weather.loc[pd.isnull(weather['min_baric_tendency']), 'min_baric_tendency'] = 0
weather.loc[pd.isnull(weather['max_baric_tendency']), 'max_baric_tendency'] = 0
weather.loc[pd.isnull(weather['avg_baric_tendency']), 'avg_baric_tendency'] = 0

In [9]:
## Заполнение Nan-значений
# Температура точки росы
# -замена через зависимости от других показателей
t_dew_min = weather.loc[pd.isnull(weather['min_dew_point_temp']), 'min_air_temp']-((1-(weather.loc[pd.isnull(weather['min_dew_point_temp']), 'min_u_humidity']/100))/0.05)
t_dew_max = weather.loc[pd.isnull(weather['max_dew_point_temp']), 'max_air_temp']-((1-(weather.loc[pd.isnull(weather['max_dew_point_temp']), 'max_u_humidity']/100))/0.05)
t_dew_avg = weather.loc[pd.isnull(weather['avg_dew_point_temp']), 'avg_air_temp']-((1-(weather.loc[pd.isnull(weather['avg_dew_point_temp']), 'avg_u_humidity']/100))/0.05)

weather.loc[pd.isnull(weather['min_dew_point_temp']), 'min_dew_point_temp'] = t_dew_min
weather.loc[pd.isnull(weather['max_dew_point_temp']), 'max_dew_point_temp'] = t_dew_max
weather.loc[pd.isnull(weather['avg_dew_point_temp']), 'avg_dew_point_temp'] = t_dew_avg

# -замена пустых значений средними многолетними значениями по соответствующим неделям
weather.loc[pd.isnull(weather['min_dew_point_temp']), 'min_dew_point_temp'] = weather.loc[pd.isnull(weather['min_dew_point_temp']), 'm_min_dew_point_temp']
weather.loc[pd.isnull(weather['max_dew_point_temp']), 'max_dew_point_temp'] = weather.loc[pd.isnull(weather['max_dew_point_temp']), 'm_max_dew_point_temp']
weather.loc[pd.isnull(weather['avg_dew_point_temp']), 'avg_dew_point_temp'] = weather.loc[pd.isnull(weather['avg_dew_point_temp']), 'm_avg_dew_point_temp']

# все отрицательные значения температуры точки росы приводятся к 0
weather.loc[(weather['min_dew_point_temp'] < 0), 'min_dew_point_temp'] = 0
weather.loc[(weather['max_dew_point_temp'] < 0), 'max_dew_point_temp'] = 0
weather.loc[(weather['avg_dew_point_temp'] < 0), 'avg_dew_point_temp'] = 0

In [10]:
## Заполнение Nan-значений
# Относительная влажность
# -замена через зависимости от других показателей
hum_min = (1-0.05*(weather.loc[pd.isnull(weather['min_u_humidity']), 'min_air_temp']-weather.loc[pd.isnull(weather['min_u_humidity']), 'min_dew_point_temp']))*100
hum_max = (1-0.05*(weather.loc[pd.isnull(weather['max_u_humidity']), 'max_air_temp']-weather.loc[pd.isnull(weather['max_u_humidity']), 'max_dew_point_temp']))*100
hum_avg = (1-0.05*(weather.loc[pd.isnull(weather['avg_u_humidity']), 'avg_air_temp']-weather.loc[pd.isnull(weather['avg_u_humidity']), 'avg_dew_point_temp']))*100

weather.loc[pd.isnull(weather['min_u_humidity']), 'min_u_humidity'] = hum_min
weather.loc[pd.isnull(weather['max_u_humidity']), 'max_u_humidity'] = hum_max
weather.loc[pd.isnull(weather['avg_u_humidity']), 'avg_u_humidity'] = hum_avg

# оставшиеся пустые значения относительной влажности приводятся к 0
weather.loc[pd.isnull(weather['min_u_humidity']), 'min_u_humidity'] = 0
weather.loc[pd.isnull(weather['max_u_humidity']), 'max_u_humidity'] = 0
weather.loc[pd.isnull(weather['avg_u_humidity']), 'avg_u_humidity'] = 0

# все отрицательные значения относительной влажности приводятся к 0
weather.loc[(weather['min_u_humidity'] < 0), 'min_u_humidity'] = 0
weather.loc[(weather['max_u_humidity'] < 0), 'max_u_humidity'] = 0
weather.loc[(weather['avg_u_humidity'] < 0), 'avg_u_humidity'] = 0

In [11]:
## Заполнение Nan-значений
# Скорость ветра
# -замена пустых значений средними многолетними значениями по соответствующим неделям
weather.loc[pd.isnull(weather['min_wind_speed']), 'min_wind_speed'] = weather.loc[pd.isnull(weather['min_wind_speed']), 'm_min_wind_speed']
weather.loc[pd.isnull(weather['max_wind_speed']), 'max_wind_speed'] = weather.loc[pd.isnull(weather['max_wind_speed']), 'm_max_wind_speed']
weather.loc[pd.isnull(weather['avg_wind_speed']), 'avg_wind_speed'] = weather.loc[pd.isnull(weather['avg_wind_speed']), 'm_avg_wind_speed']

# пустые значения максимального порыва ветра до измерения приводятся к 0
weather.loc[pd.isnull(weather['min_wind_gust_before']), 'min_wind_gust_before'] = 0
weather.loc[pd.isnull(weather['max_wind_gust_before']), 'max_wind_gust_before'] = 0
weather.loc[pd.isnull(weather['avg_wind_gust_before']), 'avg_wind_gust_before'] = 0

# пустые значения максимального порыва ветра между измерениями приводятся к 0
weather.loc[pd.isnull(weather['min_wind_gust_between']), 'min_wind_gust_between'] = 0
weather.loc[pd.isnull(weather['max_wind_gust_between']), 'max_wind_gust_between'] = 0
weather.loc[pd.isnull(weather['avg_wind_gust_between']), 'avg_wind_gust_between'] = 0

# -приведение показателя из суммы скорости ветра по часам к сумме скорости ветра по дням
weather.loc[pd.notnull(weather['sum_wind_speed']), 'sum_wind_speed'] = weather.loc[pd.notnull(weather['sum_wind_speed']), 'sum_wind_speed']/8

In [12]:
## Заполнение Nan-значений
# Температура почвы
# -замена через зависимости от других показателей
weather.loc[pd.isnull(weather['max_ground_temp']), 'max_ground_temp'] = weather.loc[pd.isnull(weather['max_ground_temp']), 'max_air_temp']*1.11
weather.loc[pd.isnull(weather['min_ground_temp']), 'min_ground_temp'] = weather.loc[pd.isnull(weather['min_ground_temp']), 'min_air_temp']*0.90
weather.loc[pd.isnull(weather['avg_ground_temp']), 'avg_ground_temp'] = weather.loc[pd.isnull(weather['avg_ground_temp']), 'avg_air_temp']*0.98

In [13]:
## Заполнение Nan-значений
# Осадки
# пустые значения кол-ва осадков приводятся к 0
weather.loc[pd.isnull(weather['sum_precipitation']), 'sum_precipitation'] = 0
weather.loc[pd.isnull(weather['avg_precipitation']), 'avg_precipitation'] = 0
weather.loc[pd.isnull(weather['avg_precipitation_time']), 'avg_precipitation_time'] = 0

In [14]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394322 entries, 0 to 394321
Data columns (total 82 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   date                       394322 non-null  datetime64[ns]
 1   years                      394322 non-null  float64       
 2   min_air_temp               394322 non-null  float64       
 3   max_air_temp               394322 non-null  float64       
 4   avg_air_temp               394322 non-null  float64       
 5   sum_air_temp               394322 non-null  float64       
 6   min_temp                   394322 non-null  float64       
 7   sum_min_temperature        394322 non-null  float64       
 8   max_temp                   394322 non-null  float64       
 9   sum_max_temperature        394322 non-null  float64       
 10  min_dew_point_temp         394322 non-null  float64       
 11  max_dew_point_temp         394322 non-null  float64 

### Добавление новых features

In [15]:
# Функция для расчета гидротермического коэффициента Селянинова (ГТК)
def gtk_calc(row):
    try:
        gtk = (row.sum_precipitation*10)/row.sum_air_temp
    except ZeroDivisionError:
        gtk = 0
    return gtk

In [16]:
# Добавление нового столбца с значением ГТК
weather['GTK'] = weather.apply(gtk_calc, axis=1)

In [17]:
# Добавление новых столбцов диапазон значений от min до max
weather['diff_air_temp'] = weather['max_air_temp']-weather['min_air_temp'] # температура воздуха
weather['diff_po_press'] = weather['max_po_press']-weather['min_po_press'] # атмосферное давление
weather['diff_baric_tendency'] = weather['max_baric_tendency']-weather['min_baric_tendency'] # барическая тенденция
weather['diff_u_humidity'] = weather['max_u_humidity']-weather['min_u_humidity'] # относительная влажность
weather['diff_cloudiness'] = weather['max_cloudiness']-weather['min_cloudiness'] # общая облачность
weather['diff_height_clouds'] = weather['max_height_clouds']-weather['min_height_clouds'] # высота облаков

### Удаление лишних "рабочих" столбцов и сохранение данных в файл

In [18]:
## Удаление лишних столбцов
# удаление колонок со средними значениями (для заполнения значений Nan)
weather.drop(columns=drop_col[2:], inplace=True)

In [19]:
## Информация о наборе данных
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394322 entries, 0 to 394321
Data columns (total 72 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   date                       394322 non-null  datetime64[ns]
 1   years                      394322 non-null  float64       
 2   min_air_temp               394322 non-null  float64       
 3   max_air_temp               394322 non-null  float64       
 4   avg_air_temp               394322 non-null  float64       
 5   sum_air_temp               394322 non-null  float64       
 6   min_temp                   394322 non-null  float64       
 7   sum_min_temperature        394322 non-null  float64       
 8   max_temp                   394322 non-null  float64       
 9   sum_max_temperature        394322 non-null  float64       
 10  min_dew_point_temp         394322 non-null  float64       
 11  max_dew_point_temp         394322 non-null  float64 

In [20]:
## Сохранение данных в файл
file_csv = pd.DataFrame(weather).to_csv("DATA/Weather.csv.gz", sep=';', index=False, compression="gzip")

In [21]:
del sets

## 2. Создание набора данных для обучения предиктивной модели

In [22]:
## Чтение данных
weather = pd.read_csv('DATA/Weather.csv.gz', compression='gzip', sep=';', header=0, quotechar='"')

In [23]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394322 entries, 0 to 394321
Data columns (total 72 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   date                       394322 non-null  object 
 1   years                      394322 non-null  float64
 2   min_air_temp               394322 non-null  float64
 3   max_air_temp               394322 non-null  float64
 4   avg_air_temp               394322 non-null  float64
 5   sum_air_temp               394322 non-null  float64
 6   min_temp                   394322 non-null  float64
 7   sum_min_temperature        394322 non-null  float64
 8   max_temp                   394322 non-null  float64
 9   sum_max_temperature        394322 non-null  float64
 10  min_dew_point_temp         394322 non-null  float64
 11  max_dew_point_temp         394322 non-null  float64
 12  avg_dew_point_temp         394322 non-null  float64
 13  sum_dew_point_temp         39

In [24]:
## Сортировка датафрейма по дате и метеостанциям
weather = weather.sort_values(by=['meteoid', 'date'], ascending=True).reset_index()
weather.drop(columns='index', inplace=True)
weather.head()

Unnamed: 0,date,years,min_air_temp,max_air_temp,avg_air_temp,sum_air_temp,min_temp,sum_min_temperature,max_temp,sum_max_temperature,...,avg_soil_surface_withsnow,meteoid,week_num,GTK,diff_air_temp,diff_po_press,diff_baric_tendency,diff_u_humidity,diff_cloudiness,diff_height_clouds
0,2005-01-31,2005.0,-27.3,-13.5,-20.81087,-119.6625,-27.7,-150.6,-13.5,-187.9,...,1.304348,23847,5,-0.008357,13.8,15.1,0.0,11.0,1.0,2500
1,2005-02-07,2005.0,-33.7,-11.9,-21.003704,-141.775,-33.8,-169.1,-9.5,-226.7,...,0.925926,23847,6,-0.338565,21.8,15.9,0.0,25.0,1.0,2500
2,2005-02-14,2005.0,-30.0,-14.0,-23.36,-160.6,-30.1,-186.0,-12.5,-248.6,...,1.272727,23847,7,-0.059153,16.0,40.7,0.0,15.0,1.0,2500
3,2005-02-21,2005.0,-33.4,-7.5,-16.701818,-114.825,-33.4,-147.6,-7.6,-169.2,...,1.25,23847,8,-0.357065,25.9,29.4,0.0,32.0,1.0,2500
4,2005-02-28,2005.0,-30.4,0.3,-12.898182,-88.675,-27.0,-132.1,0.3,-103.4,...,1.090909,23847,9,-2.063716,30.7,27.9,0.0,41.0,1.0,2500


In [25]:
# Добавление столбца суммарная облачность
weather['avg_all_clouds'] = weather['avg_cl_clouds']+weather['avg_cm_clouds']+weather['avg_ch_clouds']

In [26]:
## Удаление лишних столбцов
# список столбцов на удаление
drop_col2 = [
    'date',
    'min_temp', 'sum_min_temperature',
    'max_temp', 'sum_max_temperature',
    'min_dew_point_temp', 'max_dew_point_temp', 'avg_dew_point_temp', 'sum_dew_point_temp',
    'max_ground_temp', 'avg_ground_temp', 'sum_min_ground_temp',
    'min_p_press', 'max_p_press', 'avg_p_press',
    'min_wind_gust_before', 'max_wind_gust_before', 'avg_wind_gust_before',
    'min_wind_gust_between', 'avg_wind_gust_between',
    'min_nh_clouds', 'max_nh_clouds',
    'min_visibility_range', 'max_visibility_range', 'avg_visibility_range',
    'avg_precipitation_time',
    'avg_soil_surface_withsnow',
    'avg_cl_clouds', 'avg_cm_clouds', 'avg_ch_clouds',
    'diff_air_temp', 'diff_po_press', 'diff_baric_tendency', 'diff_u_humidity', 'diff_cloudiness', 'diff_height_clouds',
    'GTK',
    'week_num'
]
## Удаление лишних столбцов
weather.drop(columns=drop_col2, inplace=True)

In [27]:
## Добавление столбца фаза развития растений
weather['phase'] = np.nan

In [28]:
## Информация о наборе данных
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394322 entries, 0 to 394321
Data columns (total 36 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   years                   394322 non-null  float64
 1   min_air_temp            394322 non-null  float64
 2   max_air_temp            394322 non-null  float64
 3   avg_air_temp            394322 non-null  float64
 4   sum_air_temp            394322 non-null  float64
 5   min_ground_temp         394322 non-null  float64
 6   min_po_press            394322 non-null  float64
 7   max_po_press            394322 non-null  float64
 8   avg_po_press            394322 non-null  float64
 9   min_baric_tendency      394322 non-null  float64
 10  max_baric_tendency      394322 non-null  float64
 11  avg_baric_tendency      394322 non-null  float64
 12  min_u_humidity          394322 non-null  float64
 13  max_u_humidity          394322 non-null  float64
 14  avg_u_humidity      

In [29]:
## Создать сэмпл по годам и метеостанциям
season_data = []

for m in weather['meteoid'].unique():
    for y in weather['years'].unique():
        try:
            # создаем датафрейм для сезона (старт с недели, где min температура почвы от +8°C и выше)
            ds = weather.loc[(weather['years'] == y) & (weather['meteoid'] == m)]
            i_start = int(min(ds.loc[(ds['min_ground_temp'] >= 8.)].index))
            i_end = i_start+21
            season = ds.loc[i_start:i_end].reset_index()
            season['week_num'] = season.index+1
            season.drop(columns=['index'], inplace=True)
            # добавление переменной - фаза вегетации подсолнечника
            season.loc[(season['week_num'] <= 2), 'phase'] = 1
            season.loc[(season['week_num'] >= 3) & (season['week_num'] <= 7), 'phase'] = 2
            season.loc[(season['week_num'] >= 8) & (season['week_num'] <= 11), 'phase'] = 3
            season.loc[(season['week_num'] >= 12) & (season['week_num'] <= 16), 'phase'] = 4
            season.loc[(season['week_num'] >= 17), 'phase'] = 5
            season_data.append(season)
        except:
            pass      
weather = pd.concat(season_data, ignore_index=True)

In [30]:
## Удаление неполных данных
# Определяем какие данные не полные (если за указаный год по какой-либо метеостанции данных меньше чем за 22 недели)
max_week = weather.groupby(['years', 'meteoid']).agg(max_week=('week_num', 'max'))
max_week = max_week.loc[(max_week['max_week'] == 22)].reset_index() # определяем max номера недели

# Удаляем неполные данные из датафрейма
weather = pd.merge(weather, max_week, how='left', on=['years', 'meteoid'])
weather.dropna(how='any', axis=0, inplace=True)

In [31]:
# Удаление промежуточных данных (для очистки оперативной памяти)
del season_data

In [32]:
## Создание датафрейма с необходимыми переменными (с группировкой по годам, метеостанциям и фазам вегетации) 
weather = weather.loc[(weather['phase'] != 5)]
weather = weather.groupby(['years', 'meteoid', 'phase']).agg(
    # значения min для переменных
    min_air_temp=('min_air_temp', np.min),
    min_ground_temp=('min_ground_temp', np.min),
    min_po_press=('min_po_press', np.min),
    min_baric_tendency=('min_baric_tendency', np.min),
    min_u_humidity=('min_u_humidity', np.min),
    min_wind_speed=('min_wind_speed', np.min),
    min_cloudiness=('min_cloudiness', np.min),
    min_height_clouds=('min_height_clouds', np.min),

    # значения max для переменных
    max_air_temp=('max_air_temp', np.max),
    max_po_press=('max_po_press', np.max),
    max_baric_tendency=('max_baric_tendency', np.max),
    max_u_humidity=('max_u_humidity', np.max),
    max_wind_speed=('max_wind_speed', np.max),
    max_wind_gust_between=('max_wind_gust_between', np.max),
    max_cloudiness=('max_cloudiness', np.max),
    max_height_clouds=('max_height_clouds', np.max),
    
    # средние значения для переменных
    avg_air_temp=('avg_air_temp', np.mean),
    avg_po_press=('avg_po_press', np.mean),
    avg_baric_tendency=('avg_baric_tendency', np.mean),
    wind_rumb_radians=('wind_rumb_radians', np.mean),
    avg_u_humidity=('avg_u_humidity', np.mean),
    avg_wind_speed=('avg_wind_speed', np.mean),
    avg_cloudiness=('avg_cloudiness', np.mean),
    avg_nh_clouds=('avg_nh_clouds', np.mean),
    avg_height_clouds=('avg_height_clouds', np.mean),
    avg_precipitation=('avg_precipitation', np.mean),
    avg_soil_surface=('avg_soil_surface', np.mean),
    avg_all_clouds=('avg_all_clouds', np.mean),
    
    # кумулятивные значения для переменных
    sum_air_temp=('sum_air_temp', np.sum),
    sum_wind_speed=('sum_wind_speed', np.sum),
    sum_precipitation=('sum_precipitation', np.sum),
    sum_precipitation_time=('sum_precipitation_time', np.sum)
)
weather.reset_index(inplace=True) # перевод индексов группировки в столбцы

In [33]:
## Создание временного датафрейма с итоговыми значениями для всего сезона вегетации (итоговые значения для всех 5 фаз) 
total_data = weather.groupby(['years', 'meteoid']).agg(
    # значения min для итоговых переменных
    total_min_air_temp=('min_air_temp', np.min),
    total_min_ground_temp=('min_ground_temp', np.min),
    total_min_po_press=('min_po_press', np.min),
    total_min_baric_tendency=('min_baric_tendency', np.min),
    total_min_u_humidity=('min_u_humidity', np.min),
    total_min_wind_speed=('min_wind_speed', np.min),
    total_min_cloudiness=('min_cloudiness', np.min),
    total_min_height_clouds=('min_height_clouds', np.min),

    # значения max для итоговых переменных
    total_max_air_temp=('max_air_temp', np.max),
    total_max_po_press=('max_po_press', np.max),
    total_max_baric_tendency=('max_baric_tendency', np.max),
    total_max_u_humidity=('max_u_humidity', np.max),
    total_max_wind_speed=('max_wind_speed', np.max),
    total_max_wind_gust_between=('max_wind_gust_between', np.max),
    total_max_cloudiness=('max_cloudiness', np.max),
    total_max_height_clouds=('max_height_clouds', np.max),
    
    # средние значения для итоговых переменных
    total_avg_air_temp=('avg_air_temp', np.mean),
    total_avg_po_press=('avg_po_press', np.mean),
    total_avg_baric_tendency=('avg_baric_tendency', np.mean),
    total_wind_rumb_radians=('wind_rumb_radians', np.mean),
    total_avg_u_humidity=('avg_u_humidity', np.mean),
    total_avg_wind_speed=('avg_wind_speed', np.mean),
    total_avg_cloudiness=('avg_cloudiness', np.mean),
    total_avg_nh_clouds=('avg_nh_clouds', np.mean),
    total_avg_height_clouds=('avg_height_clouds', np.mean),
    total_avg_precipitation=('avg_precipitation', np.mean),
    total_avg_soil_surface=('avg_soil_surface', np.mean),
    total_avg_all_clouds=('avg_all_clouds', np.mean),
    
    # кумулятивные значения для итоговых переменных
    total_sum_air_temp=('sum_air_temp', np.sum),
    total_sum_wind_speed=('sum_wind_speed', np.sum),
    total_sum_precipitation=('sum_precipitation', np.sum),
    total_sum_precipitation_time=('sum_precipitation_time', np.sum)
)
total_data.reset_index(inplace=True) # перевод индексов группировки в столбцы
total_data.rename(columns={"years": "year"}, inplace=True) # переименование столбца

### Добавление новых features

In [34]:
# Функция для расчета гидротермического коэффициента Селянинова (ГТК)
def gtk_calc(row):
    try:
        gtk = (row.sum_precipitation*10)/row.sum_air_temp
    except ZeroDivisionError:
        gtk = 0
    return gtk

In [35]:
# Функция для расчета скорости выпадения осадков
def precipitation_speed_calc(row):
    try:
        precipitation_speed = row.sum_precipitation/row.sum_precipitation_time
    except ZeroDivisionError:
        precipitation_speed = 0
    return precipitation_speed

In [36]:
# Функция для расчета показателя оценки суховея
def dry_wind_calc(row):
    try:
        dry_wind = (row.avg_wind_speed*row.avg_air_temp)/row.avg_u_humidity
    except ZeroDivisionError:
        dry_wind = 0
    return dry_wind

In [37]:
# Добавление нового столбца с значением ГТК
weather['GTK'] = weather.apply(gtk_calc, axis=1)

# Добавление нового столбца скорость выпадения осадков
weather['precipitation_speed'] = weather.apply(precipitation_speed_calc, axis=1)

# Добавление нового столбца суховей
weather['dry_wind'] = weather.apply(dry_wind_calc, axis=1)

  precipitation_speed = row.sum_precipitation/row.sum_precipitation_time
  precipitation_speed = row.sum_precipitation/row.sum_precipitation_time


In [38]:
# Добавление новых столбцов диапазон значений от min до max
weather['diff_air_temp'] = weather['max_air_temp']-weather['min_air_temp'] # температура воздуха
weather['diff_po_press'] = weather['max_po_press']-weather['min_po_press'] # атмосферное давление
weather['diff_baric_tendency'] = weather['max_baric_tendency']-weather['min_baric_tendency'] # барическая тенденция
weather['diff_u_humidity'] = weather['max_u_humidity']-weather['min_u_humidity'] # относительная влажность
weather['diff_cloudiness'] = weather['max_cloudiness']-weather['min_cloudiness'] # общая облачность
weather['diff_height_clouds'] = weather['max_height_clouds']-weather['min_height_clouds'] # высота облаков

In [39]:
# Добавление новых столбцов (для итоговых значений)
total_data['total_GTK'] = (total_data['total_sum_precipitation']*10)/total_data['total_sum_air_temp']
total_data['total_precipitation_speed'] = total_data['total_sum_precipitation']/total_data['total_sum_precipitation_time']
total_data['total_dry_wind'] = (total_data['total_avg_wind_speed']*total_data['total_avg_air_temp'])/total_data['total_avg_u_humidity'] 

In [40]:
# Добавление новых столбцов диапазон значений от min до max (для итоговых значений)
total_data['total_diff_air_temp'] = total_data['total_max_air_temp']-total_data['total_min_air_temp'] # температура воздуха
total_data['total_diff_po_press'] = total_data['total_max_po_press']-total_data['total_min_po_press'] # атмосферное давление
total_data['total_diff_baric_tendency'] = total_data['total_max_baric_tendency']-total_data['total_min_baric_tendency'] # барическая тенденция
total_data['total_diff_u_humidity'] = total_data['total_max_u_humidity']-total_data['total_min_u_humidity'] # относительная влажность
total_data['total_diff_cloudiness'] = total_data['total_max_cloudiness']-total_data['total_min_cloudiness'] # общая облачность
total_data['total_diff_height_clouds'] = total_data['total_max_height_clouds']-total_data['total_min_height_clouds'] # высота облаков

### Создание датафрейма для обучения и тестирования модели

In [41]:
## Создать сэмпл по годам и метеостанциям
season_data = []
ph = int(len(weather['phase'].unique()))

for m in weather['meteoid'].unique():
    for y in weather['years'].unique():
        ds = weather.loc[(weather['years'] == y) & (weather['meteoid'] == m)]
        elem = []
        for ph in weather['phase'].unique():
            season = ds.loc[(ds['phase'] == ph)]
            elem+=season.values.tolist()
        res = [y for x in elem for y in x]
        season_data.append(res)

In [42]:
## Создание набора данных для обучения и тестирования предиктивной модели
# создание списка с наименованием столбцов
col = []

for phase in range(1,5):
    for items in weather.columns:
        c = 'ph{}_{}'.format(phase, items)
        col.append(c)
        
# создание датафрейма для ML-модели
weather = pd.DataFrame(season_data, columns = col)
weather.drop_duplicates(keep='first', inplace=True, ignore_index=True)
weather.dropna(how='any', axis=0, inplace=True)
weather.tail()

Unnamed: 0,ph1_years,ph1_meteoid,ph1_phase,ph1_min_air_temp,ph1_min_ground_temp,ph1_min_po_press,ph1_min_baric_tendency,ph1_min_u_humidity,ph1_min_wind_speed,ph1_min_cloudiness,...,ph4_sum_precipitation_time,ph4_GTK,ph4_precipitation_speed,ph4_dry_wind,ph4_diff_air_temp,ph4_diff_po_press,ph4_diff_baric_tendency,ph4_diff_u_humidity,ph4_diff_cloudiness,ph4_diff_height_clouds
7092,2021.0,28903.0,1.0,7.9,9.0,743.7,0.0,8.0,0.0,0.0,...,840.0,0.707987,0.066667,0.385257,37.0,21.3,0.0,90.0,1.0,2500.0
7093,2020.0,28925.0,1.0,4.9,5.0,740.7,0.0,21.0,0.0,0.0,...,840.0,0.463605,0.02506,0.429612,33.0,34.1,0.0,79.0,1.1,2500.0
7094,2021.0,28925.0,1.0,8.5,7.0,736.0,0.0,10.0,0.0,0.0,...,804.0,0.160019,0.014988,1.074853,36.1,20.2,0.0,83.0,1.0,2500.0
7095,2020.0,34723.0,1.0,8.9,8.01,751.1,0.0,24.0,0.0,0.0,...,840.0,0.669159,0.067381,0.441897,23.4,12.0,0.0,82.0,0.0,0.0
7096,2021.0,34723.0,1.0,13.5,12.15,749.4,0.0,22.0,0.0,0.0,...,816.0,0.646391,0.067034,0.418856,24.7,14.0,0.0,75.0,0.0,0.0


In [43]:
# Удаление промежуточных данных (для очистки оперативной памяти)
del season_data

In [44]:
## Удаление неинформативных столбцов
# создание списка неинформативных столбцов
drop_col = []

for phase in range(2,5):
    for items in ['years', 'meteoid', 'phase']:
        c = 'ph{}_{}'.format(phase, items)
        drop_col.append(c)
        
# удаление столбцов
weather.drop(columns=drop_col, inplace=True)
weather.drop(columns=['ph1_phase'], inplace=True)
weather.rename(columns={"ph1_years": "year", "ph1_meteoid": "meteoid"}, inplace=True)

In [45]:
# Удаляем неполные данные из датафрейма
weather = pd.merge(weather, total_data, how='left', on=['year', 'meteoid'])
weather.dropna(how='any', axis=0, inplace=True)

In [46]:
## Информация о наборе данных
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6649 entries, 0 to 6648
Columns: 207 entries, year to total_diff_height_clouds
dtypes: float64(204), int64(3)
memory usage: 10.6 MB


In [47]:
## Сохранение набора данных для модели в файл 
file_phase = weather.to_csv('DATA/weather_data_phase4.csv.gz', sep=';', index=False, compression="gzip")
file_total = total_data.to_csv('DATA/weather_data_total4.csv.gz', sep=';', index=False, compression="gzip")