In [2]:
import warnings
import numpy as np
import pandas as pd
import copy
from pathlib import Path
import torch
import torch.nn as nn

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

import pytorch_forecasting
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss

import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile


device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')  

train_data = pd.read_csv('rosneft/train_data.csv')
gtm = pd.read_csv('rosneft/gtm.csv')
coords = pd.read_csv('rosneft/coords.csv')
sampl_sab = pd.read_csv('rosneft/sample_submission.csv')
train_data["MEASURED_IN_DATE"] = pd.to_datetime(train_data["MEASURED_IN_DATE"])

all_data = pd.read_csv('rosneft/gotov_ebat.csv', index_col='Unnamed: 0')
all_data["MEASURED_IN_DATE"] = pd.to_datetime(all_data["MEASURED_IN_DATE"])
all_data_w_d = all_data.drop(columns=['MEASURED_IN_DATE'])

                not been set for this class (SMAPE). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                


In [3]:
transformer_to_do = dict()

for i in all_data['WELL_NAME'].unique():
    df = all_data[all_data['WELL_NAME'] == i]
    need_range = pd.date_range(start=df.MEASURED_IN_DATE.max() + np.timedelta64(1, 'D')
                               , end='2022-09-29')
    transformer_to_do[i] = len(need_range)

transformer_to_do_list = [
    i for i in all_data['WELL_NAME'].unique() if transformer_to_do[i] <= 122
]
rf_to_do_dict = {
    i: transformer_to_do[i] for i in all_data['WELL_NAME'].unique() if transformer_to_do[i] > 122
}

In [4]:
def zdorovaya_dura_LR(all_data_w_d, batch_size, max_prediction_length=122):
    #L_R
    training = TimeSeriesDataSet(
        data=all_data_w_d, 
        time_idx='time_id',
        target='LIQ_RATE',
        group_ids=['WELL_NAME'],
        allow_missing_timesteps=True,
        add_relative_time_idx=True,
        add_target_scales=True,
        static_categoricals = ['WELL_NAME'],
        time_varying_known_reals=["time_id"],
        time_varying_unknown_reals=[
        "LIQ_RATE",
        "WATER_CUT",
        "CHARWORK",
        "INTAKE",
        ],
        max_prediction_length=max_prediction_length,
        min_encoder_length=1,
        max_encoder_length=2128,
        target_normalizer=GroupNormalizer(
            groups=['WELL_NAME'], transformation="softplus"
        ),
#         lags={'LIQ_RATE': [7, 30, 180]},
    )
    
    validation = TimeSeriesDataSet.from_dataset(training, all_data_w_d
                                                , predict=True, stop_randomization=True)
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10
                                              , num_workers=0)
    
    lr_logger = LearningRateMonitor()
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4
                                        , patience=10, verbose=False, mode="min")
    logger = TensorBoardLogger("lightning_logs")

    trainer = pl.Trainer(
        max_epochs=300,
        gpus=1,
        enable_model_summary=True,
        gradient_clip_val=0.1,
        limit_train_batches=30,
        callbacks=[lr_logger, early_stop_callback],
        logger=logger,
#         default_root_dir = "rosneft/TFT/LR/"
    )

    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=0.01,
        hidden_size=16,
        attention_head_size=1,
        dropout=0.1,
        hidden_continuous_size=8,
        loss=QuantileLoss(),
        log_interval=10,
        output_size=7,
        reduce_on_plateau_patience=4,
    )
    
#     tft = TemporalFusionTransformer \
#             .load_from_checkpoint('lightning_logs/lightning_logs/version_46/checkpoints/epoch=99-step=3000.ckpt')
    
    trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    )
    
    return tft, tft.predict(all_data_w_d)

tft_LR, pred_LR = zdorovaya_dura_LR(all_data_w_d, 128)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 2.9 K 
3  | prescalers                         | ModuleDict                      | 144   
4  | static_variable_selection          | VariableSelectionNetwork        | 1.9 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 3.7 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 1.2 K 
7  | static_context_variable_selection  | GatedResid

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [5]:
def zdorovaya_dura_WC(all_data_w_d, batch_size, max_prediction_length=122):
    #W_C
    training = TimeSeriesDataSet(
        data=all_data_w_d, 
        time_idx='time_id',
        target='WATER_CUT',
        group_ids=['WELL_NAME'],
        allow_missing_timesteps=True,
        add_relative_time_idx=True,
        add_target_scales=True,
        static_categoricals = ['WELL_NAME'],
        time_varying_known_reals=["time_id"],
        time_varying_unknown_reals=[
        "LIQ_RATE",
        "WATER_CUT",
        "CHARWORK",
        "INTAKE",
        ],
        max_prediction_length=max_prediction_length,
        min_encoder_length=1,
        target_normalizer=GroupNormalizer(
            groups=['WELL_NAME'], transformation="softplus"
        ),
        
    )
    
    validation = TimeSeriesDataSet.from_dataset(training, all_data_w_d
                                                , predict=True, stop_randomization=True)
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False
                                              , batch_size=batch_size * 10, num_workers=0)
    
    lr_logger = LearningRateMonitor()
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4
                                        , patience=10, verbose=False, mode="min")
    logger = TensorBoardLogger("lightning_logs")

    trainer = pl.Trainer(
        max_epochs=300,
        gpus=1,
        enable_model_summary=True,
        gradient_clip_val=0.1,
        limit_train_batches=30,
        callbacks=[lr_logger, early_stop_callback],
        logger=logger,
        default_root_dir = "rosneft/TFT/WC/"
    )

    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=0.01,
        hidden_size=16,
        attention_head_size=1,
        dropout=0.1,
        hidden_continuous_size=8,
        loss=QuantileLoss(),
        log_interval=10,
        output_size=7,
        reduce_on_plateau_patience=4,
    )
    
    trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    )
#     tft = TemporalFusionTransformer \
#             .load_from_checkpoint('lightning_logs/lightning_logs/version_47/checkpoints/epoch=99-step=3000.ckpt')
    return tft, tft.predict(all_data_w_d)

tft_WC, pred_WC = zdorovaya_dura_WC(all_data_w_d, 128)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 2.9 K 
3  | prescalers                         | ModuleDict                      | 144   
4  | static_variable_selection          | VariableSelectionNetwork        | 1.9 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 3.7 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 1.2 K 
7  | static_context_variable_selection  | GatedResid

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [6]:
# pd.DataFrame(pred_WC.detach().numpy()).transpose()
# pd.DataFrame(pred_LR.detach().numpy()).transpose()

In [7]:
answer = pd.DataFrame()
for i in transformer_to_do_list:
    df = all_data.copy()
    df = df[df['WELL_NAME'] == i]
    df.index = range(len(df))
    date_range = pd.date_range(start=df["MEASURED_IN_DATE"][len(df)-1] 
                               + np.timedelta64(1, 'D')
                               , end=df["MEASURED_IN_DATE"][len(df)-1] 
                               + np.timedelta64(122, 'D'))
    df = df.drop(columns=['MEASURED_IN_DATE'])
    pred_LR = tft_LR.predict(df)
    pred_WC = tft_WC.predict(df)
    pred_df = pd.DataFrame()
    pred_df['MEASURED_IN_DATE'] = date_range
    pred_df['WELL_NAME'] = i
    pred_df['LIQ_RATE'] = pred_LR.detach().numpy()[0]
    pred_df['WATER_CUT'] = pred_WC.detach().numpy()[0]
    answer = pd.concat([answer, pred_df])
# answer

Random forest

In [8]:
all_data

Unnamed: 0,MEASURED_IN_DATE,CHARWORK,LIQ_RATE,WATER_CUT,P_ZAB,INTAKE,WELL_NAME,time_id
17437,2016-08-02,0.0,690.0,0.5,159.19,0.0,aebc60b8446bd0a77ff9f51006a8d675f29b1a90d5fac2...,0
17438,2016-08-03,0.0,690.0,0.5,159.19,0.0,aebc60b8446bd0a77ff9f51006a8d675f29b1a90d5fac2...,1
17439,2016-08-04,0.0,690.0,0.5,153.19,0.0,aebc60b8446bd0a77ff9f51006a8d675f29b1a90d5fac2...,2
17440,2016-08-05,0.0,780.0,0.5,151.21,0.0,aebc60b8446bd0a77ff9f51006a8d675f29b1a90d5fac2...,3
17441,2016-08-06,0.0,780.0,0.5,151.21,0.0,aebc60b8446bd0a77ff9f51006a8d675f29b1a90d5fac2...,4
...,...,...,...,...,...,...,...,...
104377,2022-05-31,0.0,339.0,70.0,55.69,0.0,18ea8ca26bcabaac4fd073a00ba741a627f3649a6983a7...,2128
53662,2022-05-31,0.0,225.0,58.0,75.48,0.0,b04d54ff8a670262e81f0e0b514a32e03baa85200cd9f2...,2128
54379,2022-05-31,0.0,170.0,91.0,178.15,0.0,b694e479af091b82198c24a43195efbb9578a4f0e2e163...,2128
104892,2022-05-31,0.0,208.0,88.0,113.02,0.0,139e20fe864acf1a4b831ac25d417fe70725cbee85197c...,2128


In [9]:
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, RandomizedSearchCV

In [10]:
def plot_train_valid(model, X_train, y_train, X_test, y_test):
    train_indice = range(X_train.shape[0])
    test_indice = range(X_tran.shape[0], X_train.shape[0] + X_test.shape[0])
    
    plt.figure(figsize=(12, 6))
    sns.lineplot(x=train_indice, y=y_train)
    sns.lineplot(x=test_indice, y=model.predict(X_test))
    sns.lineplot(x=test_indice, y=y_test)
    
    plt.axvline(X.shape[0], linestyle='--', c='red')

    plt.legend(['Train', 'Valid', 'Prediction', 'Train/valid sep'])
    plt.show()

def predict_step_by_step(model, row, days):
    forecasting_for_days = []
    new_arr_to_predict = row.to_numpy().tolist()
    for i in range(days):
        tensor_to_predict = np.array(new_arr_to_predict)

        predict = model.predict(tensor_to_predict)[0]
        forecasting_for_days.append(predict)
        new_arr_to_predict = [new_arr_to_predict[0][:-1]]
        new_arr_to_predict[0].insert(0, forecasting_for_days[-1])
    
    return forecasting_for_days

def lag_features_liquid_rate(df, lags):
    for lag in lags:
        df[f"lag_t-{lag}"] = df.groupby(["CHARWORK"])["LIQ_RATE"].transform(
            lambda x: x.shift(lag))
    return df

def lag_features_water_cut(df, lags):
    for lag in lags:
        df[f"lag_t-{lag}"] = df.groupby(["CHARWORK"])["WATER_CUT"].transform(
            lambda x: x.shift(lag))
    return df

In [11]:
from sklearn.neighbors import KNeighborsRegressor

LAG_WINDOW_SIZE = 30

# Рустам - для вас 

# train_data = pd.read_csv('rosneft/train_data.csv')
# sampl_sab = pd.read_csv('rosneft/sample_submission.csv')


train_data = pd.read_csv('rosneft/train_data.csv')
sampl_sab = pd.read_csv('rosneft/sample_submission.csv')
train_data["MEASURED_IN_DATE"] = pd.to_datetime(train_data["MEASURED_IN_DATE"])

cols_to_select = ['MEASURED_IN_DATE', 'WELL_NAME', 'CHARWORK', 'LIQ_RATE', 'WATER_CUT',
       'P_ZAB', 'INTAKE']
train_data = train_data[cols_to_select]


# Filling date gaps with KNN
knn_train = train_data.dropna().drop(columns=['WELL_NAME', 'MEASURED_IN_DATE'
                                     , 'P_ZAB', 'LIQ_RATE', 'WATER_CUT'])
knn_train.CHARWORK = knn_train.CHARWORK.apply(lambda x: int(x == 'НАГ'))
X, y = knn_train, train_data.dropna().P_ZAB
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X, y)


knn_na = train_data[~(train_data.index.isin(knn_train.index))] \
            .drop(columns=['WELL_NAME', 'MEASURED_IN_DATE'
                            , 'P_ZAB', 'LIQ_RATE', 'WATER_CUT'])
knn_na.CHARWORK = knn_na.CHARWORK.apply(lambda x: int(x == 'НАГ'))
preds = knn.predict(knn_na)


k = 0
for i, row in train_data.iterrows():
    if i in knn_na.index:
        train_data.at[i, 'P_ZAB'] = preds[k]
        k += 1
train_data.CHARWORK = train_data.CHARWORK.apply(lambda x: int(x == 'НАГ'))
well_names = train_data['WELL_NAME'].unique()


# Get missing days for each well
date_miss_dict = dict()
for i in well_names:
    df = train_data[train_data.WELL_NAME == i]
    df.index = range(len(df))
    date_range = pd.date_range(start=df.MEASURED_IN_DATE[0]
                               , end=df["MEASURED_IN_DATE"][len(df)-1])
    date_train_i, date_i, date_miss = 0, 0, []
    while date_train_i < len(df["MEASURED_IN_DATE"]) and date_i < len(date_range):
        if df.MEASURED_IN_DATE[date_train_i] == date_range[date_i]:
            date_i += 1
            date_train_i += 1
        else:
            date_i += 1
            date_miss.append(date_range[date_i])
    date_miss_ser = pd.Series(date_miss)
    ans_df = df.merge(pd.DataFrame(date_miss_ser), how="outer"
                      , left_on="MEASURED_IN_DATE", right_on=0)
    ans_df = ans_df.drop(columns=[0, 'WELL_NAME'])
    date_miss_dict[i] = ans_df

# Fill missing days with rolling mean values    
for key in date_miss_dict.keys():
    date_miss_dict[key] = date_miss_dict[key].sort_values(by='MEASURED_IN_DATE')
    
    curr_length = 0
    max_missing_seq = -1

    for i in date_miss_dict[key].index:    
        if date_miss_dict[key].loc[[i]].isna().any().any():
            curr_length += 1
        else:
            max_missing_seq = max(curr_length, max_missing_seq)
            curr_length = 0
        i += 1  
        
    date_miss_dict[key] = date_miss_dict[key].fillna(
        date_miss_dict[key].rolling(max_missing_seq + 1, min_periods=1).mean()
    )

In [12]:
rf_date_miss_dict = {
    key: value for key, value in date_miss_dict.items() 
    if key in rf_to_do_dict.keys()
}

In [13]:
rf_to_do_dict

{'debcf7a160692239563af8a9c8ba32c4ba067e747f19c8073055e2be5aeb4022': 253,
 '114183900f2d540609911971f3ae3f3d42ab8b4fa9252d0c4076b00d482c1594': 216,
 '8d4b4b4f0140179b1c94722f3d39100bcde8f3c39415983813c74bd52cf1ab3c': 336,
 '2ceb0e2b3c28cc1b3c3f8dec7ad56148dbf6ea77f9ffc832f4c4b9ce845d9bb7': 133}

In [14]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'matthews_corrcoef', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_s

In [15]:
# Forecast for most wells

rf_forecasts_dict_WC = dict()
best_wc_models = []

for c, i in enumerate(rf_date_miss_dict):
    train_data = rf_date_miss_dict[i].drop(columns=['P_ZAB', 'INTAKE', 'LIQ_RATE'])
    train_with_lags = lag_features_water_cut(train_data, lags=[*range(1, LAG_WINDOW_SIZE + 1)])
    train_with_lags = train_with_lags.dropna()
    y = train_with_lags['WATER_CUT']
    if len(train_with_lags['MEASURED_IN_DATE']) == 0:
        continue
    else:
        last_date = np.array(train_with_lags['MEASURED_IN_DATE'])[-1]
        train_with_lags = train_with_lags.drop(columns=['WATER_CUT'
                                                        , 'MEASURED_IN_DATE', 'CHARWORK'])
        index_to_predict = pd.date_range(start=last_date + np.timedelta64(1, 'D')
                                         , end=last_date + np.timedelta64(rf_to_do_dict[i], 'D'))
        
            
        model = RandomForestRegressor(random_state=1984)
        params = {
            'n_estimators': [30, 60, 100],
            'max_depth': [10, 12, 15, None],
            'min_samples_split': [3, 5, 7, 10],
            'min_samples_leaf': [1, 3, 5],
            'random_state': [2077]
        }
        
        search = RandomizedSearchCV(model, params, scoring='neg_mean_squared_error'
                              , cv=TimeSeriesSplit(n_splits=5), n_jobs=-1, random_state=1488)
        
        search.fit(train_with_lags, y)
        
        forecast = pd.Series(predict_step_by_step(search, train_with_lags.tail(1)
                                                  , rf_to_do_dict[i])
                             , index=index_to_predict)
        rf_forecasts_dict_WC[i] = forecast
        
        
        print('Score:\t', -search.score(train_with_lags, y))
        best_wc_models.append(search.best_estimator_)

Score:	 0.5246519726694854
Score:	 6.7452785965768305
Score:	 19.659933076186658
Score:	 0.0499584358729787


In [16]:
rf_forecasts_dict_LR = dict()
best_lr_models = []

for c, i in enumerate(rf_date_miss_dict):
    train_data = rf_date_miss_dict[i].drop(columns=['P_ZAB', 'INTAKE', 'WATER_CUT'])
    train_with_lags = lag_features_liquid_rate(train_data
                                               , lags=[*range(1, LAG_WINDOW_SIZE + 1)])
    train_with_lags = train_with_lags.dropna()
    y = train_with_lags['LIQ_RATE']
    if len(train_with_lags['MEASURED_IN_DATE']) == 0:
        continue
    else:
        last_date = np.array(train_with_lags['MEASURED_IN_DATE'])[-1]
        train_with_lags = train_with_lags.drop(columns=['LIQ_RATE'
                                                        , 'MEASURED_IN_DATE', 'CHARWORK'])
        index_to_predict = pd.date_range(start=last_date + np.timedelta64(1, 'D')
                                         , end=last_date \
                                         + np.timedelta64(rf_to_do_dict[i], 'D'))
        
        model = RandomForestRegressor()
        params = {
            'n_estimators': [30, 60, 100],
            'max_depth': [10, 12, 15, None],
            'min_samples_split': [3, 5, 7, 10],
            'min_samples_leaf': [1, 3, 5],
            'random_state': [2077]
        }
        
        search = RandomizedSearchCV(model, params, scoring='neg_mean_absolute_error'
                              , cv=TimeSeriesSplit(n_splits=5), n_jobs=-1, random_state=66)
        
        search.fit(train_with_lags, y)
        
        forecast = pd.Series(predict_step_by_step(search, train_with_lags.tail(1)
                                                  , rf_to_do_dict[i])
                             , index=index_to_predict)
        rf_forecasts_dict_LR[i] = forecast
        
        
        print('Score:\t', -search.score(train_with_lags, y))
        best_lr_models.append(search.best_estimator_)

Score:	 2.1916459819760794
Score:	 1.8189068398639154
Score:	 0.621109189585627
Score:	 2.01984875541126


Скор на WATER_CUT пиздец

- Score:	 0.5341674113649033
- Score:	 6.745349770013871
- Score:	 19.606119368192207
- Score:	 0.09145253840877589

In [17]:
from sklearn.linear_model import Lasso
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error



def drop_constant_columns(df):
    return df.loc[:, (df != df.iloc[0]).any()]
    

cb_forecasts_dict_WC = dict()
best_wc_cbrs = []
scores = []

for c, i in enumerate(rf_date_miss_dict):
    train_data = rf_date_miss_dict[i].drop(columns=['P_ZAB', 'INTAKE', 'LIQ_RATE'])
    train_with_lags = lag_features_water_cut(train_data, lags=[*range(1, LAG_WINDOW_SIZE + 1)])
    train_with_lags = train_with_lags.dropna()
    y = train_with_lags['WATER_CUT']
    


    if len(train_with_lags['MEASURED_IN_DATE']) == 0:
        continue
    else:
        last_date = np.array(train_with_lags['MEASURED_IN_DATE'])[-1]
        train_with_lags = train_with_lags.drop(columns=['WATER_CUT'
                                                        , 'MEASURED_IN_DATE', 'CHARWORK'])
        train_with_lags = drop_constant_columns(train_with_lags)


        index_to_predict = pd.date_range(start=last_date + np.timedelta64(1, 'D')
                                         , end=last_date + np.timedelta64(rf_to_do_dict[i], 'D'))
        
        
        model = CatBoostRegressor(iterations=4000, task_type="GPU" ,
                           devices='0:1')

        params = {
            'random_seed': [2077],
#             'n_estimators': [115, 228, 300],
#             'max_depth': [5, 7, 9],
#             'learning_rate': [9e-3, 9e-2]
        }
        
#         search = GridSearchCV(model, params, scoring='neg_mean_absolute_error'
#                               , cv=TimeSeriesSplit(n_splits=5), n_jobs=-1)
#         search.fit(train_with_lags, y)
        model.fit(train_with_lags, y, plot=True)
        
        forecast = pd.Series(predict_step_by_step(model, train_with_lags.tail(1)
                                                  , rf_to_do_dict[i])
                             , index=index_to_predict)
        cb_forecasts_dict_WC[i] = forecast
    
        scores.append(-search.score(train_with_lags, y))
        best_wc_cbrs.append(search.best_estimator_)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



Learning rate set to 0.018176
0:	learn: 22.6898507	total: 16.2ms	remaining: 1m 4s
1:	learn: 22.2952579	total: 26.2ms	remaining: 52.4s
2:	learn: 21.9073467	total: 36ms	remaining: 48s
3:	learn: 21.5255408	total: 46.2ms	remaining: 46.2s
4:	learn: 21.1522983	total: 56.3ms	remaining: 45s
5:	learn: 20.7848121	total: 66.1ms	remaining: 44s
6:	learn: 20.4247276	total: 76ms	remaining: 43.4s
7:	learn: 20.0706896	total: 85.9ms	remaining: 42.9s
8:	learn: 19.7229471	total: 95.8ms	remaining: 42.5s
9:	learn: 19.3825442	total: 105ms	remaining: 42s
10:	learn: 19.0473156	total: 115ms	remaining: 41.8s
11:	learn: 18.7171173	total: 125ms	remaining: 41.5s
12:	learn: 18.3940519	total: 135ms	remaining: 41.4s
13:	learn: 18.0769125	total: 145ms	remaining: 41.3s
14:	learn: 17.7655293	total: 155ms	remaining: 41.3s
15:	learn: 17.4579301	total: 165ms	remaining: 41.1s
16:	learn: 17.1574066	total: 175ms	remaining: 41s
17:	learn: 16.8638451	total: 185ms	remaining: 40.9s
18:	learn: 16.5726006	total: 195ms	remaining: 40.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



Learning rate set to 0.014887
0:	learn: 20.8715971	total: 10.4ms	remaining: 41.6s
1:	learn: 20.5966369	total: 20ms	remaining: 40s
2:	learn: 20.3185328	total: 29.5ms	remaining: 39.3s
3:	learn: 20.0419976	total: 37.5ms	remaining: 37.5s
4:	learn: 19.7717051	total: 45.7ms	remaining: 36.5s
5:	learn: 19.5063680	total: 53.6ms	remaining: 35.7s
6:	learn: 19.2469746	total: 61.4ms	remaining: 35s
7:	learn: 18.9874464	total: 70ms	remaining: 34.9s
8:	learn: 18.7355413	total: 78.1ms	remaining: 34.6s
9:	learn: 18.4849371	total: 86.8ms	remaining: 34.7s
10:	learn: 18.2359253	total: 95.7ms	remaining: 34.7s
11:	learn: 17.9909166	total: 106ms	remaining: 35.1s
12:	learn: 17.7515226	total: 114ms	remaining: 34.9s
13:	learn: 17.5175791	total: 123ms	remaining: 35.2s
14:	learn: 17.2872440	total: 133ms	remaining: 35.4s
15:	learn: 17.0577177	total: 142ms	remaining: 35.3s
16:	learn: 16.8332545	total: 151ms	remaining: 35.5s
17:	learn: 16.6098233	total: 160ms	remaining: 35.5s
18:	learn: 16.3895030	total: 170ms	remain

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



Learning rate set to 0.014547
0:	learn: 19.9913736	total: 10.5ms	remaining: 42.2s
1:	learn: 19.7392516	total: 20.6ms	remaining: 41.3s
2:	learn: 19.4923169	total: 30.6ms	remaining: 40.8s
3:	learn: 19.2484843	total: 40.4ms	remaining: 40.3s
4:	learn: 19.0058080	total: 49.7ms	remaining: 39.7s
5:	learn: 18.7686494	total: 59.6ms	remaining: 39.7s
6:	learn: 18.5333536	total: 69.3ms	remaining: 39.5s
7:	learn: 18.3012959	total: 79.2ms	remaining: 39.5s
8:	learn: 18.0723866	total: 89ms	remaining: 39.5s
9:	learn: 17.8482540	total: 98.2ms	remaining: 39.2s
10:	learn: 17.6255113	total: 107ms	remaining: 38.8s
11:	learn: 17.4077869	total: 117ms	remaining: 38.8s
12:	learn: 17.2024457	total: 127ms	remaining: 38.8s
13:	learn: 16.9884920	total: 136ms	remaining: 38.8s
14:	learn: 16.7775696	total: 145ms	remaining: 38.4s
15:	learn: 16.5716728	total: 155ms	remaining: 38.6s
16:	learn: 16.3665031	total: 163ms	remaining: 38.2s
17:	learn: 16.1649798	total: 172ms	remaining: 38s
18:	learn: 15.9655954	total: 181ms	rem

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



Learning rate set to 0.013794
0:	learn: 6.3433403	total: 9.32ms	remaining: 37.3s
1:	learn: 6.2742058	total: 18.1ms	remaining: 36.2s
2:	learn: 6.2072505	total: 26.9ms	remaining: 35.9s
3:	learn: 6.1410406	total: 35.8ms	remaining: 35.8s
4:	learn: 6.0757586	total: 44.5ms	remaining: 35.5s
5:	learn: 6.0111278	total: 53.2ms	remaining: 35.4s
6:	learn: 5.9466791	total: 62ms	remaining: 35.3s
7:	learn: 5.8828758	total: 70.6ms	remaining: 35.2s
8:	learn: 5.8203889	total: 79.1ms	remaining: 35.1s
9:	learn: 5.7577386	total: 87.7ms	remaining: 35s
10:	learn: 5.6964797	total: 96.4ms	remaining: 35s
11:	learn: 5.6373539	total: 105ms	remaining: 34.9s
12:	learn: 5.5777230	total: 114ms	remaining: 34.9s
13:	learn: 5.5176724	total: 122ms	remaining: 34.8s
14:	learn: 5.4589333	total: 131ms	remaining: 34.8s
15:	learn: 5.4008318	total: 140ms	remaining: 34.8s
16:	learn: 5.3439992	total: 149ms	remaining: 34.8s
17:	learn: 5.2885423	total: 158ms	remaining: 34.9s
18:	learn: 5.2329756	total: 166ms	remaining: 34.8s
19:	le

In [18]:
print(scores)

[462.3104897700519, 458.93940915316733, 503.12773928773834, 434.62804135338365]


[0.3706618248854376, 0.4875355331238526, 0.6506005719809389, 0.07157820889607548]

In [19]:
df = pd.DataFrame()
check = set()
for i in rf_forecasts_dict_LR:
    frcst_for_well_l_r = pd.DataFrame()
    frcst_for_well_w_c = pd.DataFrame()
    
    frcst_for_well_l_r['MEASURED_IN_DATE'] = rf_forecasts_dict_LR[i].index
    frcst_for_well_w_c['MEASURED_IN_DATE'] = cb_forecasts_dict_WC[i].index
    
    frcst_for_well_l_r['WELL_NAME'] = i
    frcst_for_well_w_c['WELL_NAME'] = i
    
    frcst_for_well_w_c = frcst_for_well_w_c.set_index('MEASURED_IN_DATE')
    frcst_for_well_l_r = frcst_for_well_l_r.set_index('MEASURED_IN_DATE')
    
    frcst_for_well_l_r['LIQ_RATE'] = rf_forecasts_dict_LR[i]
    frcst_for_well_w_c['WATER_CUT'] = cb_forecasts_dict_WC[i]
    
    frcst_for_well = frcst_for_well_l_r.merge(frcst_for_well_w_c
                                              , on=['MEASURED_IN_DATE', 'WELL_NAME']
                                              , how='inner')
    if i not in check:
        check.add(i)
    else:
        print(i)
    df = pd.concat([df, frcst_for_well])
df = df.reset_index()
df.head()

Unnamed: 0,MEASURED_IN_DATE,WELL_NAME,LIQ_RATE,WATER_CUT
0,2022-01-20,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,434.912428,90.101609
1,2022-01-21,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,434.048899,91.401233
2,2022-01-22,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,433.362593,91.245552
3,2022-01-23,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,430.958546,91.237029
4,2022-01-24,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,428.949426,91.095173


In [20]:
pogodite_eto_realno = pd.concat([df, answer])
pogodite_eto_realno

Unnamed: 0,MEASURED_IN_DATE,WELL_NAME,LIQ_RATE,WATER_CUT
0,2022-01-20,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,434.912428,90.101609
1,2022-01-21,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,434.048899,91.401233
2,2022-01-22,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,433.362593,91.245552
3,2022-01-23,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,430.958546,91.237029
4,2022-01-24,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,428.949426,91.095173
...,...,...,...,...
117,2022-09-26,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,475.516724,41.833595
118,2022-09-27,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,475.007874,41.851025
119,2022-09-28,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,474.552460,41.868034
120,2022-09-29,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,474.178833,41.884644


In [23]:
sampl_sab = sampl_sab.drop(columns=['LIQ_RATE', 'WATER_CUT'])
sampl_sab['MEASURED_IN_DATE'] = pd.to_datetime(sampl_sab['MEASURED_IN_DATE'])

In [24]:
pogodite_eto_realno.merge(sampl_sab, on=['WELL_NAME', 'MEASURED_IN_DATE'], how='inner')

Unnamed: 0,MEASURED_IN_DATE,WELL_NAME,LIQ_RATE,WATER_CUT
0,2022-06-12,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,425.943736,90.907732
1,2022-06-13,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,425.943736,90.907732
2,2022-06-14,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,425.943736,90.907732
3,2022-06-15,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,425.943736,90.907732
4,2022-06-16,debcf7a160692239563af8a9c8ba32c4ba067e747f19c8...,425.943736,90.907732
...,...,...,...,...
20661,2022-09-24,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,476.409393,41.797318
20662,2022-09-25,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,476.016968,41.815704
20663,2022-09-26,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,475.516724,41.833595
20664,2022-09-27,fe15ccab850c10f95481f1309a2e9f88e1458be2a8a8f8...,475.007874,41.851025


In [25]:
pogodite_eto_realno.to_csv('blyat_gde_rijuliya_bez_tebya_ya_ne_mogu_jit.csv')