In [134]:
import pandas as pd
import numpy as np
from sklearn import metrics
from tqdm import tqdm
from IPython.display import display
from datetime import datetime
import gc

# NOTE(review): CPython normally starts with the garbage collector enabled, so
# this call is presumably a no-op here — confirm it is needed.
gc.enable()
# Opt in to pandas' "future.no_silent_downcasting" behaviour for the whole
# notebook (presumably relevant to the fillna calls further down — confirm).
pd.set_option("future.no_silent_downcasting", True)

# Useful Constants

In [135]:
# Workbook with the predictions that are scored against the labeled data.
FILE_PREDICTIONS = "data/structured_messages-6.xlsx"
# Labeled workbooks; in the visible notebook they are referenced only by the
# commented-out merge cell.
FILES_GROUND_TRUTH = ("data/train_data.xlsx", "data/train_data2.xlsx", "data/70-109.xlsx")
# NOTE(review): the commented-out cells pass this path to to_parquet/read_parquet
# although the extension is .pkl — confirm the intended on-disk format.
OUTPUT_FILE = "labeled_data.pkl"

# Year used by the date parser for dates written without a year. It is evaluated
# once, when this cell runs. `current_year` is an alias that the visible code
# never reads.
CURRENT_YEAR = current_year = datetime.now().year
# Accepted date layouts, tried in order by the date parser. The first two
# entries carry no year component.
DATE_FORMATS = [
    "%d.%m", "%d/%m", "%d.%m.%Y", "%d/%m/%Y", "%d.%m.%y", "%d/%m/%y",
    "%Y-%m-%d"
]

# Column names assigned positionally to loaded frames (load_data and the
# predictions frame both do `columns = COLUMNS`).
COLUMNS = [
    "msg_id", "Дата", "Подразделение", "Операция", "Культура",
    "За день, га", "С начала операции, га", "Вал за день, ц",
    "Вал с начала, ц"
]
# Columns that are lower-cased and have 'ё' replaced by 'е' before comparison.
TEXT_COLS = ["Дата", "Подразделение", "Операция", "Культура"]
# Columns that are coerced to numeric dtype before comparison.
NUM_COLS = ["За день, га", "С начала операции, га", "Вал за день, ц", "Вал с начала, ц"]

# Functions

In [136]:
def load_data(path):
    """Read one Excel workbook and give it the notebook's standard header.

    Parameters
    ----------
    path : str or path-like
        Location of the workbook; it is handed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The sheet with the optional ``text`` / ``message`` columns removed and
        the remaining columns renamed positionally to ``COLUMNS``.

    Raises
    ------
    ValueError
        Raised by pandas when the number of remaining columns differs from
        ``len(COLUMNS)``.
    """
    # "Дата" is read as text so that the date strings reach the parser unchanged.
    frame = pd.read_excel(path, dtype={"Дата": str})
    for extra in ("text", "message"):
        if extra in frame.columns:
            frame = frame.drop(columns=extra)
    frame.columns = COLUMNS
    return frame

In [137]:
def parse_datе(date, year=None, formats=None):
    """Normalise a date string to ``DD.MM.YYYY``.

    Only the first whitespace-separated token of ``date`` is used (so a
    trailing time such as ``"2024-04-12 00:00:00"`` is ignored) and trailing
    dots are stripped. The token is then tried against each format in order.

    NOTE(review): the last letter of this function's name looks like a Cyrillic
    'е' rather than a Latin 'e'. The name is left untouched because the callers
    in this notebook use the same spelling — confirm before renaming.

    Parameters
    ----------
    date : str
        Raw date text.
    year : int, optional
        Year assumed for formats that have no year component. Defaults to the
        module-level ``CURRENT_YEAR``.
    formats : iterable of str, optional
        ``strptime`` formats to try, in order. Defaults to the module-level
        ``DATE_FORMATS``.

    Returns
    -------
    str or None
        The date formatted as ``"%d.%m.%Y"``, or ``None`` when the input is
        blank or matches none of the formats.
    """
    if year is None:
        year = CURRENT_YEAR
    if formats is None:
        formats = DATE_FORMATS

    # Split on any whitespace *before* picking the first token, so leading
    # blanks do not produce an empty token.
    tokens = date.split()
    if not tokens:
        return None
    text = tokens[0].rstrip('.')

    for fmt in formats:
        has_year = "%Y" in fmt or "%y" in fmt
        try:
            if has_year:
                parsed = datetime.strptime(text, fmt)
            else:
                # Parse together with the assumed year. Without a year,
                # strptime falls back to 1900, which rejects 29 February.
                parsed = datetime.strptime(f"{text} {year:04d}", f"{fmt} %Y")
        except ValueError:
            # This format does not match; try the next one.
            continue
        return parsed.strftime("%d.%m.%Y")
    return None

In [32]:
def calc_acc(y_true, y_pred):
    """Return the share of positions where the prediction matches the label.

    A position counts as a match when the two values compare equal, or when
    both are strings and the label starts with the predicted string (so a
    truncated prediction is accepted).

    Parameters
    ----------
    y_true, y_pred : sequences of equal length
        Expected and predicted values, compared pairwise.

    Returns
    -------
    float
        Number of matches divided by ``len(y_true)``.

    Raises
    ------
    AssertionError
        If the two sequences differ in length.
    ZeroDivisionError
        If both sequences are empty.
    """
    assert len(y_true) == len(y_pred), f"true={len(y_true)}, pred={len(y_pred)}"

    def _matches(expected, predicted):
        if expected == predicted:
            return True
        both_text = isinstance(expected, str) and isinstance(predicted, str)
        return both_text and expected.startswith(predicted)

    hits = sum(
        (1 for expected, predicted in zip(y_true, y_pred) if _matches(expected, predicted)),
        0.0,
    )
    return hits / len(y_true)

# Preparing a single file with labeled data

In [138]:
# dfs = list(map(load_data, FILES_GROUND_TRUTH))
# dfs[-1]["msg_id"] += 69
# pd.concat(dfs, axis=0).to_parquet(OUTPUT_FILE, index=False)

<font color="red" size=14>DONE</font>

---

# Loading and preparing data for accuracy estimation

In [308]:
# Operation names whose prediction rows are removed before scoring.
# Values are lower-case because the text columns are lower-cased beforehand.
DROP_OP = [
    "боронование довсходовое",
    "выкашивание отцовских форм подсолнечник",
    "тестовая операция",
    "средства защиты растений",
    "затравка мышевидных грызунов"
]

# Renames applied to the predicted operation column: key = name as it appears
# in the predictions, value = name it is replaced with before comparison.
# Names that are not keys are left unchanged.
MAP = {
    "внесение противозлакового гербицида": "гербицидная обработка",
    "посев": "сев",
    "вспашка": "пахота",
    "сплошная культивация": "культивация",
    "выравнивание": "выравнивание зяби",
    "химическая прополка": "гербицидная обработка"
}

In [309]:
# Load the labeled data (`train`) and the predictions (`pred`).
# train = pd.read_parquet(OUTPUT_FILE)
# NOTE(review): "1.xlsx" is a hard-coded path that is not listed among the
# constants above — confirm this is the intended labeled file.
train = pd.read_excel("1.xlsx")
# train["msg_id"] -= 1
pred = pd.read_excel(FILE_PREDICTIONS)

# Only the number of columns is checked here, not their names or order.
assert train.shape[1] == pred.shape[1]

# Only `pred` is renamed; `train` is presumably already stored with these
# column names — TODO confirm.
pred.columns = COLUMNS

In [310]:
# Sanity check: highest message id in the labels vs. in the predictions.
train.msg_id.max(), pred.msg_id.max()

(105, 105)

In [311]:
# Row/column counts of both frames before any cleaning.
train.shape, pred.shape

((355, 9), (386, 9))

In [312]:
# Message ids to score. They are captured here, before any rows are dropped
# from `pred` below, so an id may later have no prediction rows left.
MSG_IDS = pred["msg_id"].unique()

In [313]:
# Make text comparable in both frames: lower-case it and fold 'ё' into 'е'.
for frame in (train, pred):
    for txt_col in TEXT_COLS:
        frame[txt_col] = frame[txt_col].str.lower().str.replace('ё', 'е')

In [314]:
# Coerce the numeric columns of the labeled frame, mirroring the cell that
# does the same for `pred`. The original loop only skipped numeric columns and
# never converted the others, so it had no effect.
for num_col in NUM_COLS:
    dtype = train[num_col].dtype.name
    if ("int" in dtype) or ("float" in dtype):
        continue
    # Values that cannot be parsed become NaN (and are filled later).
    train[num_col] = pd.to_numeric(train[num_col], errors="coerce")

In [315]:
# Convert prediction columns that were not read as int/float into numbers.
# Values that cannot be parsed become NaN.
for num_col in NUM_COLS:
    kind = pred[num_col].dtype.name
    already_numeric = any(tag in kind for tag in ("int", "float"))
    if not already_numeric:
        pred[num_col] = pd.to_numeric(pred[num_col], errors="coerce")

In [316]:
# Normalise every non-missing date in the labels to DD.MM.YYYY.
# Mapping the column avoids using an enumerate() counter as a `.loc` label,
# which is only correct while the frame has a default 0..n-1 index.
# Missing values are passed through untouched.
train["Дата"] = train["Дата"].map(
    lambda val: val if pd.isna(val) else parse_datе(str(val))
)

In [317]:
# Normalise every non-missing date in the predictions to DD.MM.YYYY.
# Mapping the column avoids using an enumerate() counter as a `.loc` label,
# which is only correct while the frame has a default 0..n-1 index.
# Missing values are passed through untouched.
pred["Дата"] = pred["Дата"].map(
    lambda val: val if pd.isna(val) else parse_datе(str(val))
)

In [318]:
# Remove predictions for operations listed in DROP_OP and renumber the rows.
keep_rows = ~pred["Операция"].isin(DROP_OP)
pred = pred.loc[keep_rows].reset_index(drop=True)

In [319]:
# Rename predicted operations via MAP; names without an entry stay as they are.
pred["Операция"] = pred["Операция"].map(lambda op: MAP.get(op, op))

In [320]:
# Collapse every department name starting with "отд" into "аор".
# The isinstance guard keeps missing (NaN) or other non-string values as they
# are instead of raising AttributeError on `.startswith`.
pred["Подразделение"] = pred["Подразделение"].map(
    lambda unit: "аор" if isinstance(unit, str) and unit.startswith("отд") else unit
)

In [321]:
# Replace the crop value "озимые" with "озимые культуры"; everything else
# (including missing values) is left unchanged.
is_winter_crop = pred["Культура"] == "озимые"
pred["Культура"] = pred["Культура"].mask(is_winter_crop, "озимые культуры")

In [322]:
# Number of prediction rows with a missing operation.
pred["Операция"].isna().sum()

0

In [323]:
# Number of prediction rows with a missing crop.
pred["Культура"].isna().sum()

8

In [324]:
# Show the prediction rows with a missing crop (they are dropped in the next cell).
pred[pred["Культура"].isna()]

Unnamed: 0,msg_id,Дата,Подразделение,Операция,Культура,"За день, га","С начала операции, га","Вал за день, ц","Вал с начала, ц"
143,41,,аор,внесение минеральных удобрений,,80.0,314.0,,
164,45,,аор,выравнивание зяби,,56.0,56.0,,
214,60,,восход,предпосевная культивация,,177.0,396.0,,
260,75,,аор,подкормка,,241.0,241.0,,
261,75,,аор,подкормка,,159.0,321.0,,
303,87,,восход,предпосевная культивация,,180.0,1430.0,,
305,88,,восход,подкормка,,218.0,,,
306,88,,восход,культивация,,196.0,1626.0,,


In [325]:
# Discard prediction rows that have no crop; the index is deliberately not reset.
pred = pred.dropna(subset=["Культура"])

In [326]:
# Row/column counts after rows were dropped from `pred`.
train.shape, pred.shape

((355, 9), (361, 9))

In [327]:
# Replace every remaining missing value with the sentinel -1 in both frames,
# so that "missing in both" compares equal during scoring.
train, pred = (frame.fillna(-1) for frame in (train, pred))

In [328]:
# Total count of missing values left in each frame (expected to be zero after fillna).
train.isna().sum().sum(), pred.isna().sum().sum()

(0, 0)

In [329]:
# Operation names that occur in the predictions but never in the labels.
pred_ops = set(pred["Операция"].dropna().str.lower().unique())
train_ops = set(train["Операция"].unique())
pred_ops - train_ops

{'выравнивание многолетних трав', 'культ.'}

# Scoring

In [334]:
# Per-message accuracy.
#
# For every message id the labeled rows and the predicted rows are flattened
# (without the first column) and compared cell by cell with calc_acc. Rows are
# paired by position, so both frames are assumed to list a message's rows in
# the same order — TODO confirm.
#
# NOTE: messages whose row counts differ are only counted in `cnt` and printed.
# They are NOT part of the mean, so the reported accuracy covers only
# len(scores) messages.
scores = []
cnt = 0  # messages left unscored because the row counts differ
for msg_id in MSG_IDS:
    # fillna(-1) is defensive: it changes nothing if the frames were already
    # filled in an earlier cell.
    tmp_true = train[train["msg_id"].values == msg_id].fillna(-1).values[:, 1:]
    tmp_pred = pred[pred["msg_id"].values == msg_id].fillna(-1).values[:, 1:]
    t_size, p_size = tmp_true.shape[0], tmp_pred.shape[0]

    if t_size == 0 and p_size == 0:
        # MSG_IDS was collected before rows were dropped from `pred`, so a
        # message can end up with nothing on either side; there is nothing to
        # score (calc_acc would divide by zero).
        continue

    if t_size == p_size:
        scores.append(
            calc_acc(tmp_true.ravel().tolist(), tmp_pred.ravel().tolist())
        )
    elif t_size == 1 and p_size >= 1:
        # One labeled row but several predicted ones: score the first
        # prediction only. `p_size >= 1` keeps calc_acc from being called
        # with an empty prediction, which would fail its length assertion.
        scores.append(
            calc_acc(tmp_true.ravel().tolist(), tmp_pred[:1].ravel().tolist())
        )
    else:
        cnt += 1
        print(msg_id, t_size, p_size)

# (mean accuracy over scored messages, number scored, number skipped)
(sum(scores) / len(scores) if scores else float("nan")), len(scores), cnt

29 6 5
32 7 8
45 4 5
57 6 5
60 3 2
72 4 2
76 3 5
81 8 7
82 8 7
83 5 4
88 2 1
99 5 3


(0.9522108081583887, 93, 12)