In [2]:
import copy

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from catboost import Pool, CatBoostRegressor

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Импортируем pytorch
import torch
import torch.nn as nn

In [4]:
# Read the results table and the judges table; in both files the
# 'Unnamed: 0' column is used as the DataFrame index.
df, df_judges = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in ("ffkkmo_clean.csv", "ffkkmo_judges_clean.csv")
)

In [5]:
# Competition categories in the exact order that is later passed to the
# OrdinalEncoder as the category ordering. Every rank contributes two entries,
# the boys'/young men's category first and the girls'/young women's second.
cat_ord = [
    f'{rank}, {group}'
    for rank, groups in (
        ('Юный фигурист', ('мальчики', 'девочки')),
        ('3-й юношеский разряд', ('мальчики', 'девочки')),
        ('2-й юношеский разряд', ('мальчики', 'девочки')),
        ('1-й юношеский разряд', ('мальчики', 'девочки')),
        ('3-й спортивный разряд', ('мальчики', 'девочки')),
        ('2-й спортивный разряд', ('мальчики', 'девочки')),
        ('1-й спортивный разряд', ('мальчики', 'девочки')),
        ('КМС', ('юноши', 'девушки')),
        ('МС', ('юноши', 'девушки')),
    )
    for group in groups
]

In [6]:
# Discard every row whose category is not listed in cat_ord, then renumber the
# rows; reset_index() moves the previous index into a regular column.
unranked_rows = df.query("category not in @cat_ord").index
df = df.drop(index=unranked_rows).reset_index()

In [7]:
# Print the column dtypes and non-null counts of the filtered results frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30517 entries, 0 to 30516
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       30517 non-null  int64  
 1   date        30517 non-null  object 
 2   place       30517 non-null  object 
 3   online      30517 non-null  object 
 4   category    30517 non-null  object 
 5   segment     30517 non-null  object 
 6   rank        30517 non-null  int64  
 7   firstname   30517 non-null  object 
 8   middlename  1296 non-null   object 
 9   lastname    30517 non-null  object 
 10  club        29124 non-null  object 
 11  tss         30517 non-null  float64
 12  tes         30517 non-null  float64
 13  pcs         30517 non-null  float64
 14  year        30517 non-null  int64  
 15  month       30517 non-null  int64  
 16  season      30517 non-null  int64  
dtypes: float64(3), int64(5), object(9)
memory usage: 4.0+ MB


In [8]:
# Build a single identifier per skater from the first and last name.
# NOTE(review): the two parts are concatenated without a separator — confirm
# that this is intended.
df['name'] = df['firstname'] + df['lastname']

In [10]:
def _first_official(role, alias):
    """Select df_judges rows whose ``function`` equals ``role``, aggregate them
    with ``groupby(['date', 'online']).first()`` and rename the ``name`` column
    to ``alias``.
    """
    officials = df_judges.loc[df_judges['function'] == role]
    per_event = officials.groupby(['date', 'online']).first()
    return per_event.rename(columns={"name": alias})


# Technical specialist ("ts") and technical controller ("tc") per (date, online).
df_spec = _first_official('Технический специалист', 'ts')
df_cont = _first_official('Технический контролер', 'tc')

In [11]:
# Pair the specialist and controller tables on (date, online), keep only the
# two name columns and attach them to every result row. Both merges use the
# default join type.
event_keys = ['date', 'online']
officials = df_spec.merge(df_cont, on=event_keys)[['ts', 'tc']].reset_index()
df = df.merge(officials, on=event_keys)

In [12]:
# Feature groups (one-hot encoded vs. ordinally encoded) and the target column.
ohe_columns = ['place', 'club', 'segment', 'name', 'ts', 'tc']
ord_columns = ['category', 'season', 'month']
target = ['tss']

# Rows dated 2025-06-14 form the test set; every other row is used for training.
is_holdout = df['date'] == '2025-06-14'
train = df[~is_holdout].reset_index(drop=True)
test = df[is_holdout].reset_index(drop=True)

feature_columns = ohe_columns + ord_columns
X_train, X_test = train[feature_columns], test[feature_columns]
y_train, y_test = train[target], test[target]

# Every imputer in this cell fills NaN with the most frequent value of the column.
most_frequent = dict(missing_values=np.nan, strategy='most_frequent')

# Explicit category order for each ordinal column, in ord_columns order.
ordinal_encoder = OrdinalEncoder(
    categories=[
        cat_ord,
        # season codes — presumably two-digit start/end years; TODO confirm
        [2021, 2122, 2223, 2324, 2425],
        # months listed starting from August
        [8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7],
    ],
    # unseen values are encoded as NaN and filled by the imputer that follows
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
ord_pipe = Pipeline([
    ('simpleImputer_before_ord', SimpleImputer(**most_frequent)),
    ('ord', ordinal_encoder),
    ('simpleImputer_after_ord', SimpleImputer(**most_frequent)),
])
ohe_pipe = Pipeline([
    ('simpleImputer_ohe', SimpleImputer(**most_frequent)),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore')),
])
data_preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', ohe_pipe, ohe_columns),
        ('ord', ord_pipe, ord_columns),
    ],
    remainder='passthrough',
)

# Fit the encoders on the training rows only and reuse them for the test rows.
X_train_preprocessed = data_preprocessor.fit_transform(X_train)
X_test_preprocessed = data_preprocessor.transform(X_test)



### DummyRegressor

In [13]:
# Baseline: a "dummy" model that always predicts the mean of the training target.
model_dr = DummyRegressor(strategy="mean").fit(X_train_preprocessed, y_train)
model_dr_pred = model_dr.predict(X_test_preprocessed)
# MAE on the held-out rows, passed in the conventional (y_true, y_pred) order.
dr_mae = mean_absolute_error(y_test, model_dr_pred)
print(f'Значение метрики самой простой модели "Заглушки" MAE = {dr_mae:0.2f}')

Значение метрики самой простой модели "Заглушки" MAE = 10.05


### LinearRegression

In [14]:
# Linear regression scored with 10-fold cross-validation. The parameter grid is
# empty, so the search evaluates a single candidate.
lr_grid = GridSearchCV(
    LinearRegression(),
    {},
    scoring='neg_mean_absolute_error',
    cv=10,
    verbose=3,
)
lr_grid.fit(X_train_preprocessed, y_train)

# The scorer is the negated MAE, so take the absolute value for reporting.
lr_cv_mae = abs(lr_grid.best_score_)
print(f"MAE score = {lr_cv_mae:0.2f}")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END ................................., score=-4.755 total time=   0.5s
[CV 2/10] END ................................., score=-4.210 total time=   0.5s
[CV 3/10] END ................................., score=-4.589 total time=   0.4s
[CV 4/10] END ................................., score=-4.286 total time=   0.4s
[CV 5/10] END ................................., score=-4.151 total time=   0.6s
[CV 6/10] END ................................., score=-4.147 total time=   0.4s
[CV 7/10] END ................................., score=-4.329 total time=   0.5s
[CV 8/10] END ................................., score=-3.993 total time=   0.4s
[CV 9/10] END ................................., score=-5.256 total time=   0.4s
[CV 10/10] END ................................, score=-4.307 total time=   0.5s
MAE score = 4.40


### CatBoostRegressor

In [None]:
# CatBoost trained on the MAE objective; the grid varies the number of
# iterations and the tree depth at a fixed learning rate.
cbr_params = dict(
    iterations=[1000, 2000],
    learning_rate=[0.01],
    depth=[6, 8],
)
cbr_base = CatBoostRegressor(
    loss_function='MAE',
    logging_level='Silent',
    random_state=42,
)
cbr_grid = GridSearchCV(
    cbr_base,
    cbr_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=3,
)
cbr_grid.fit(X_train_preprocessed, y_train)

# The scorer is the negated MAE, so take the absolute value for reporting.
cbr_cv_mae = abs(cbr_grid.best_score_)
print(f"MAE score = {cbr_cv_mae:0.2f}")

### RandomForestRegressor

In [None]:
# Random forest with the absolute-error split criterion; the grid varies only
# the maximum tree depth.
fr_params = dict(
    n_estimators=[10],
    max_depth=[8, 16],
)
fr_base = RandomForestRegressor(criterion='absolute_error', random_state=42)
fr_grid = GridSearchCV(
    fr_base,
    fr_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=3,
)

# Only the first 3000 training rows are used for this search
# (presumably to keep the run time manageable — TODO confirm).
n_rows = 3000
fr_grid.fit(X_train_preprocessed[:n_rows], y_train['tss'][:n_rows])

# The scorer is the negated MAE, so take the absolute value for reporting.
fr_cv_mae = abs(fr_grid.best_score_)
print(f"MAE score = {fr_cv_mae:0.2f}")

### NN

In [15]:
# Dense float32 tensors for the neural network.
#
# The feature matrices are sparse, so they are densified first; .toarray()
# yields a plain ndarray (``.todense()`` would return an ``np.matrix``).
#
# The targets come from a one-column DataFrame, i.e. have shape (N, 1). They are
# flattened to (N,) so that they line up element-wise with the flattened network
# predictions; subtracting an (N, 1) tensor from an (N,) tensor would broadcast
# to an (N, N) matrix and produce a wrong loss.
X_train_scaled_tensor = torch.tensor(X_train_preprocessed.toarray(), dtype=torch.float)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float).flatten()
X_valid_scaled_tensor = torch.tensor(X_test_preprocessed.toarray(), dtype=torch.float)
y_valid_tensor = torch.tensor(y_test.values, dtype=torch.float).flatten()

In [16]:
class Net(nn.Module):
    """Fully connected regression network with three hidden layers.

    Args:
        n_in_neurons: number of input features.
        n_hidden_neurons_1: width of the first hidden layer.
        n_hidden_neurons_2: width of the second hidden layer.
        n_hidden_neurons_3: width of the third hidden layer.
        n_out_neurons: number of outputs.

    ``bn2`` and ``dp2`` are constructed but are not applied in ``forward``.
    """

    def __init__(self, n_in_neurons, n_hidden_neurons_1, n_hidden_neurons_2, n_hidden_neurons_3, n_out_neurons):
        super().__init__()
        # Linear layers, created in input-to-output order.
        self.fc1 = nn.Linear(n_in_neurons, n_hidden_neurons_1)
        self.fc2 = nn.Linear(n_hidden_neurons_1, n_hidden_neurons_2)
        self.fc3 = nn.Linear(n_hidden_neurons_2, n_hidden_neurons_3)
        self.fc4 = nn.Linear(n_hidden_neurons_3, n_out_neurons)
        # Activations, one per linear layer; the output passes through ReLU.
        self.fa1 = nn.ELU()
        self.fa2 = nn.LeakyReLU()
        self.fa3 = nn.ELU()
        self.fa4 = nn.ReLU()
        # Regularisation layers.
        self.dp1 = nn.Dropout()
        self.bn1 = nn.BatchNorm1d(n_hidden_neurons_1)
        self.dp2 = nn.Dropout()
        self.bn2 = nn.BatchNorm1d(n_hidden_neurons_2)

    def forward(self, x):
        """Apply the layers in sequence and return the network output for ``x``."""
        stages = (
            self.fc1, self.bn1, self.fa1, self.dp1,
            self.fc2, self.fa2,
            self.fc3, self.fa3,
            self.fc4, self.fa4,
        )
        out = x
        for stage in stages:
            out = stage(out)
        return out

In [17]:
# Layer widths are derived from the number of input features.
n_in_neurons = X_train_scaled_tensor.shape[1]
n_hidden_neurons_1, n_hidden_neurons_2, n_hidden_neurons_3 = (
    n_in_neurons * 4,
    n_in_neurons * 2,
    n_in_neurons // 2,
)
n_out_neurons = 1

# Optimiser learning rate, number of training epochs, and how often
# (in epochs) the validation metric is evaluated.
lr = 0.001
num_epochs = 10000
per_epochs = 100

In [18]:
def MAELoss(yhat, y):
    """Mean absolute error between predictions and targets.

    Args:
        yhat: tensor of predictions.
        y: tensor of targets.

    Returns:
        A scalar tensor holding ``mean(|yhat - y|)``.

    When the two tensors hold the same number of elements but differ in shape
    (for example ``(N,)`` predictions against ``(N, 1)`` targets), the targets
    are reshaped to the predictions' shape first. Without this, the subtraction
    would broadcast to an ``(N, N)`` matrix and average over every
    prediction/target pair instead of matching pairs. Inputs with different
    element counts (such as a scalar target) still broadcast as usual.
    """
    if yhat.shape != y.shape and yhat.numel() == y.numel():
        y = y.reshape(yhat.shape)
    return torch.abs(yhat - y).mean()

In [None]:
# Train the network with full-batch Adam steps on the MAE loss. Every
# `per_epochs` epochs (and on the last epoch) the validation MAE is computed and
# a copy of the weights with the lowest validation MAE so far is kept in
# `best_params`.
net = Net(n_in_neurons, n_hidden_neurons_1, n_hidden_neurons_2, n_hidden_neurons_3, n_out_neurons)
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

loss = MAELoss
mae_min = float('inf')
best_params = None

# Flatten the targets so they line up element-wise with the flattened
# predictions; an (N, 1) target against (N,) predictions would broadcast to (N, N).
train_targets = y_train_tensor.flatten()
valid_targets = y_valid_tensor.flatten()

for epoch in range(num_epochs):
    net.train()
    optimizer.zero_grad()
    preds = net(X_train_scaled_tensor).flatten()
    loss_value = loss(preds, train_targets)
    loss_value.backward()
    optimizer.step()

    if epoch % per_epochs == 0 or epoch == num_epochs - 1:
        net.eval()
        # No gradients are needed for evaluation.
        with torch.no_grad():
            valid_preds = net(X_valid_scaled_tensor).flatten()
            # .item() stores the metric as a plain float rather than a tensor.
            mae = loss(valid_preds, valid_targets).item()
        if mae_min > mae:
            mae_min = mae
            # Deep copy: state_dict() returns references to the live parameters.
            best_params = copy.deepcopy(net.state_dict())
        print(f"Epoch {epoch}/{num_epochs}, MAE = {mae}, best MAE = {mae_min}")

In [None]:
# Use the best estimator found by the CatBoost grid search as the final model.
model = cbr_grid.best_estimator_

In [None]:
# Predict the target for the preprocessed test features.
y_pred = model.predict(X_test_preprocessed)

In [None]:
# Place the predictions next to the test rows as an extra column; the
# prediction column keeps the default label 0.
predictions = pd.DataFrame(y_pred)
result = pd.concat([test, predictions], axis='columns')

In [None]:
# One row per (category, segment, name), ordered by the prediction column
# (label 0) from highest to lowest.
per_skater = result.groupby(['category', 'segment', 'name']).first()
per_skater.sort_values(by=0, ascending=False)