# Анализ данных, что наколдовал

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Настройка визуализации
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Загрузка данных
df = pd.read_csv('rations_with_acids.csv', encoding='utf-8-sig')
df

Unnamed: 0,ration_id,силос,сенаж,корнаж,кукуруза,солома,жом,комбикорм 10,комбикорм 11,рожь,...,Миристолеиновая,Пальмитиновая,Пальмитолеиновая,Стеариновая,Олеиновая,Линолевая,Линоленовая,Арахиновая,Бегеновая,Прочие
0,10,6.820,5.04,0.00,3.15,0.0,0.800,0,0.0,0.0,...,1.12,25.69,1.59,9.60,25.05,3.31,1.31,0.21,0.06,6.34
1,11,7.200,3.60,0.00,5.40,0.0,2.000,0,0.0,0.0,...,1.23,22.84,2.23,11.71,26.90,3.48,1.07,0.18,0.09,5.92
2,12,6.920,4.59,0.00,3.00,0.0,0.800,0,0.0,0.0,...,0.81,23.15,1.42,12.22,22.15,4.95,0.23,0.01,0.00,5.64
3,13,6.965,2.95,3.00,0.00,0.3,0.550,0,0.0,0.0,...,1.13,31.50,1.97,8.13,21.34,2.27,0.81,0.16,0.05,5.89
4,14,6.900,2.95,3.00,0.00,0.3,0.465,0,0.0,0.0,...,1.23,25.98,1.63,13.14,26.92,4.72,0.97,0.24,0.04,4.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,91,6.410,9.85,0.00,3.00,0.0,0.000,0,0.0,0.0,...,1.04,31.05,1.42,11.42,21.43,2.22,0.76,0.26,0.00,5.43
89,92,7.400,3.82,0.00,4.00,0.0,0.930,0,0.0,0.0,...,1.37,23.51,1.46,10.86,20.40,4.66,0.73,0.23,0.01,4.85
90,93,7.400,3.82,0.00,4.00,0.0,0.930,0,0.0,0.0,...,0.96,26.31,1.63,8.25,20.57,3.02,0.26,0.27,0.09,5.86
91,94,7.000,3.70,0.00,2.50,0.0,0.000,0,0.0,0.0,...,1.23,21.61,2.23,8.56,20.47,3.67,0.59,0.25,0.02,4.58


In [12]:
acid_columns = [
    'Масляная', 'Капроновая', 'Каприловая', 'Каприновая', 'Деценовая',
    'Лауриновая', 'Миристиновая', 'Миристолеиновая', 'Пальмитиновая', 
    'Пальмитолеиновая', 'Стеариновая', 'Олеиновая', 'Линолевая', 
    'Линоленовая', 'Арахиновая', 'Бегеновая', 'Прочие'
]

component_columns = [col for col in df.columns if col not in ['ration_id'] + acid_columns]

In [16]:
# Анализ компонентов рациона
print("🥛 АНАЛИЗ КОМПОНЕНТОВ РАЦИОНА:")
print("=" * 50)

# Статистика по компонентам
component_stats = df[component_columns].describe().T
component_stats['nonzero_count'] = (df[component_columns] > 0).sum()
component_stats['nonzero_percent'] = (component_stats['nonzero_count'] / len(df)) * 100

print("📊 ОСНОВНАЯ СТАТИСТИКА ПО КОМПОНЕНТАМ:")
print(component_stats[['count', 'mean', 'std', 'min', 'max', 'nonzero_percent']].head(10))

# Топ-10 самых частых компонентов (ненулевые значения)
print("\n🏆 ТОП-10 САМЫХ ЧАСТЫХ КОМПОНЕНТОВ:")
top_components = component_stats.sort_values('nonzero_percent', ascending=False).head(10)
print(top_components[['nonzero_count', 'nonzero_percent', 'mean', 'max']])

# Топ-10 самых редких компонентов
print("\n📉 ТОП-10 САМЫХ РЕДКИХ КОМПОНЕНТОВ:")
rare_components = component_stats[component_stats['nonzero_count'] > 0].sort_values('nonzero_percent').head(10)
print(rare_components[['nonzero_count', 'nonzero_percent', 'mean', 'max']])

🥛 АНАЛИЗ КОМПОНЕНТОВ РАЦИОНА:
📊 ОСНОВНАЯ СТАТИСТИКА ПО КОМПОНЕНТАМ:
              count      mean       std  min    max  nonzero_percent
силос          93.0  5.311355  3.091919  0.0  13.90        77.419355
сенаж          93.0  1.740215  1.833041  0.0   9.85        54.838710
корнаж         93.0  0.534516  0.956267  0.0   3.00        25.806452
кукуруза       93.0  4.776333  3.372829  0.0  12.85        88.172043
солома         93.0  0.224086  0.387882  0.0   1.50        31.182796
жом            93.0  0.888097  0.606843  0.0   3.90        78.494624
комбикорм 10   93.0  0.000000  0.000000  0.0   0.00         0.000000
комбикорм 11   93.0  0.041398  0.399226  0.0   3.85         1.075269
рожь           93.0  0.013226  0.127545  0.0   1.23         1.075269
пшеница        93.0  0.079462  0.305651  0.0   2.10         7.526882

🏆 ТОП-10 САМЫХ ЧАСТЫХ КОМПОНЕНТОВ:
                nonzero_count  nonzero_percent      mean     max
шрот соевый                86        92.473118  2.484882   3.564
кукуруз

In [20]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Разделяем на признаки (X) и цели (y)
X = df[component_columns].fillna(0)  # компоненты рациона
y = df[acid_columns]  # целевые кислоты

print(f"📊 Размеры: X {X.shape}, y {y.shape}")

📊 Размеры: X (93, 37), y (93, 17)


In [22]:
# Разделяем на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создаем и обучаем модель
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiRMSE',  # для многозадачной регрессии
    verbose=100,
    random_state=42
)

# Обучаем модель
model.fit(X_train, y_train, eval_set=(X_test, y_test))

# Предсказания
y_pred = model.predict(X_test)

# Оценка по каждой кислоте
print("📈 РЕЗУЛЬТАТЫ МОДЕЛИ:")
print("=" * 50)
for i, acid in enumerate(acid_columns):
    mae = mean_absolute_error(y_test.iloc[:, i], y_pred[:, i])
    r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
    print(f"{acid:15} | MAE: {mae:.3f} | R²: {r2:.3f}")

0:	learn: 4.7625709	test: 4.5452592	best: 4.5452592 (0)	total: 61.1ms	remaining: 1m 1s
100:	learn: 2.0890345	test: 4.9101139	best: 4.5082292 (16)	total: 144ms	remaining: 1.28s
200:	learn: 1.0586211	test: 5.1120510	best: 4.5082292 (16)	total: 214ms	remaining: 852ms
300:	learn: 0.5862619	test: 5.2333072	best: 4.5082292 (16)	total: 286ms	remaining: 664ms
400:	learn: 0.3348677	test: 5.2867145	best: 4.5082292 (16)	total: 359ms	remaining: 537ms
500:	learn: 0.1956671	test: 5.3072629	best: 4.5082292 (16)	total: 508ms	remaining: 506ms
600:	learn: 0.1190193	test: 5.3235693	best: 4.5082292 (16)	total: 674ms	remaining: 447ms
700:	learn: 0.0719737	test: 5.3300572	best: 4.5082292 (16)	total: 831ms	remaining: 354ms
800:	learn: 0.0432857	test: 5.3338760	best: 4.5082292 (16)	total: 990ms	remaining: 246ms
900:	learn: 0.0264256	test: 5.3364018	best: 4.5082292 (16)	total: 1.15s	remaining: 126ms
999:	learn: 0.0156371	test: 5.3378148	best: 4.5082292 (16)	total: 1.29s	remaining: 0us

bestTest = 4.508229177
b

In [24]:
# Важность признаков
feature_importance = model.get_feature_importance()
importance_df = pd.DataFrame({
    'component': component_columns,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\n🔝 ТОП-15 ВАЖНЕЙШИХ КОМПОНЕНТОВ:")
print(importance_df.head(15))


🔝 ТОП-15 ВАЖНЕЙШИХ КОМПОНЕНТОВ:
         component  importance
3         кукуруза    9.676241
11   шрот рапсовый    9.153547
21   жмых рапсовый    9.144911
12     шрот соевый    8.771425
0            силос    7.745497
1            сенаж    6.370407
5              жом    6.195026
4           солома    6.105890
27         люцерна    5.878530
22            соль    4.724066
19  жир защищенный    4.614336
20          патока    4.593687
2           корнаж    3.304344
23  премикс дойный    3.298228
17          ячмень    3.033817


In [26]:
# Сохраняем модель и список фич
model.save_model('catboost_acid_model.cbm')

# Сохраняем список компонентов (важно для порядка фич)
import pickle
with open('component_columns.pkl', 'wb') as f:
    pickle.dump(component_columns, f)

print("✅ Модель и метаданные сохранены")

✅ Модель и метаданные сохранены
