In [57]:
import numpy as np
import pandas as pd

In [58]:
# define files path
train_path = "./train.csv"
test_path = "./test.csv"

In [59]:
# read csv file
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [60]:
# review train file
print(f"head of train_df:")
display(train_df.head())

print(f"head of test_df:")
display(test_df.head())


print(f"shape of train_df:{train_df.shape}")
print(f"info of train_df:")
train_df.info()

head of train_df:


Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP


head of test_df:


Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14


shape of train_df:(750000, 10)
info of train_df:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


In [61]:
# remove id
train_df.drop(columns=['id'], inplace=True)
test_ids = test_df['id']
test_df.drop(columns=['id'], inplace=True)

print(f"shape of train: {train_df.shape}")
print(f"shape of test: {test_df.shape}")
print(f"columns names: {train_df.columns[:].tolist()}")

shape of train: (750000, 9)
shape of test: (250000, 8)
columns names: ['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name']


In [62]:
# 获取所有唯一的肥料名称
unique_fertilizers = train_df['Fertilizer Name'].unique()

print(f"共有 {len(unique_fertilizers)} 种肥料：")
print(unique_fertilizers)

共有 7 种肥料：
['28-28' '17-17-17' '10-26-26' 'DAP' '20-20' '14-35-14' 'Urea']


In [63]:
# target variable y
if 'Fertilizer Name' in train_df.columns:
    
    y = train_df['Fertilizer Name'].map({'28-28':0, '17-17-17':1, '10-26-26':2, 'DAP':3, '20-20':4, '14-35-14':5, 'Urea':6})
    train_df.drop(columns=['Fertilizer Name'], inplace=True)

In [64]:
# label encoding
from sklearn.preprocessing import LabelEncoder

cat_cols = train_df.select_dtypes(include=['object']).columns

for col in cat_cols:
    # fill N/A
    train_df[col] = train_df[col].astype(str).fillna('missing')
    test_df[col] = test_df[col].astype(str).fillna('missing')
    
    le = LabelEncoder()

    # get all labels
    full_data = pd.concat([train_df[col], test_df[col]], axis=0)
    le.fit(full_data)
    
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

print("Label Encoding finish")

Label Encoding finish


In [65]:
# split train and val
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_df, y, test_size=0.2, random_state=42, stratify=y
)

print(f"shape of X_train: {X_train.shape}")
print(f"shape of X_val: {X_val.shape}")

shape of X_train: (600000, 8)
shape of X_val: (150000, 8)


In [67]:
import optuna
import lightgbm as lgb
import numpy as np

# 1. 定义 MAP@3 计算函数
def mapk(actual, predicted_probs, k=3):
    """
    actual: 真实标签的数组 (n_samples,)
    predicted_probs: 模型输出的概率矩阵 (n_samples, n_classes)
    """
    scores = []
    # 获取概率最高的前 k 个索引
    top_k_indices = np.argsort(-predicted_probs, axis=1)[:, :k]
    
    for a, p in zip(actual, top_k_indices):
        score = 0.0
        for i, pred in enumerate(p):
            if pred == a:
                score = 1.0 / (i + 1)
                break
        scores.append(score)
    return np.mean(scores)

def objective_lgb(trial):
    # 2. 修改参数搜索空间为多分类
    param = {
        'objective': 'multiclass',
        'num_class': 7,  # 你的肥料种类总数
        'metric': 'multi_logloss',
        'verbosity': -1,
        'device': 'gpu', # 如果没有 GPU 请改为 cpu
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 64),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': 200,
        'random_state': 42
    }

    # 如果数据集很大，也可以只用一组简单的 train_test_split
    model = lgb.LGBMClassifier(**param)
    
    # 假设使用 X_train, y_train 进行简单的验证
    model.fit(X_train, y_train)
    
    # 4. 获取概率矩阵
    probs = model.predict_proba(X_val)
    
    # 5. 计算 MAP@3
    score = mapk(y_val, probs, k=3)
    
    return score

# 6. 开始优化
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=30)

print("最优参数: ", study_lgb.best_params)
print("最高 MAP@3: ", study_lgb.best_value)

[I 2026-01-04 13:34:37,312] A new study created in memory with name: no-name-9a3e0592-6fc8-4ab1-ba66-ab4bc7102f39
[I 2026-01-04 13:34:43,450] Trial 0 finished with value: 0.3126311111111111 and parameters: {'learning_rate': 0.01804786480710216, 'num_leaves': 25, 'max_depth': 7, 'min_child_samples': 54, 'subsample': 0.7939291707649618, 'colsample_bytree': 0.81749176443932}. Best is trial 0 with value: 0.3126311111111111.
[I 2026-01-04 13:34:50,781] Trial 1 finished with value: 0.31543666666666664 and parameters: {'learning_rate': 0.015133898829096066, 'num_leaves': 58, 'max_depth': 7, 'min_child_samples': 63, 'subsample': 0.7848484304439571, 'colsample_bytree': 0.726267706580765}. Best is trial 1 with value: 0.31543666666666664.
[I 2026-01-04 13:34:55,906] Trial 2 finished with value: 0.3017699999999999 and parameters: {'learning_rate': 0.012920549751380236, 'num_leaves': 24, 'max_depth': 4, 'min_child_samples': 40, 'subsample': 0.8457703432954939, 'colsample_bytree': 0.8571344502966183

最优参数:  {'learning_rate': 0.09829263879783082, 'num_leaves': 61, 'max_depth': 6, 'min_child_samples': 47, 'subsample': 0.7135136419518686, 'colsample_bytree': 0.6409854466084727}
最高 MAP@3:  0.33268888888888887


In [68]:
import json

# 1. 获取最优参数字典
best_lgb_params = study_lgb.best_params

# 2. 建议手动加入一些非搜索的固定参数，确保文件完整
best_lgb_params.update({
    'objective': 'multiclass',
    'num_class': 7,  # 你的肥料种类总数
    'metric': 'multi_logloss',
    'verbosity': -1
})

# 3. 写入文件
with open('best_lgb_params.json', 'w') as f:
    json.dump(best_lgb_params, f, indent=4)

print("LightGBM 参数已保存至 best_lgb_params.json")

LightGBM 参数已保存至 best_lgb_params.json


In [69]:
# --- A. 加载之前保存的最优参数 ---
with open('best_lgb_params.json', 'r') as f:
    final_params = json.load(f)

# --- B. 准备全量训练数据 ---
# 使用合并后的训练集和验证集进行最后一次训练，最大化利用数据
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

print("正在使用全量数据进行最终训练...")
final_model = lgb.LGBMClassifier(**final_params)
final_model.fit(X_full, y_full)

# --- C. 对测试集进行预测 ---
# 获取概率矩阵 [n_samples, 7]
probs = final_model.predict_proba(test_df)

# 获取概率最高的前 3 个索引
top3_idx = np.argsort(-probs, axis=1)[:, :3]

# --- D. 映射回肥料名称字符串 ---
# 创建索引到名称的逆向字典
# 注意：这里的映射顺序必须和你之前映射 y 时的字典完全一致
inv_map = {0:'28-28', 1:'17-17-17', 2:'10-26-26', 3:'DAP', 4:'20-20', 5:'14-35-14', 6:'Urea'}

final_preds = []
for row in top3_idx:
    # 将 3 个预测值拼成空格分隔的字符串
    labels = [inv_map[idx] for idx in row]
    final_preds.append(" ".join(labels))

# --- E. 生成提交文件 ---
submission = pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': final_preds
})

submission.to_csv('submission.csv', index=False)
print("恭喜！提交文件 submission.csv 已生成。")
display(submission.head())

正在使用全量数据进行最终训练...
恭喜！提交文件 submission.csv 已生成。


Unnamed: 0,id,Fertilizer Name
0,750000,28-28 DAP 20-20
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 14-35-14 10-26-26
3,750003,14-35-14 17-17-17 10-26-26
4,750004,20-20 10-26-26 17-17-17
