In [5]:
import pandas as pd


In [6]:

# 1. Label mapping: each fertilizer name is assigned the integer code given by
#    its position in the tuple below (0-based).
label_dict = {
    fertilizer: code
    for code, fertilizer in enumerate((
        '28-28',
        '17-17-17',
        '10-26-26',
        'DAP',
        '20-20',
        '14-35-14',
        'Urea',
    ))
}


In [7]:
# Load the training and test tables from CSV files in the current directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

In [8]:

# 2. Feature engineering: crossed feature, label encoding, one-hot (bool) encoding
def export_processed_data(train_df, test_df, label_map=None,
                          train_path='X_train.csv', test_path='X_test.csv'):
    """Encode the train/test frames consistently and write them to CSV.

    The label column is removed from ``train_df``, the remaining columns are
    stacked on top of ``test_df``, a crossed ``Soil_Crop`` feature is added,
    and the three categorical columns are one-hot encoded together so that
    both splits end up with the same dummy columns. The encoded training
    frame additionally receives the integer label in a ``class`` column.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'Fertilizer Name', 'Soil Type' and 'Crop Type'.
    test_df : pandas.DataFrame
        Must contain 'Soil Type' and 'Crop Type'.
    label_map : dict, optional
        Mapping from fertilizer name to integer code. Defaults to the
        notebook-level ``label_dict``.
    train_path, test_path : str or path-like, optional
        Destinations of the two CSV files.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
        Encoded splits; ``X_train`` includes the ``class`` column.

    Raises
    ------
    ValueError
        If 'Fertilizer Name' holds a value that ``label_map`` does not cover.
    """
    if label_map is None:
        # Fall back to the notebook-level mapping so two-argument calls keep working.
        label_map = label_dict

    # Remember where the training rows end so the stacked frame can be split again.
    train_len = len(train_df)

    # Series.map leaves NaN for names missing from the mapping; check for that
    # explicitly so the error names the offending values instead of surfacing
    # as a generic NaN-to-int cast failure.
    raw_labels = train_df['Fertilizer Name']
    y_train = raw_labels.map(label_map)
    unmapped = y_train.isna()
    if unmapped.any():
        unknown = sorted(map(repr, raw_labels[unmapped].unique()))
        raise ValueError(
            "'Fertilizer Name' contains values missing from the label mapping: "
            + ", ".join(unknown)
        )
    y_train = y_train.astype(int)

    # Stack both splits (without the label column) so they are encoded together.
    combined = pd.concat(
        [train_df.drop(columns=['Fertilizer Name']), test_df],
        axis=0,
    ).reset_index(drop=True)

    # Crossed feature: "<soil>_<crop>".
    combined['Soil_Crop'] = (
        combined['Soil Type'].astype(str) + "_" + combined['Crop Type'].astype(str)
    )

    # One-hot encode as bool columns (get_dummies drops the encoded source columns).
    cat_cols = ['Soil Type', 'Crop Type', 'Soil_Crop']
    combined_encoded = pd.get_dummies(combined, columns=cat_cols, dtype=bool)

    # Split back into the original train / test row ranges.
    X_train = combined_encoded.iloc[:train_len].copy()
    X_test = combined_encoded.iloc[train_len:].copy()

    # Attach labels positionally (.values) so index alignment cannot interfere.
    X_train['class'] = y_train.values

    # 3. Write both splits to CSV without the index column.
    X_train.to_csv(train_path, index=False)
    X_test.to_csv(test_path, index=False)

    print(f"文件已成功输出：{train_path}, {test_path}")
    print(f"训练集形状: {X_train.shape}, 测试集形状: {X_test.shape}")

    return X_train, X_test

# Run the preprocessing on the loaded frames; this also writes X_train.csv and X_test.csv.
X_train, X_test = export_processed_data(train, test)

文件已成功输出：X_train.csv, X_test.csv
训练集形状: (750000, 79), 测试集形状: (250000, 78)


In [9]:
# Preview the first rows of the encoded training frame (label is in the last column, `class`).
display(X_train.head())

Unnamed: 0,id,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous,Soil Type_Black,Soil Type_Clayey,Soil Type_Loamy,...,Soil_Crop_Sandy_Ground Nuts,Soil_Crop_Sandy_Maize,Soil_Crop_Sandy_Millets,Soil_Crop_Sandy_Oil seeds,Soil_Crop_Sandy_Paddy,Soil_Crop_Sandy_Pulses,Soil_Crop_Sandy_Sugarcane,Soil_Crop_Sandy_Tobacco,Soil_Crop_Sandy_Wheat,class
0,0,37,70,36,36,4,5,False,True,False,...,False,False,False,False,False,False,False,False,False,0
1,1,27,69,65,30,6,18,False,False,False,...,False,False,True,False,False,False,False,False,False,0
2,2,29,63,32,24,12,16,False,False,False,...,False,False,True,False,False,False,False,False,False,1
3,3,35,62,54,39,12,4,False,False,False,...,False,False,False,False,False,False,False,False,False,2
4,4,35,58,43,37,2,16,False,False,False,...,False,False,False,False,False,False,False,False,False,3


In [10]:
# Print the column list with non-null counts and dtypes of the encoded training frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   id                            750000 non-null  int64
 1   Temparature                   750000 non-null  int64
 2   Humidity                      750000 non-null  int64
 3   Moisture                      750000 non-null  int64
 4   Nitrogen                      750000 non-null  int64
 5   Potassium                     750000 non-null  int64
 6   Phosphorous                   750000 non-null  int64
 7   Soil Type_Black               750000 non-null  bool 
 8   Soil Type_Clayey              750000 non-null  bool 
 9   Soil Type_Loamy               750000 non-null  bool 
 10  Soil Type_Red                 750000 non-null  bool 
 11  Soil Type_Sandy               750000 non-null  bool 
 12  Crop Type_Barley              750000 non-null  bool 
 13  Crop Type_Cott