In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# 1. Load the training data from the CSV file in the working directory.
df = pd.read_csv("train.csv")

# 2. Split the frame into the target column and the feature columns.
#    The "id" column is removed together with the target so it is not used as a feature.
y = df["Fertilizer Name"]
X = df.drop(columns=["id", "Fertilizer Name"])

# 3. Column groups that are routed to the different preprocessing steps.
# NOTE(review): "Temparature" is kept exactly as written in the original code;
# presumably it matches the CSV header -- confirm before "correcting" the spelling.
numerical_features = [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
]
categorical_features = ["Soil Type", "Crop Type"]

# 4. Preprocessing: one-hot encode the categorical columns and standardize the
#    numerical ones. The encoder is configured with handle_unknown="ignore".
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", categorical_encoder, categorical_features),
        ("scale", numeric_scaler, numerical_features),
    ]
)

# 5. Assemble the model pipeline: preprocessing followed by a random forest
#    (100 trees, fixed seed so repeated runs build the same forest).
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)


In [2]:
# 6. Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Fit the pipeline on the training portion, then
# 8. predict the labels of the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Accuracy of the predictions against the held-out labels.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)


In [3]:
# Look up the fitted one-hot encoder inside the pipeline's preprocessing step.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
ohe = fitted_preprocessor.named_transformers_["onehot"]

# Names of the columns the encoder generates for the categorical features.
ohe_features = ohe.get_feature_names_out(categorical_features)
print("OneHot Features:", ohe_features)

# Report the hold-out accuracy, followed by the per-class metrics.
print(f"\nAccuracy: {acc:.5f}")
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

OneHot Features: ['Soil Type_Black' 'Soil Type_Clayey' 'Soil Type_Loamy' 'Soil Type_Red'
 'Soil Type_Sandy' 'Crop Type_Barley' 'Crop Type_Cotton'
 'Crop Type_Ground Nuts' 'Crop Type_Maize' 'Crop Type_Millets'
 'Crop Type_Oil seeds' 'Crop Type_Paddy' 'Crop Type_Pulses'
 'Crop Type_Sugarcane' 'Crop Type_Tobacco' 'Crop Type_Wheat']

Accuracy: 0.16465

Classification Report:
              precision    recall  f1-score   support

    10-26-26       0.17      0.20      0.18     22841
    14-35-14       0.17      0.20      0.18     22639
    17-17-17       0.18      0.19      0.19     22764
       20-20       0.16      0.17      0.17     22010
       28-28       0.16      0.16      0.16     22384
         DAP       0.15      0.11      0.13     19148
        Urea       0.14      0.10      0.12     18214

    accuracy                           0.16    150000
   macro avg       0.16      0.16      0.16    150000
weighted avg       0.16      0.16      0.16    150000

