# Steps

## 1. [Import the Libraries](#1.-Import-the-libraries)
## 2. [Load the data](#2.-Load-the-Data)
## 3. [Preprocessing](#3.-Preprocessing-the-Data)
- 3.1 [OneHotEncoding](#3.1-Data-tranformation)
- 3.2 [Standard Scaling](#3.2-Standard-Scaling)
## 4. [Data preparation](#4.-Data-Preparation)
## 5. [Model Evaluator](#5.-Mean-Average-Precision)
## 6. [Model training](#6.-Model-Training)
- 6.1 [MAP@3 Score](#6.1-Model-Score)
## 7. [Feature Engineering](#7.-Adding-New-Features)
- 7.1 [Numeric Features](#7.1-Numerical-Columns)
## [Submission](#Submit-the-test-data-prediction)

## 1. Import the libraries

[🔝 Return to top](#Steps)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder # Used to label Categorical Target Variable
from sklearn.preprocessing import OneHotEncoder # Used for unordered categorical features
from sklearn.preprocessing import OrdinalEncoder # Used for ordered categorical features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score

########  hyperparameter Tuning  #################
#import optuna

import warnings
warnings.filterwarnings('ignore')

## 2. Load the Data

[🔝 Return to top](#Steps)

In [2]:
# Read both CSV files and use their 'id' column as the row index.
train_data = pd.read_csv('train.csv').set_index('id')
test_data = pd.read_csv('test.csv').set_index('id')

# Keep untouched snapshots of the raw frames; later cells modify
# train_data / test_data.
train_copy = train_data.copy()
test_copy = test_data.copy()

## 3. Preprocessing the Data

[🔝 Return to top](#Steps)

In [3]:
# Split the training columns by dtype in a single pass.
#   numerical_columns   -> every column whose dtype is not object
#   categorical_columns -> object columns, excluding the target 'Fertilizer Name'
# Per the original notes, this yields:
#   numerical:   ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
#   categorical: ['Soil Type', 'Crop Type']   (all object columns also include 'Fertilizer Name')
numerical_columns = []
categorical_columns = []
for name, dtype in train_data.dtypes.items():
    if dtype != 'O':
        numerical_columns.append(name)
    if dtype == 'object' and name != 'Fertilizer Name':
        categorical_columns.append(name)

### 3.1 Data tranformation

[🔝 Return to top](#Steps)

In [4]:
# Encode the target column as integer class ids. The fitted `labeler` is kept
# so predicted class ids can be mapped back to fertilizer names later.
labeler = LabelEncoder()
labeler.fit(train_data['Fertilizer Name'])
train_data['Fertilizer Name'] = labeler.transform(train_data['Fertilizer Name'])

In [5]:
# Default one-hot encoder for the unordered categorical columns.
cat_encoder = OneHotEncoder()

# One-hot encode the categorical columns (dense output, unseen categories are
# ignored at transform time) and pass every other column through unchanged.
# NOTE(review): neither object is referenced by the cells visible here, which
# use pd.get_dummies instead.
one_hot_step = (
    'categorical',
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
    categorical_columns,
)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)

In [6]:
# One-hot encode each categorical column and append the indicator columns to
# both frames (the raw categorical columns are still present after this cell).
for col in categorical_columns:
    train_dummy = pd.get_dummies(train_data[col], dtype=int)

    # Encode the test column, then force it onto exactly the indicator columns
    # seen in training. Encoding the two frames independently would otherwise
    # give the test matrix different features whenever a category is present
    # in only one of them: categories missing from test become all-zero
    # columns, categories unseen in training are dropped.
    test_dummy = pd.get_dummies(test_data[col], dtype=int).reindex(
        columns=train_dummy.columns, fill_value=0
    )

    train_data = pd.concat([train_data, train_dummy], axis=1)
    test_data = pd.concat([test_data, test_dummy], axis=1)

In [7]:
# Remove the raw categorical columns from both frames.
train_data = train_data.drop(columns=categorical_columns)
test_data = test_data.drop(columns=categorical_columns)

In [8]:
# Sanity check: display the (rows, columns) shape of the test frame.
test_data.shape

(250000, 22)

### 3.2 Standard Scaling

[🔝 Return to top](#Steps)

## 4. Data Preparation

[🔝 Return to top](#Steps)

In [13]:
# Separate the encoded target from the feature matrix.
y = train_data['Fertilizer Name']
X = train_data.drop(columns=['Fertilizer Name'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 5. Mean Average Precision

[🔝 Return to top](#Steps)

In [10]:
############################### MAP@k ####################################
def mapk(y_test, y_pred_proba, k=3):
    """Return the mean average precision at ``k`` for single-label targets.

    Each row has exactly one true class, so its average precision is
    ``1 / rank`` when the true class is among the ``k`` most probable
    classes (rank counted from 1) and ``0`` otherwise.

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        True class ids. They are compared against column positions of
        ``y_pred_proba``, so class ``c`` must correspond to column ``c``.
        Any array-like works (Series, ndarray, list); values are read
        positionally.
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Predicted class probabilities (or any per-class scores).
    k : int, default 3
        Number of top-ranked predictions considered per row.

    Returns
    -------
    float
        Mean of the per-row average precision; ``0.0`` for empty input.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or the two inputs have different lengths.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Column vector so it broadcasts against the (n_samples, k) ranking below.
    true_labels = np.asarray(y_test).reshape(-1, 1)
    proba = np.asarray(y_pred_proba)

    n_samples = true_labels.shape[0]
    if proba.shape[0] != n_samples:
        raise ValueError(
            "y_test and y_pred_proba must contain the same number of rows"
        )
    if n_samples == 0:
        return 0.0

    # Ascending argsort reversed -> class ids ordered from most to least likely.
    top_k = np.argsort(proba, axis=1)[:, ::-1][:, :k]

    # At most one position per row matches the true label; weight it by 1/rank.
    hits = top_k == true_labels
    ranks = np.arange(1, top_k.shape[1] + 1)

    return float((hits / ranks).sum() / n_samples)

## Hyperparameter Tuning

In [None]:
def objective_xgb_multiclass(trial):
    """
    Objective function for Optuna to optimize XGBoost hyperparameters for multi-class classification.

    Reads the module-level feature matrix ``X`` and encoded target ``y``,
    fits an ``XGBClassifier`` configured from the trial's suggestions on each
    of 5 stratified, shuffled folds and scores every fold by accuracy.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float`` (an Optuna trial).

    Returns
    -------
    float
        Mean validation accuracy over the folds (higher is better).
    """
    # 1. Define the hyperparameter search space
    params = {
        'objective': 'multi:softprob',  # For multi-class classification (returns probabilities)
        'eval_metric': 'mlogloss',      # Metric reported on the eval_set
        'num_class': len(np.unique(y)), # Number of distinct target classes
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'use_label_encoder': False,      # Suppress the deprecation warning
        'random_state': 42               # For reproducibility
    }

    # 2. Evaluate with stratified cross-validation; the fixed seed gives every
    #    trial the same folds, so trial scores are comparable.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 3. Instantiate the XGBClassifier with the suggested parameters
    model = XGBClassifier(**params)

    fold_accuracies = []
    for train_idx, val_idx in cv.split(X, y):
        X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Early stopping is not enabled; the eval_set is still supplied so the
        # validation mlogloss is tracked during fitting.
        model.fit(X_fit, y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=False)

        # Only hard class predictions are needed for accuracy, so no separate
        # probability prediction pass is run.
        y_pred_class = model.predict(X_val)
        fold_accuracies.append(accuracy_score(y_val, y_pred_class))

    # Return the average score across folds
    return np.mean(fold_accuracies)

In [11]:
# Fixed XGBoost hyperparameters used for the hold-out evaluation.
params = dict(
    n_estimators=800,
    learning_rate=0.03217095903889588,
    max_depth=8,
    subsample=0.8208104960302076,
    colsample_bytree=0.6588310736593839,
    gamma=0.0032831657530829235,
    reg_lambda=8.963908695312456,
    reg_alpha=0.04783339091017353,
)

model = XGBClassifier(**params)

In [14]:
# Train on the training split and score the held-out split with MAP@3.
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)

holdout_map3 = mapk(y_test, y_pred_proba)
print(holdout_map3)

0.3416844444445301


## 6. Model Training 

[🔝 Return to top](#Steps)

##### output

## 7. Adding New Features

[🔝 Return to top](#Steps)

### 7.1 Numerical Columns

## Submit the test data prediction

[🔝 Return to top](#Steps)

In [15]:
# Same fixed hyperparameters as the hold-out run, repeated so this cell can
# be executed on its own.
params = dict(
    n_estimators=800,
    learning_rate=0.03217095903889588,
    max_depth=8,
    subsample=0.8208104960302076,
    colsample_bytree=0.6588310736593839,
    gamma=0.0032831657530829235,
    reg_lambda=8.963908695312456,
    reg_alpha=0.04783339091017353,
)

# Refit on the full training data before predicting the test set.
model = XGBClassifier(**params)
model.fit(X, y)

probs = model.predict_proba(test_data)

# Rank class ids from most to least probable and keep the best three per row.
ranked_indices = np.argsort(probs, axis=1)[:, ::-1]
top3_indices = ranked_indices[:, :3]

# Decode the class ids back to fertilizer names, preserving the (rows, 3) layout.
flat_labels = labeler.inverse_transform(top3_indices.ravel())
top3_labels = flat_labels.reshape(top3_indices.shape)

# One space-separated string of three names per test row.
pred_top3 = list(map(' '.join, top3_labels))

In [16]:
# Assemble the submission: one row per test id with its three predicted names.
submission_columns = {
    "id": test_data.index,
    "Fertilizer Name": pred_top3,
}
submission = pd.DataFrame(submission_columns)

# Write without the positional index so the file holds only the two columns.
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 28-28 10-26-26
1,750001,17-17-17 20-20 Urea
2,750002,20-20 28-28 10-26-26
3,750003,14-35-14 17-17-17 Urea
4,750004,20-20 10-26-26 28-28
