In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the training set and carve out a stratified hold-out split.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/fertilizer/data/train.csv")

# The target is the fertilizer label; the features are every other column
# except the row identifier.
y = df["Fertilizer Name"]
X = df.drop(columns=["id", "Fertilizer Name"])

# 80/20 split with a fixed seed so the partition is reproducible; `stratify=y`
# asks scikit-learn to keep the class proportions of `y` in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [2]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numeric ones. Categories unseen at fit time are encoded as all-zero rows
# (handle_unknown="ignore").
# NOTE(review): 'Temparature' is kept exactly as written — presumably it
# matches the (misspelled) CSV header; confirm before "fixing" it.
cat_feat = ['Soil Type','Crop Type']
num_feat = ['Temparature','Humidity','Moisture','Nitrogen','Potassium','Phosphorous']
preprocessing = ColumnTransformer([
    ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_feat),
    ("num", StandardScaler(), num_feat)
])

# Pipeline: preprocessing followed by a logistic-regression classifier.
# `multi_class="multinomial"` is intentionally not passed: the argument is
# deprecated in recent scikit-learn releases (it emits a FutureWarning and is
# scheduled for removal), and with the default solver the default setting
# already fits a multinomial model when the target has three or more classes
# (which the top-3 extraction below presupposes).
pipe = Pipeline([
    ("pre", preprocessing),
    ("clf", LogisticRegression(max_iter=500, random_state=42))
])
pipe.fit(X_train, y_train)
preds_val = pipe.predict_proba(X_val)
classes = pipe.named_steps["clf"].classes_  # class labels, in predict_proba column order

# Top-3 predictions for every validation row, most probable first:
# argsort is ascending, so take the last three columns and reverse them.
top3_idx = np.argsort(preds_val, axis=1)[:, -3:][:, ::-1]
preds_list = [list(classes[idxs]) for idxs in top3_idx]



In [3]:
def map3_score(true, preds, k=3):
    """Compute Mean Average Precision at k (MAP@k) for single-label targets.

    Args:
        true: Sized sequence of true labels, one per sample.
        preds: Sequence aligned with ``true``; each item is a ranked sequence
            of predicted labels, best guess first.
        k: Number of top-ranked predictions taken into account per sample.
            Predictions beyond position ``k`` are ignored.

    Returns:
        The mean over samples of ``1 / rank`` of the true label within the
        first ``k`` predictions (1st -> 1.0, 2nd -> 0.5, 3rd -> 0.333...),
        counting 0 when the label is absent. Returns 0.0 when ``true`` is
        empty.
    """
    n_samples = len(true)
    if n_samples == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    score = 0.0
    for t, p in zip(true, preds):
        # Only the first k predictions count; list() also lets array-like
        # rows (e.g. numpy arrays) be searched with .index().
        top = list(p)[:k]
        if t in top:
            score += 1.0 / (top.index(t) + 1)
    return score / n_samples

In [4]:
# Score the probability-ordered top-3 predictions against the hold-out labels.
baseline = map3_score(true=y_val.tolist(), preds=preds_list)
print(f'Baseline MAP@3 on hold-out: {baseline:.4f}')

Baseline MAP@3 on hold-out: 0.2872


In [5]:
# Random search over per-row permutations of the top-3 predictions.
# NOTE(review): candidates are ranked with the hold-out labels themselves, so
# the scores of the retained candidates are optimistic for this hold-out set.
top_k = []
n_iter = 5000
rng = np.random.default_rng(42)  # seeded so the search is reproducible across runs
y_val_list = y_val.tolist()      # loop-invariant: convert once, not on every iteration

for _ in range(n_iter):
    # Randomly reorder the three predictions of every row.
    cand = [list(rng.permutation(p)) for p in preds_list]
    sc = map3_score(y_val_list, cand)

    # Keep the candidate if fewer than three are stored or it beats the
    # current third-best score.
    if len(top_k) < 3 or sc > top_k[-1][0]:
        top_k.append((sc, cand))
        # Sort by descending score and keep only the best three.
        top_k = sorted(top_k, key=lambda x: x[0], reverse=True)[:3]

print("Top 3 MAP@3 trovati")
for rank, (score, _) in enumerate(top_k, 1):
    print(f"{rank}) MAP@3 = {score:.4f}")

Top 3 MAP@3 trovati
1) MAP@3 = 0.2854
2) MAP@3 = 0.2853
3) MAP@3 = 0.2852


In [7]:
# Save the three best candidate orderings as submission-style CSV files.
# The 'id' column was dropped from X when the features were built, so
# X_val.index holds DataFrame index labels, not ids. Look the real ids up in
# `df` through those labels so the 'id' column is correct even when the two
# differ.
val_ids = df.loc[X_val.index, "id"].to_numpy()

for rank, (score, best_preds) in enumerate(top_k, 1):
    out = pd.DataFrame({
        'id': val_ids,
        # The three ranked labels of a row are joined with single spaces.
        'Fertilizer Name': [' '.join(p) for p in best_preds]
    })
    out.to_csv(f'/content/drive/MyDrive/Colab Notebooks/fertilizer/data/submission/best_val_submission_rank{rank}.csv', index=False)
    print(f"Salvato best_val_submission_rank{rank}.csv con MAP@3 = {score:.4f}")

Salvato best_val_submission_rank1.csv con MAP@3 = 0.2854
Salvato best_val_submission_rank2.csv con MAP@3 = 0.2853
Salvato best_val_submission_rank3.csv con MAP@3 = 0.2852
