In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import optuna
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [6]:
train_df = pd.read_csv('/kaggle/input/amini-soil-data/Train (2).csv')
test_df = pd.read_csv('/kaggle/input/amini-soil-data/Test (1).csv')
train_gap_df = pd.read_csv('/kaggle/input/amini-soil-data/Gap_Train.csv')
test_gap_df = pd.read_csv('/kaggle/input/amini-soil-data/Gap_Test.csv')
sample_submission = pd.read_csv('/kaggle/input/amini-soil-data/SampleSubmission (5).csv')

In [3]:
train_df.tail()

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B
7739,site_id_qnYCGg,ID_SL3ETR,70.240079,39.738497,5.4,140,143,1453,66,141,...,2.0,522,1878,305,11.0,124.0,185.0,4.6,2.0,0.39
7740,site_id_qnYCGg,ID_m3eydu,70.241244,39.735097,5.5,141,143,1453,66,141,...,2.0,264,1317,211,12.0,122.0,217.0,5.3,2.5,0.2
7741,site_id_qnYCGg,ID_GGQ39P,70.229183,39.734177,5.5,145,145,1406,65,140,...,3.0,352,1671,228,10.0,142.0,209.0,9.9,2.9,0.3
7742,site_id_qnYCGg,ID_PS0d4J,70.243131,39.756517,5.1,140,142,1420,66,141,...,3.0,366,1507,239,13.0,142.0,264.0,10.0,2.2,0.33
7743,site_id_jfGDPa,ID_IGMXgo,70.238442,39.733527,5.4,145,144,1466,66,140,...,2.0,449,1510,182,14.0,112.0,264.0,4.7,2.3,0.17


In [4]:
test_df.head()

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,para,parv,ph20,slope,snd20,soc20,tim,wp,xhp20,BulkDensity
0,site_id_hgJpkz,ID_NGS9Bx,69.170794,44.522885,6.86,144,256,910,108,186,...,37.940418,467.619293,6.825,1.056416,25.5,15.25,8.732471,0.016981,0.005831,1.2
1,site_id_olmuI5,ID_YdVKXw,68.885265,44.741057,7.08,129,260,851,110,187,...,35.961353,542.590149,6.725,0.730379,18.75,14.0,10.565657,0.02103,0.005134,1.24
2,site_id_PTZdJz,ID_MZAlfE,68.97021,44.675777,6.5,142,259,901,109,187,...,38.983898,416.385437,6.825,1.146542,21.0,14.0,9.590125,0.018507,0.00448,1.23
3,site_id_DOTgr8,ID_GwCCMN,69.068751,44.647707,6.82,142,261,847,109,187,...,39.948471,374.971008,6.725,0.56721,23.25,12.25,9.669279,0.021688,0.006803,1.22
4,site_id_1rQNvy,ID_K8sowf,68.990002,44.577607,6.52,145,253,1109,110,186,...,33.658615,361.233643,6.2,1.169207,26.25,18.25,7.89592,0.023016,0.000874,1.23


In [7]:
train_gap_df.tail()

Unnamed: 0,Nutrient,Required,Available,Gap,PID
85179,Fe,0.8,248.64,-247.84,ID_IGMXgo
85180,Mn,0.4,586.08,-585.68,ID_IGMXgo
85181,Zn,0.4,10.434,-10.034,ID_IGMXgo
85182,Cu,0.2,5.106,-4.906,ID_IGMXgo
85183,B,0.08,0.3774,-0.2974,ID_IGMXgo


In [7]:
test_gap_df.head()

Unnamed: 0,Nutrient,Required,PID
0,N,100.0,ID_NGS9Bx
1,P,40.0,ID_NGS9Bx
2,K,52.0,ID_NGS9Bx
3,Ca,12.0,ID_NGS9Bx
4,Mg,8.0,ID_NGS9Bx


In [8]:
test_gap_df = pd.merge(test_gap_df, test_df[['PID', 'BulkDensity']], on='PID', how='left')

In [9]:
test_gap_df.head()

Unnamed: 0,Nutrient,Required,PID,BulkDensity
0,N,100.0,ID_NGS9Bx,1.2
1,P,40.0,ID_NGS9Bx,1.2
2,K,52.0,ID_NGS9Bx,1.2
3,Ca,12.0,ID_NGS9Bx,1.2
4,Mg,8.0,ID_NGS9Bx,1.2


In [7]:
sample_submission.head()

Unnamed: 0,ID,Gap
0,ID_002W8m_B,0
1,ID_002W8m_Ca,0
2,ID_002W8m_Cu,0
3,ID_002W8m_Fe,0
4,ID_002W8m_K,0


In [10]:

# Fill missing values with the mean for columns with missing values in train_df
for column in train_df.columns:
  if train_df[column].isnull().any():
    train_df[column].fillna(train_df[column].mean(), inplace=True)

# Fill missing values with the mean for columns with missing values in test_df
for column in test_df.columns:
  if test_df[column].isnull().any():
    test_df[column].fillna(test_df[column].mean(), inplace=True)


In [18]:
landsat_8 = pd.read_csv('/kaggle/input/amini-soil-data/LANDSAT8_data_updated.csv')

In [19]:
landsat_8.columns

Index(['QA_PIXEL', 'QA_RADSAT', 'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5',
       'SR_B6', 'SR_B7', 'ST_B10', 'date', 'lat', 'lon', 'PID'],
      dtype='object')

In [24]:
numeric_cols = landsat_8.select_dtypes(include='number').columns

landsat_agg = landsat_8.groupby('PID')[numeric_cols].mean().reset_index()


In [28]:
def compute_landsat_indices(data_frame):
    df = data_frame.copy()

    # Convert bands to float
    bands = ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B6"]
    for b in bands:
        df[b] = df[b].astype(float)

    # NDVI
    df['NDVI'] = (df['SR_B4'] - df['SR_B3']) / (df['SR_B4'] + df['SR_B3'])

    # NDWI McFeeters
    df['NDWI_mcf'] = (df['SR_B2'] - df['SR_B4']) / (df['SR_B2'] + df['SR_B4'])

    # NDWI Gao
    df['NDWI_gao'] = (df['SR_B4'] - df['SR_B5']) / (df['SR_B4'] + df['SR_B5'])

    # NDRE (approximate with SWIR1)
    df['NDRE_approx'] = (df['SR_B4'] - df['SR_B5']) / (df['SR_B4'] + df['SR_B5'])

    # EVI
    df["EVI"] = 2.5 * (df["SR_B4"] - df["SR_B3"]) / (
        df["SR_B4"] + 6 * df["SR_B3"] - 7.5 * df["SR_B1"] + 1
    )

    # SAVI
    L = 0.5
    df["SAVI"] = ((df["SR_B4"] - df["SR_B3"]) * (1 + L)) / (
        df["SR_B4"] + df["SR_B3"] + L
    )

    # NBR
    df["NBR"] = (df["SR_B4"] - df["SR_B6"]) / (df["SR_B4"] + df["SR_B6"])

    return df


In [29]:
landsat_indices = compute_landsat_indices(landsat_agg)

In [31]:
landsat_indices.columns

Index(['PID', 'QA_PIXEL', 'QA_RADSAT', 'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4',
       'SR_B5', 'SR_B6', 'SR_B7', 'ST_B10', 'lat', 'lon', 'NDVI', 'NDWI_mcf',
       'NDWI_gao', 'NDRE_approx', 'EVI', 'SAVI', 'NBR'],
      dtype='object')

In [33]:
indices.head()

NameError: name 'indices' is not defined

In [35]:

train_merged = pd.merge(train_df ,landsat_indices , on = 'PID' ,how = 'left')

In [38]:
test_merged = pd.merge(test_df , landsat_indices , on = 'PID' , how = 'left')


In [37]:
print(f"initial train len is {len(train_df)} merged len is {len(train_merged)}" )

initial train len is 7744 merged len is7744


In [40]:
print(f"initial test len is {len(test_df)} merged len is {len(test_merged)}" )

initial test len is 2418 merged len is 2418


In [41]:
train_merged.columns

Index(['site', 'PID', 'lon_x', 'lat_x', 'pH', 'alb', 'bio1', 'bio12', 'bio15',
       'bio7', 'bp', 'cec20', 'dows', 'ecec20', 'hp20', 'ls', 'lstd', 'lstn',
       'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'para', 'parv', 'ph20', 'slope',
       'snd20', 'soc20', 'tim', 'wp', 'xhp20', 'BulkDensity', 'N', 'P', 'K',
       'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B', 'QA_PIXEL', 'QA_RADSAT',
       'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B6', 'SR_B7', 'ST_B10',
       'lat_y', 'lon_y', 'NDVI', 'NDWI_mcf', 'NDWI_gao', 'NDRE_approx', 'EVI',
       'SAVI', 'NBR'],
      dtype='object')

In [42]:
train_merged.isnull().sum()

site           0
PID            0
lon_x          0
lat_x          0
pH             0
              ..
NDWI_gao       3
NDRE_approx    3
EVI            3
SAVI           3
NBR            3
Length: 63, dtype: int64

In [46]:
# Fill numeric columns with mean, in place
numeric_cols = train_merged.select_dtypes(include='number').columns
train_merged[numeric_cols] = train_merged[numeric_cols].fillna(train_merged[numeric_cols].mean())

# Fill non-numeric columns with mode, in place
non_numeric_cols = train_merged.select_dtypes(exclude='number').columns

for col in non_numeric_cols:
    if train_merged[col].isnull().any():
        mode = train_merged[col].mode()
        if not mode.empty:
            train_merged[col].fillna(train_merged.mean(), inplace=True)


In [47]:
train_merged.isnull().sum()

site           0
PID            0
lon_x          0
lat_x          0
pH             0
              ..
NDWI_gao       0
NDRE_approx    0
EVI            0
SAVI           0
NBR            0
Length: 63, dtype: int64

In [48]:
target_columns = ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B']

In [23]:
test_merged = pd.merge(test_df , indices ,on = 'PID', how = 'left')

In [None]:
test_merged.isnull().sum()

In [49]:
train_df.tail()

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B
7739,site_id_qnYCGg,ID_SL3ETR,70.240079,39.738497,5.4,140,143,1453,66,141,...,2.0,522,1878,305,11.0,124.0,185.0,4.6,2.0,0.39
7740,site_id_qnYCGg,ID_m3eydu,70.241244,39.735097,5.5,141,143,1453,66,141,...,2.0,264,1317,211,12.0,122.0,217.0,5.3,2.5,0.2
7741,site_id_qnYCGg,ID_GGQ39P,70.229183,39.734177,5.5,145,145,1406,65,140,...,3.0,352,1671,228,10.0,142.0,209.0,9.9,2.9,0.3
7742,site_id_qnYCGg,ID_PS0d4J,70.243131,39.756517,5.1,140,142,1420,66,141,...,3.0,366,1507,239,13.0,142.0,264.0,10.0,2.2,0.33
7743,site_id_jfGDPa,ID_IGMXgo,70.238442,39.733527,5.4,145,144,1466,66,140,...,2.0,449,1510,182,14.0,112.0,264.0,4.7,2.3,0.17


In [49]:
test_merged.columns

Index(['site', 'PID', 'lon_x', 'lat_x', 'pH', 'alb', 'bio1', 'bio12', 'bio15',
       'bio7', 'bp', 'cec20', 'dows', 'ecec20', 'hp20', 'ls', 'lstd', 'lstn',
       'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'para', 'parv', 'ph20', 'slope',
       'snd20', 'soc20', 'tim', 'wp', 'xhp20', 'BulkDensity', 'QA_PIXEL',
       'QA_RADSAT', 'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B6',
       'SR_B7', 'ST_B10', 'lat_y', 'lon_y', 'NDVI', 'NDWI_mcf', 'NDWI_gao',
       'NDRE_approx', 'EVI', 'SAVI', 'NBR'],
      dtype='object')

In [50]:
# Feature selection
X = train_merged.drop(columns=target_columns)
y = train_merged[target_columns]
X_test = test_merged.drop(columns=['PID',"site",'QA_PIXEL',
       'QA_RADSAT', 'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B6',
       'SR_B7', 'ST_B10', 'lat_y', 'lon_y','lon_x', 'lat_x'])

In [52]:
X = X.drop(columns = ['site', 'PID', 'lon_x', 'lat_x','QA_PIXEL',
       'QA_RADSAT', 'SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B6',
       'SR_B7', 'ST_B10', 'lat_y', 'lon_y'])

In [56]:
len(X.columns)

36

In [55]:
len(X_test.columns)

36

In [57]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_train = X_train.drop(columns=['PID','site'])
X_val = X_val.drop(columns=['PID','site'])

In [58]:
X_train

Unnamed: 0,pH,alb,bio1,bio12,bio15,bio7,bp,cec20,dows,ecec20,...,wp,xhp20,BulkDensity,NDVI,NDWI_mcf,NDWI_gao,NDRE_approx,EVI,SAVI,NBR
221,7.57,127,267,987,114,197,0.060588,61.00,21.049513,70.014099,...,0.017628,0.000201,1.29,0.041422,-0.150438,-0.225305,-0.225305,0.026407,0.024545,-0.164073
267,8.07,148,255,849,104,193,0.108005,40.25,8.161971,36.000000,...,0.023924,0.000000,1.39,0.067632,-0.211506,-0.220585,-0.220585,0.039338,0.039422,-0.224795
107,7.47,178,236,1101,124,187,0.177600,23.00,11.162961,19.145000,...,0.021552,0.007574,1.49,0.087079,-0.247291,-0.224688,-0.224688,0.052692,0.052809,-0.246486
6206,6.70,140,177,972,81,154,0.717659,52.00,5.942144,39.000980,...,0.015061,0.000025,1.37,0.057918,-0.165057,-0.207050,-0.207050,0.044755,0.039309,-0.142711
1226,5.42,120,165,1264,107,163,0.235929,54.25,17.337679,34.749001,...,0.015792,0.021555,1.24,0.033305,-0.144956,-0.203396,-0.203396,0.023249,0.020828,-0.077231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,6.15,142,203,806,97,157,0.086938,37.25,2.692435,27.076900,...,0.016263,0.002840,1.28,0.058899,-0.178675,-0.253031,-0.253031,0.044417,0.040027,-0.239786
5390,6.30,160,135,976,91,158,0.363874,43.75,14.019321,28.107000,...,0.015221,0.003807,1.26,0.049906,-0.152149,-0.235103,-0.235103,0.040410,0.034566,-0.162316
860,5.77,143,177,1675,108,174,0.487071,41.00,9.659302,40.331001,...,0.015110,0.008207,1.27,0.074395,-0.206085,-0.197446,-0.197446,0.052551,0.048170,-0.128022
7603,6.70,151,185,1375,84,139,0.078164,34.50,0.249988,22.590000,...,0.017956,0.026118,1.20,0.011372,-0.127139,-0.397704,-0.397704,0.006504,0.006066,-0.302242


In [59]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)

X_train_scaled_df = pd.DataFrame(X_train_scaled , columns = X_train.columns ,index = X_train.index)
X_val_scaled_df = pd.DataFrame(X_val_scaled , columns = X_val.columns ,index = X_val.index)

In [None]:
X_train_scaled_df

In [35]:
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train_scaled_df, y_train)

In [54]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled_df = pd.DataFrame(X_test_scaled ,columns = X_test.columns , index = X_test.index)

In [59]:
# Predict on validation set
predictions = model.predict(X_test_scaled_df)
y_pred = model.predict(X_val)

In [60]:
# Evaluate model
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'MAE: {mae:.4f}, RMSE: {rmse:.4f}')

MAE: 1209.0272, RMSE: 3224.8147


In [57]:
test_predictions = model.predict(X_test_scaled)

In [58]:
test_predictions

array([[2.0497e+03, 1.2238e+01, 1.9178e+02, ..., 1.7910e+00, 4.5418e+00,
        2.3650e-01],
       [1.6761e+03, 4.6209e+00, 1.4185e+02, ..., 1.6434e+00, 4.2973e+00,
        2.0520e-01],
       [2.0947e+03, 2.9839e+00, 1.6684e+02, ..., 1.5384e+00, 3.7332e+00,
        2.2530e-01],
       ...,
       [2.2246e+03, 5.0050e+00, 3.1491e+02, ..., 5.9715e+00, 1.0173e+00,
        3.4740e-01],
       [2.4741e+03, 6.1150e+00, 2.7551e+02, ..., 5.5017e+00, 1.1335e+00,
        3.5640e-01],
       [2.0658e+03, 2.0736e+01, 5.0899e+02, ..., 5.5265e+00, 2.0773e+00,
        5.6240e-01]])

In [61]:
# Split the predictions into separate columns
N_pred =  test_predictions[:, 0]  # Predictions for N
P_pred =  test_predictions[:, 1]  # Predictions for P
K_pred =  test_predictions[:, 2]  # Predictions for K
Ca_pred = test_predictions[:, 3]  # Predictions for Ca
Mg_pred = test_predictions[:, 4]  # Predictions for Mg
S_pred =  test_predictions[:, 5]  # Predictions for S
Fe_pred = test_predictions[:, 6]  # Predictions for Fe
Mn_pred = test_predictions[:, 7]  # Predictions for Mn
Zn_pred = test_predictions[:, 8]  # Predictions for Zn
Cu_pred = test_predictions[:, 9]  # Predictions for Cu
B_pred =  test_predictions[:, 10]  # Predictions for B


In [62]:
submission = pd.DataFrame({'PID': test_merged['PID'], 'N': N_pred, 'P': P_pred, 'K': K_pred, 'Ca': Ca_pred, 'Mg': Mg_pred, 'S': S_pred, 'Fe': Fe_pred, 'Mn': Mn_pred, 'Zn': Zn_pred, 'Cu': Cu_pred, 'B': B_pred})
submission.head()

Unnamed: 0,PID,N,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B
0,ID_NGS9Bx,2049.7,12.238,191.78,5143.12,1709.41,9.4797,133.807,145.42,1.791,4.5418,0.2365
1,ID_YdVKXw,1676.1,4.6209,141.85,6898.86,2330.68,9.6006,108.169,145.63,1.6434,4.2973,0.2052
2,ID_MZAlfE,2094.7,2.9839,166.84,5911.91,1785.81,9.2944,121.737,149.73,1.5384,3.7332,0.2253
3,ID_GwCCMN,2124.2,4.6806,194.26,4939.75,1876.53,9.5115,132.227,147.21,1.6929,4.2722,0.2092
4,ID_K8sowf,2081.9,9.8319,206.57,4621.62,1395.8,9.1215,132.17,149.38,1.9455,4.5767,0.1713


In [63]:
submission_melted = submission.melt(id_vars=['PID'], var_name='Nutrient', value_name='Available_Nutrients_in_ppm')
submission_melted = submission_melted.sort_values('PID')
submission_melted.head()

Unnamed: 0,PID,Nutrient,Available_Nutrients_in_ppm
1025,ID_002W8m,N,2120.9
19275,ID_002W8m,S,11.9684
15625,ID_002W8m,Mg,558.99
30225,ID_002W8m,Zn,3.0481
30226,ID_002W8m,Zn,3.0542


In [64]:
# prompt: merge test_gap_df with submission_melted on PID and Nutrient
nutrient_df = pd.merge(test_gap_df, submission_melted, on=['PID', 'Nutrient'], how='left')


In [65]:
soil_depth = 20  # cm

# Calculate the Available_Nutrients_in_kg_ha
nutrient_df['Available_Nutrients_in_kg_ha'] = (nutrient_df['Available_Nutrients_in_ppm']
                                               * soil_depth * nutrient_df['BulkDensity'] * 0.1)

In [66]:
nutrient_df.head()

Unnamed: 0,Nutrient,Required,PID,BulkDensity,Available_Nutrients_in_ppm,Available_Nutrients_in_kg_ha
0,N,100.0,ID_NGS9Bx,1.2,2049.7,4919.28
1,P,40.0,ID_NGS9Bx,1.2,12.238,29.3712
2,K,52.0,ID_NGS9Bx,1.2,191.78,460.272
3,Ca,12.0,ID_NGS9Bx,1.2,5143.12,12343.488
4,Mg,8.0,ID_NGS9Bx,1.2,1709.41,4102.584


In [67]:
nutrient_df["Gap"] = nutrient_df["Required"] - nutrient_df["Available_Nutrients_in_kg_ha"]

In [68]:
nutrient_df['ID'] = nutrient_df['PID'] + "_" + nutrient_df['Nutrient']
nutrient_df = nutrient_df[['ID', 'Gap']]
nutrient_df.head()

Unnamed: 0,ID,Gap
0,ID_NGS9Bx_N,-4819.28
1,ID_NGS9Bx_P,10.6288
2,ID_NGS9Bx_K,-408.272
3,ID_NGS9Bx_Ca,-12331.488
4,ID_NGS9Bx_Mg,-4094.584


In [69]:
nutrient_df.tail()

Unnamed: 0,ID,Gap
40145,ID_oMn2Yb_Fe,-413.90464
40146,ID_oMn2Yb_Mn,-413.7848
40147,ID_oMn2Yb_Zn,-12.20042
40148,ID_oMn2Yb_Cu,-4.536244
40149,ID_oMn2Yb_B,-1.202272


In [70]:
nutrient_df.to_csv('mysubmissionx69.csv', index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv


In [29]:
from xgboost import XGBRegressor

In [30]:
import optuna
from sklearn.model_selection import cross_val_score

In [31]:
# #bayesian optimization of XGBoost
# def objective(trial):
#     params = {
#         "n_estimators" : trial.suggest_int("n_estimators", 100 , 1000),
#         "max_depth" : trial.suggest_int("max_depth" , 3 , 25),
#         "learning_rate" : trial.suggest_float("learning_rate" , 0.01 , 0.4, log = True),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "gamma": trial.suggest_float("gamma", 0, 10),
#         "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
#         "reg_lambda": trial.suggest_float("reg_lambda", 0, 10)
#     }

#     xgb_reg = XGBRegressor(objective = "reg:squarederror",random_state = 42 , **params)
#     #wrapping up with the multioutput regressor
#     model  = MultiOutputRegressor(xgb_reg)
#     #performing cross-validation
#     scores = cross_val_score(model , X_train , y_train , cv = 6 ,scoring="neg_root_mean_squared_error")

#     # Return the mean RMSE (negative because Optuna minimizes)
#     return -np.mean(scores)
    
    


In [32]:
# #creating the study
# xgb_study = optuna.create_study(direction = "minimize")
# xgb_study.optimize(objective , n_trials = 75 , show_progress_bar = True)


In [1]:
sentinel_2 = pd.read_csv('/kaggle/input/amini-soil-data/Sentinel2_data.csv')

In [2]:
sentinel_2.head()

Unnamed: 0,B1,B11,B12,B2,B3,B4,B5,B6,B7,B8,...,B9,CLOUDY_PIXEL_PERCENTAGE,MEAN_SOLAR_ZENITH_ANGLE,NODATA_PIXEL_PERCENTAGE,SENSING_ORBIT_NUMBER,SPACECRAFT_NAME,date,lat,lon,PID
0,709,3916,3442,1184,1736,2274,2551,2630,2835,2846,...,2804,0.000693,42.97793,0.0,135.0,Sentinel-2B,2019-12-18,46.289808,70.668003,ID_ZaKEFI
1,709,3769,3304,1116,1722,2236,2570,2590,2744,2802,...,2804,0.000693,42.97793,0.0,135.0,Sentinel-2B,2019-12-18,46.289808,70.668096,ID_ZaKEFI
2,568,3877,3684,789,1274,2208,2565,2708,2880,2822,...,3011,0.000693,42.97793,0.0,135.0,Sentinel-2B,2019-12-18,46.326085,70.661299,ID_xFLwnc
3,553,3796,3435,823,1286,2130,2366,2534,2721,2798,...,2945,0.000693,42.97793,0.0,135.0,Sentinel-2B,2019-12-18,46.326085,70.661391,ID_xFLwnc
4,547,3416,2763,728,1168,1526,1728,1866,2034,2130,...,2394,0.000693,42.97793,0.0,135.0,Sentinel-2B,2019-12-18,46.255129,70.741908,ID_9uA1am


In [3]:
sentinel_2.columns

Index(['B1', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A',
       'B9', 'CLOUDY_PIXEL_PERCENTAGE', 'MEAN_SOLAR_ZENITH_ANGLE',
       'NODATA_PIXEL_PERCENTAGE', 'SENSING_ORBIT_NUMBER', 'SPACECRAFT_NAME',
       'date', 'lat', 'lon', 'PID'],
      dtype='object')

In [8]:
compute_indices(sentinel_2)

Unnamed: 0,B1,B11,B12,B2,B3,B4,B5,B6,B7,B8,...,SENSING_ORBIT_NUMBER,SPACECRAFT_NAME,date,lat,lon,PID,NDVI,NDRE_b5,NDWI_mcf,NDWI_gao
0,709,3916.0,3442,1184,1736.0,2274.0,2551.0,2630,2835,2846.0,...,135.0,Sentinel-2B,2019-12-18,46.289808,70.668003,ID_ZaKEFI,0.111719,0.054660,-0.242252,-0.158237
1,709,3769.0,3304,1116,1722.0,2236.0,2570.0,2590,2744,2802.0,...,135.0,Sentinel-2B,2019-12-18,46.289808,70.668096,ID_ZaKEFI,0.112346,0.043187,-0.238727,-0.147162
2,568,3877.0,3684,789,1274.0,2208.0,2565.0,2708,2880,2822.0,...,135.0,Sentinel-2B,2019-12-18,46.326085,70.661299,ID_xFLwnc,0.122068,0.047707,-0.377930,-0.157486
3,553,3796.0,3435,823,1286.0,2130.0,2366.0,2534,2721,2798.0,...,135.0,Sentinel-2B,2019-12-18,46.326085,70.661391,ID_xFLwnc,0.135552,0.083656,-0.370225,-0.151350
4,547,3416.0,2763,728,1168.0,1526.0,1728.0,1866,2034,2130.0,...,135.0,Sentinel-2B,2019-12-18,46.255129,70.741908,ID_9uA1am,0.165208,0.104199,-0.291692,-0.231879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11763,381,2934.0,1961,481,802.0,835.0,1543.0,2341,2565,2524.0,...,135.0,Sentinel-2B,2019-12-18,38.170237,68.626339,ID_ugetZ9,0.502828,0.241210,-0.517739,-0.075119
11764,366,2389.0,1509,386,627.0,592.0,1178.0,2011,2293,2272.0,...,135.0,Sentinel-2B,2019-12-18,38.170237,68.626429,ID_ugetZ9,0.586592,0.317101,-0.567437,-0.025102
11765,232,1368.0,791,348,583.0,542.0,770.0,1998,2367,2524.0,...,135.0,Sentinel-2B,2019-12-18,38.138668,68.646008,ID_WWd142,0.646445,0.532483,-0.624718,0.297020
11766,232,1340.0,703,243,422.0,295.0,778.0,1970,2550,2578.0,...,135.0,Sentinel-2B,2019-12-18,38.138667,68.646098,ID_WWd142,0.794640,0.536353,-0.718667,0.315978
