In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Load the training data from Train.csv into `df`.
df = pd.read_csv('Train.csv')

In [10]:
# Display the column labels of the training frame.
df.columns

Index(['site', 'PID', 'lon', 'lat', 'pH', 'alb', 'bio1', 'bio12', 'bio15',
       'bio7', 'bp', 'cec20', 'dows', 'ecec20', 'hp20', 'ls', 'lstd', 'lstn',
       'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'para', 'parv', 'ph20', 'slope',
       'snd20', 'soc20', 'tim', 'wp', 'xhp20', 'BulkDensity', 'N', 'P', 'K',
       'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B'],
      dtype='object')

In [11]:
# Count missing values per column once, then show only the columns that have any.
missing_per_column = df.isnull().sum()
missing_per_column[missing_per_column != 0]

ecec20         5
hp20           5
xhp20          5
BulkDensity    4
dtype: int64

In [12]:
# Replace missing values in every column that has any with that column's mean.
#
# The filled Series is assigned back to the frame on purpose:
# `df[column].fillna(..., inplace=True)` acts on the intermediate object returned
# by `df[column]`. pandas 2.x flags that pattern with a FutureWarning (hidden in
# this notebook by warnings.filterwarnings('ignore')), and with Copy-on-Write
# enabled it leaves `df` unchanged, so the gaps would silently survive.
#
# NOTE(review): the mean is taken over all rows of `df`, i.e. before the
# train/validation split performed later in the notebook.
for column in df.columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna(df[column].mean())

In [13]:
# Read the test table, the two gap tables and the sample submission,
# in that order, and bind each to its own name.
test_df, train_gap_df, test_gap_df, sample_submission = [
    pd.read_csv(csv_name)
    for csv_name in ('Test.csv', 'Gap_Train.csv', 'Gap_Test.csv', 'SampleSubmission.csv')
]

In [14]:
# Attach each test sample's BulkDensity to the gap table, matching on PID.
# A left join keeps every row of test_gap_df even when no PID match exists.
bulk_density_by_pid = test_df[['PID', 'BulkDensity']]
test_gap_df = test_gap_df.merge(bulk_density_by_pid, on='PID', how='left')

In [16]:
# Remove the site, lon and lat columns from both the training and the test frame.
columns_to_remove = ['site', 'lon', 'lat']
df = df.drop(columns=columns_to_remove)
test_df = test_df.drop(columns=columns_to_remove)

In [18]:
# Read the band tables for the training and test samples.
bands, bands_test = map(pd.read_csv, ('bands.csv', 'bands_test.csv'))

In [19]:
# Join the band columns onto the training and test frames by PID.
# No `how` is passed, so pandas performs its default inner join: rows whose PID
# is missing from the band table are dropped.
# NOTE(review): confirm every test PID appears in bands_test, otherwise those
# samples get no prediction further down.
df = df.merge(bands, on='PID')
test_df = test_df.merge(bands_test, on='PID')

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Names of the columns the model predicts; the order here fixes the order of
# the model's output columns.
target_columns = 'N P K Ca Mg S Fe Mn Zn Cu B'.split()

In [23]:
# Separate targets from predictors. PID is still part of X at this point;
# it is only removed from the test matrix here.
y = df[target_columns]
X = df.drop(columns=target_columns)
X_test = test_df.drop(columns='PID')
# NOTE(review): assumes X_test ends up with the same feature columns, in the
# same order, as X without PID - confirm against Test.csv / bands_test.csv.

In [24]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [25]:
# Strip the PID column from both feature splits before modelling.
X_train, X_val = (split.drop(columns=['PID']) for split in (X_train, X_val))

In [None]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

# MultiOutputRegressor fits one copy of the base LightGBM regressor per target column.
base_regressor = LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(base_regressor)
model.fit(X_train, y_train)


In [40]:
# Predict all targets for the held-out validation features.
y_pred = model.predict(X_val)

In [41]:
from sklearn.metrics import mean_squared_error, r2_score
# Validation scores for each target separately ('raw_values' returns one score per column).
mse_per_target = mean_squared_error(y_val, y_pred, multioutput='raw_values')
r2_per_target = r2_score(y_val, y_pred, multioutput='raw_values')
rmse_per_target = np.sqrt(mse_per_target)

# One report line per nutrient, in target_columns order.
for nutrient, rmse, r2 in zip(target_columns, rmse_per_target, r2_per_target):
    print(f"{nutrient}: RMSE = {rmse:.4f}, R² = {r2:.4f}")

N: RMSE = 506.1712, R² = 0.5472
P: RMSE = 43.8934, R² = 0.0235
K: RMSE = 204.1293, R² = 0.4082
Ca: RMSE = 1473.0224, R² = 0.8252
Mg: RMSE = 331.8990, R² = 0.6542
S: RMSE = 16.4349, R² = -0.0830
Fe: RMSE = 44.1104, R² = 0.5502
Mn: RMSE = 55.7614, R² = 0.3736
Zn: RMSE = 1.9865, R² = 0.0796
Cu: RMSE = 5.0314, R² = 0.0466
B: RMSE = 0.2287, R² = 0.5533


In [42]:
# Single validation RMSE: the per-target MSEs are averaged with equal weight
# (scikit-learn's default, spelled out here) and the square root is taken.
mse = mean_squared_error(y_val, y_pred, multioutput='uniform_average')
rmse = np.sqrt(mse)
print(f' RMSE: {rmse:.4f}')

 RMSE: 484.7784


In [43]:
# Predict all targets for the test features.
test_predictions = model.predict(X_test)

In [44]:
# Split the prediction matrix into one vector per nutrient: columns 0..10,
# matched to the names on the left in order.
(N_pred, P_pred, K_pred, Ca_pred, Mg_pred, S_pred,
 Fe_pred, Mn_pred, Zn_pred, Cu_pred, B_pred) = (
    test_predictions[:, column_idx] for column_idx in range(11)
)

In [45]:
# Wide submission table: PID first, then one predicted column per nutrient.
# Prediction column k of test_predictions belongs to target_columns[k].
submission = pd.DataFrame(test_predictions, columns=target_columns, index=test_df.index)
submission.insert(0, 'PID', test_df['PID'])
submission.head()

Unnamed: 0,PID,N,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B
0,ID_NGS9Bx,1755.931243,31.782846,222.851904,5558.990242,1539.317345,14.249245,126.482902,165.753261,1.766312,3.414415,0.217132
1,ID_YdVKXw,1440.065046,8.286716,192.817973,6088.16681,2297.784136,6.984707,123.744517,121.83407,1.690569,4.438316,0.205894
2,ID_MZAlfE,1735.795168,3.456613,219.256328,5171.623002,2130.187352,7.810359,133.611112,132.988251,1.352434,4.114458,0.166331
3,ID_GwCCMN,1671.733568,5.340949,171.394761,5362.992028,1805.530817,8.641349,136.686621,165.219855,1.992525,4.255531,0.164705
4,ID_K8sowf,1906.884229,2.29435,233.781605,4677.334071,1489.147795,8.206712,131.12946,136.535799,1.634566,3.314964,0.181952


In [46]:

# Reshape to long format (one row per PID/nutrient pair) and order the rows by PID.
submission_melted = (
    submission
    .melt(id_vars=['PID'], var_name='Nutrient', value_name='Available_Nutrients_in_ppm')
    .sort_values('PID')
)
submission_melted.head()

Unnamed: 0,PID,Nutrient,Available_Nutrients_in_ppm
19869,ID_002W8m,Zn,2.745487
15033,ID_002W8m,Fe,197.018715
2943,ID_002W8m,P,0.75895
24705,ID_002W8m,B,0.33757
525,ID_002W8m,N,1981.093891


In [47]:
# Bring the predicted ppm value onto each gap-table row, matching on PID and Nutrient.
# A left join keeps every gap-table row; rows without a match get NaN.
join_keys = ['PID', 'Nutrient']
nutrient_df = test_gap_df.merge(submission_melted, on=join_keys, how='left')

In [48]:
# Convert predicted ppm to the kg/ha column: ppm * soil_depth * BulkDensity * 0.1.
# The factors are applied in exactly this order so results match bit-for-bit.
# NOTE(review): presumably soil_depth is in cm and BulkDensity in g/cm^3 - confirm.
soil_depth = 20
predicted_ppm = nutrient_df['Available_Nutrients_in_ppm']
nutrient_df['Available_Nutrients_in_kg_ha'] = (
    predicted_ppm.mul(soil_depth).mul(nutrient_df['BulkDensity']).mul(0.1)
)

In [49]:
# Gap = required amount minus the amount predicted to be available (negative when available exceeds required).
available_kg_ha = nutrient_df["Available_Nutrients_in_kg_ha"]
nutrient_df["Gap"] = nutrient_df["Required"].sub(available_kg_ha)

In [50]:
# Build the row identifier "<PID>_<Nutrient>" and keep only the two output columns.
nutrient_suffix = "_" + nutrient_df['Nutrient']
nutrient_df['ID'] = nutrient_df['PID'] + nutrient_suffix
nutrient_df = nutrient_df.loc[:, ['ID', 'Gap']]
nutrient_df.head()

Unnamed: 0,ID,Gap
0,ID_NGS9Bx_N,-4114.234982
1,ID_NGS9Bx_P,-36.27883
2,ID_NGS9Bx_K,-482.844571
3,ID_NGS9Bx_Ca,-13329.576581
4,ID_NGS9Bx_Mg,-3686.361628


In [51]:
# Write the ID/Gap table to disk without the index column, then confirm on stdout.
output_path = 'submission.csv'
nutrient_df.to_csv(output_path, index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
