In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('Train.csv')

In [10]:
df.columns

Index(['site', 'PID', 'lon', 'lat', 'pH', 'alb', 'bio1', 'bio12', 'bio15',
       'bio7', 'bp', 'cec20', 'dows', 'ecec20', 'hp20', 'ls', 'lstd', 'lstn',
       'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'para', 'parv', 'ph20', 'slope',
       'snd20', 'soc20', 'tim', 'wp', 'xhp20', 'BulkDensity', 'N', 'P', 'K',
       'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B'],
      dtype='object')

In [3]:
df.isnull().sum()[df.isnull().sum()!=0]

ecec20         5
hp20           5
xhp20          5
BulkDensity    4
dtype: int64

In [4]:
for column in df.columns:
  if df[column].isnull().any():
    df[column].fillna(df[column].mean(), inplace=True)
    

In [5]:
test_df = pd.read_csv('Test.csv')
train_gap_df = pd.read_csv('Gap_Train.csv')
test_gap_df = pd.read_csv('Gap_Test.csv')
sample_submission = pd.read_csv('SampleSubmission.csv')

In [6]:
test_gap_df = pd.merge(test_gap_df, test_df[['PID', 'BulkDensity']], on='PID', how='left')

In [7]:
df.drop(columns=['site','lon','lat'],inplace=True)
test_df.drop(columns=['site','lon','lat'],inplace=True)

In [98]:
bands=pd.read_csv('bands.csv')
bands_test=pd.read_csv('bands_test.csv')

In [99]:
df = pd.merge(df, bands, on='PID')
test_df=pd.merge(test_df,bands_test,on='PID')

In [105]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [106]:
target_columns = ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B']

In [107]:
# Feature selection
X = df.drop(columns=target_columns)
y = df[target_columns]
X_test = test_df.drop(columns=['PID'])

In [108]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
X_train = X_train.drop(columns=['PID'])
X_val = X_val.drop(columns=['PID'])

Index(['pH', 'alb', 'bio1', 'bio12', 'bio15', 'bio7', 'bp', 'cec20', 'dows',
       'ecec20',
       ...
       'b07_2018-05', 'b07_2018-07', 'b07_2018-09', 'b07_2018-11',
       'b07_2019-01', 'b07_2019-03', 'b07_2019-05', 'b07_2019-07',
       'b07_2019-09', 'b07_2019-11'],
      dtype='object', length=155)

In [141]:
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.multioutput import MultiOutputRegressor

model = MultiOutputRegressor(
    cuRF(
        n_estimators=150,
        random_state=42
    )
)
model.fit(X_train, y_train)


ModuleNotFoundError: No module named 'cuml'

In [113]:
y_pred = model.predict(X_val)

In [114]:
from sklearn.metrics import mean_squared_error, r2_score
mse_per_target = mean_squared_error(y_val, y_pred, multioutput='raw_values')
r2_per_target = r2_score(y_val, y_pred, multioutput='raw_values')

for i, (nutrient, mse, r2) in enumerate(zip(target_columns, mse_per_target, r2_per_target)):
    rmse = np.sqrt(mse)
    print(f"{nutrient}: RMSE = {rmse:.4f}, R² = {r2:.4f}")


N: RMSE = 500.9464, R² = 0.5565
P: RMSE = 44.3605, R² = 0.0026
K: RMSE = 206.1790, R² = 0.3963
Ca: RMSE = 1519.8603, R² = 0.8139
Mg: RMSE = 345.3575, R² = 0.6256
S: RMSE = 57.2457, R² = -12.1393
Fe: RMSE = 44.7641, R² = 0.5368
Mn: RMSE = 56.0986, R² = 0.3660
Zn: RMSE = 3.2225, R² = -1.4221
Cu: RMSE = 5.0317, R² = 0.0465
B: RMSE = 0.2301, R² = 0.5479


In [115]:
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f' RMSE: {rmse:.4f}')

 RMSE: 498.4645


In [132]:
test_predictions = model.predict(X_test)

In [133]:
# Split the predictions into separate columns
N_pred =  test_predictions[:, 0]  # Predictions for N
P_pred =  test_predictions[:, 1]  # Predictions for P
K_pred =  test_predictions[:, 2]  # Predictions for K
Ca_pred = test_predictions[:, 3]  # Predictions for Ca
Mg_pred = test_predictions[:, 4]  # Predictions for Mg
S_pred =  test_predictions[:, 5]  # Predictions for S
Fe_pred = test_predictions[:, 6]  # Predictions for Fe
Mn_pred = test_predictions[:, 7]  # Predictions for Mn
Zn_pred = test_predictions[:, 8]  # Predictions for Zn
Cu_pred = test_predictions[:, 9]  # Predictions for Cu
B_pred =  test_predictions[:, 10]  # Predictions for B


In [134]:
submission = pd.DataFrame({'PID': test_df['PID'], 'N': N_pred, 'P': P_pred, 'K': K_pred, 'Ca': Ca_pred, 'Mg': Mg_pred, 'S': S_pred, 'Fe': Fe_pred, 'Mn': Mn_pred, 'Zn': Zn_pred, 'Cu': Cu_pred, 'B': B_pred})
submission.head()

Unnamed: 0,PID,N,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B
0,ID_NGS9Bx,1275.8,12.1814,175.6,5156.1,1393.52,9.8262,123.198,180.96,1.7524,4.7226,0.2584
1,ID_YdVKXw,1403.0,3.6412,174.12,6350.96,2010.7,9.338,131.394,142.08,1.4722,4.1206,0.1798
2,ID_MZAlfE,1757.2,1.1348,186.0,5513.8,1893.22,9.6842,132.362,132.34,1.399,3.8888,0.1814
3,ID_GwCCMN,1724.4,3.6186,172.0,5595.56,1559.9,8.9602,129.884,146.5,1.4738,3.9196,0.1844
4,ID_K8sowf,1685.8,4.805,185.52,5645.46,1289.8,8.2768,121.6,140.14,1.6718,3.7414,0.164


In [135]:
# prompt: turn submission into a 3 column file that has the column PID, Nutrient, Value

submission_melted = submission.melt(id_vars=['PID'], var_name='Nutrient', value_name='Available_Nutrients_in_ppm')
submission_melted = submission_melted.sort_values('PID')
submission_melted.head()

Unnamed: 0,PID,Nutrient,Available_Nutrients_in_ppm
19869,ID_002W8m,Zn,2.5614
15033,ID_002W8m,Fe,226.33
2943,ID_002W8m,P,5.076
24705,ID_002W8m,B,0.3092
525,ID_002W8m,N,1988.6


In [136]:
# prompt: merge test_gap_df with submission_melted on PID and Nutrient
nutrient_df = pd.merge(test_gap_df, submission_melted, on=['PID', 'Nutrient'], how='left')


In [137]:
soil_depth = 20  # cm

# Calculate the Available_Nutrients_in_kg_ha
nutrient_df['Available_Nutrients_in_kg_ha'] = (nutrient_df['Available_Nutrients_in_ppm']
                                               * soil_depth * nutrient_df['BulkDensity'] * 0.1)

In [138]:
nutrient_df["Gap"] = nutrient_df["Required"] - nutrient_df["Available_Nutrients_in_kg_ha"]

In [139]:
nutrient_df['ID'] = nutrient_df['PID'] + "_" + nutrient_df['Nutrient']
nutrient_df = nutrient_df[['ID', 'Gap']]
nutrient_df.head()

Unnamed: 0,ID,Gap
0,ID_NGS9Bx_N,-2961.92
1,ID_NGS9Bx_P,10.76464
2,ID_NGS9Bx_K,-369.44
3,ID_NGS9Bx_Ca,-12362.64
4,ID_NGS9Bx_Mg,-3336.448


In [140]:
nutrient_df.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
