# Initializing environment

In [1]:
import numpy as np
import os

# To make the output stable across other runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_farmer_income_models"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``<fig_id>.<fig_extension>`` under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        Used both as the file extension and as the ``format`` passed to ``plt.savefig``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Loading dataset

In [3]:
import pandas as pd
# Read both sheets in a single pass over the workbook: passing a list of
# sheet names makes read_excel return a dict keyed by sheet name, so the
# file is opened and parsed once instead of once per sheet.
DATA_FILE = 'Pearl Challenge data with dictionary.xlsx'
sheets = pd.read_excel(DATA_FILE, sheet_name=['TrainData', 'TestData'])
train_df = sheets['TrainData']
test_df = sheets['TestData']

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the training sheet
train_df.describe()

Unnamed: 0,FarmerID,Zipcode,No_of_Active_Loan_In_Bureau,Avg_Disbursement_Amount_Bureau,Non_Agriculture_Income,Total_Land_For_Agriculture,K022-Proximity to nearest mandi (Km),K022-Proximity to nearest railway (Km),KO22-Village score based on socio-economic parameters (0 to 100),K022-Seasonal Average Rainfall (mm),R022-Seasonal Average Rainfall (mm),K021-Seasonal Average Rainfall (mm),R021-Seasonal Average Rainfall (mm),R020-Seasonal Average Rainfall (mm),Perc_of_house_with_6plus_room,Women_15_19_Mothers_or_Pregnant_at_time_of_survey,perc_of_pop_living_in_hh_electricity,perc_Households_with_Pucca_House_That_Has_More_Than_3_Rooms,mat_roof_Metal_GI_Asbestos_sheets,perc_of_Wall_material_with_Burnt_brick,Households_with_improved_Sanitation_Facility,perc_Households_do_not_have_KCC_With_The_Credit_Limit_Of_50k,K022-Total Geographical Area (in Hectares)-,K022-Net Agri area (in Ha)-,K022-Net Agri area (% of total geog area)-,Kharif Seasons Irrigated area in 2022,Kharif Seasons Cropping density in 2022,Kharif Seasons Agricultural performance in 2022,Kharif Seasons Agricultural Score in 2022,Kharif Seasons Seasonal average groundwater thickness (cm) in 2022,Kharif Seasons Seasonal average groundwater replenishment rate (cm) in 2022,Rabi Seasons Season Irrigated area in 2022,Rabi Seasons Cropping density in 2022,Rabi Seasons Agricultural performance in 2022,Rabi Seasons Agricultural Score in 2022,Rabi Seasons Seasonal average groundwater thickness (cm) in 2022,Rabi Seasons Seasonal average groundwater replenishment rate (cm) in 2022,Rabi Seasons Kharif Season Irrigated area in 2021,Rabi Seasons Cropping density in 2021,Rabi Seasons Agricultural performance in 2021,Rabi Seasons Agricultural Score in 2021,Rabi Seasons Seasonal average groundwater thickness (cm) in 2021,Rabi Seasons Seasonal average groundwater replenishment rate (cm) in 2021,Kharif Seasons Kharif Season Irrigated area in 2021,Kharif Seasons Cropping density in 2021,Kharif Seasons Agricultural performance in 
2021,Kharif Seasons Agricultural Score in 2021,Kharif Seasons Seasonal average groundwater thickness (cm) in 2021,Kharif Seasons Seasonal average groundwater replenishment rate (cm) in 2021,Kharif Seasons Kharif Season Irrigated area in 2020,Kharif Seasons Cropping density in 2020,Kharif Seasons Agricultural performance in 2020,Kharif Seasons Agricultural Score in 2020,Kharif Seasons Seasonal average groundwater thickness (cm) in 2020,Kharif Seasons Seasonal average groundwater replenishment rate (cm) in 2020,Rabi Seasons Kharif Season Irrigated area in 2020,Rabi Seasons Cropping density in 2020,Rabi Seasons Agricultural performance in 2020,Rabi Seasons Agricultural Score in 2020,Rabi Seasons Seasonal average groundwater thickness (cm) in 2020,Rabi Seasons Seasonal average groundwater replenishment rate (cm) in 2020,Night light index,Village score based on socio-economic parameters (Non normalised),Village score based on socio-economic parameters (0 to 100),Land Holding Index source (Total Agri Area/ no of people),Road density (Km/ SqKm),Target_Variable/Total Income
count,47970.0,47970.0,47970.0,27180.0,47970.0,47899.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47802.0,47802.0,47802.0,47802.0,47802.0,47802.0,47802.0,47802.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0,47970.0
mean,5488529000000000.0,486844.412529,1.780196,246383.4,280643.9,9.952977,11.50261,15.107913,37.387913,1394.703558,139.138688,930.843286,141.450563,184.277404,1.70353,7.129813,98.044752,5.472353,17.108636,39.974012,69.694892,3.414701,1343.381155,832.049711,64.111285,52.098883,56.66163,24.622157,26.63021,88.320294,25.338932,57.541951,47.554242,20.794567,23.078389,84.635742,17.36728,65.554164,45.361299,22.187941,32.662377,83.48676,20.997763,57.541951,47.554242,15.875101,19.844924,84.688148,22.076523,61.821247,43.171065,21.059823,26.27866,86.241283,26.136203,65.124157,46.26026,23.828276,25.177085,84.655071,21.216232,0.930509,24.762485,37.387913,0.683805,2.85037,1222255.0
std,2602973000000000.0,120222.366317,2.733938,711115.7,1707941.0,6.903007,7.515967,13.984214,7.484105,424.028028,109.354315,253.501524,78.434748,125.439381,1.744805,4.460903,2.926635,4.124985,20.764845,19.38844,11.993306,4.031244,1335.892566,866.949345,18.739129,17.773827,18.23331,12.852373,8.738836,15.53576,7.232706,28.54597,27.162464,11.846883,8.817123,14.894818,6.440266,27.085117,25.262738,11.232275,10.559737,14.481376,7.731492,28.54597,27.162464,13.121163,7.727002,15.032286,6.576946,29.331604,25.453386,14.264316,10.667087,15.759384,8.1981,29.41177,26.216106,12.140449,9.083251,14.77761,7.074566,0.058456,4.617983,7.484105,1.980722,6.126023,2073935.0
min,1000336000000000.0,122103.0,0.0,0.0,0.0,0.0,0.0,0.0,8.177678,530.37,17.49,369.65,16.38,19.14,0.0,0.0,68.4,0.0,0.0,0.0,29.2,0.0,9.57,0.89,0.5,0.0,0.0,0.0,0.72,20.34,6.35,0.0,0.0,0.0,0.0,20.87,1.69,0.0,0.0,0.0,0.0,23.51,0.45,0.0,0.0,0.0,0.01,25.01,1.16,0.0,0.0,0.0,0.0,24.3,2.49,0.0,0.0,0.0,0.0,22.37,0.56,0.43,6.73864,8.177678,0.0,0.0,29000.0
25%,3210170000000000.0,445302.25,0.0,57902.62,0.0,5.0,6.2,3.4,32.980125,1057.19,63.91,750.6,84.15,100.91,0.6,4.52,97.4,2.73,2.0,26.24,62.33,0.79,492.38,284.7,54.18,40.11,45.33,14.03,20.7,81.55,21.19,33.49,25.36,11.75,16.85,76.84,13.87,50.5,25.1575,13.94,26.4,78.22,15.84,33.49,25.36,5.04,14.15,75.47,17.97,39.49,22.32,9.22,18.53,78.66,21.2,46.51,25.98,14.65,18.82,74.77,17.41,0.92,22.04271,32.980125,0.156027,0.0,720000.0
50%,5488973000000000.0,483119.0,1.0,129097.8,100000.0,10.0,10.7,11.4,37.776894,1375.8,103.03,933.79,125.07,142.8,1.17,5.8,98.95,4.3,8.19,39.91,71.3,1.75,901.07,551.93,69.78,51.2,57.47,24.28,26.9,92.51,25.57,64.67,47.85,20.97,23.47,89.06,16.06,75.115,46.84,21.41,33.6,88.07,21.61,64.67,47.85,12.79,19.53,88.96,22.24,72.61,42.58,18.55,26.91,90.18,26.76,76.75,45.99,23.15,26.41,89.12,20.37,0.95,25.002501,37.776894,0.30405,0.46,950000.0
75%,7750388000000000.0,521109.0,2.0,284025.2,250000.0,12.0,15.975,23.6,41.990679,1687.49,175.19,1104.68,180.54,232.17,2.22,8.66,99.6,6.85,23.12,51.5775,77.3,4.71,1778.79,1077.94,78.26,64.03,69.32,34.13,32.69,98.11,29.23,82.71,68.9,29.19,29.29,94.37,19.06,87.0375,65.11,29.1375,39.77,93.18,26.06,82.71,68.9,24.26,24.86,94.67,26.38,86.47,62.79,31.27,33.75,97.01,31.33,87.98,67.72,32.31,31.53,95.38,25.18,0.97,27.60257,41.990679,0.583235,3.42,1295000.0
max,9999402000000000.0,855117.0,62.0,80000000.0,71282300.0,151.0,44.2,93.7,100.0,2569.1,733.76,2239.36,621.47,936.98,16.04,25.0,100.0,44.31,88.23,96.36,97.1,25.53,30599.39,8527.61,96.7,97.06,96.35,67.97,55.86,178.83,52.06,97.94,97.97,60.78,50.96,165.39,50.67,97.99,97.96,68.11,68.2,157.91,53.77,97.94,97.97,71.14,51.18,182.61,47.79,97.99,97.96,71.96,59.37,176.24,62.69,97.98,97.59,67.82,57.62,157.7,56.88,1.0,63.396566,100.0,112.603896,198.55,80000000.0


In [5]:
# Report the number of rows in each loaded sheet
for label, frame in (("TrainData size:", train_df), ("TestData size:", test_df)):
    print(label, len(frame))

TrainData size: 47970
TestData size: 9986


# Preprocessing data

In [6]:
# Response column, plus the columns that are excluded from the feature matrix
target = 'Target_Variable/Total Income'
drop_cols = ['FarmerID', 'Zipcode', 'CITY', 'DISTRICT', 'VILLAGE', 'Location']

# Training labels and features
y = train_df[target]
X = train_df.drop(columns=[target, *drop_cols])

# Test features: the same columns are removed as for the training frame
X_test = test_df.drop(columns=[target, *drop_cols])

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Column groups are chosen by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

# Split the raw rows BEFORE fitting the preprocessor. Fitting on the full
# frame would let the validation rows influence the imputation values,
# scaling statistics and one-hot categories (data leakage), which makes the
# validation metrics optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the preprocessing from the training rows only, then apply it as-is
# to the validation rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Full transformed matrix, kept under its original name for any cell that
# still expects it; it is produced by the train-fitted preprocessor.
X_preprocessed = preprocessor.transform(X)

# Building neural network

## 1. Simple NN

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Input dimension
input_dim = X_train.shape[1]

# The input shape is declared with an explicit Input object (the same way the
# other models in this notebook do it) instead of the legacy `input_dim`
# keyword on the first Dense layer.
model = Sequential([
    Input(shape=(input_dim,)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# y_train is the raw (unscaled) target, so the reported MSE is in squared
# target units; MAPE is tracked as a scale-independent metric.
model.compile(optimizer='adam', loss='mse', metrics=['mape'])

# Train with early stopping: stop after 5 epochs without val_loss improvement
# and roll back to the best weights seen.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[es],
    verbose=1
)


Epoch 1/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - loss: 5811745062912.0000 - mape: 93.7802 - val_loss: 4640217759744.0000 - val_mape: 31.7988
Epoch 2/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 4012301877248.0000 - mape: 36.8082 - val_loss: 4334829436928.0000 - val_mape: 46.5302
Epoch 3/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 3338294263808.0000 - mape: 47.9003 - val_loss: 4296961163264.0000 - val_mape: 49.0209
Epoch 4/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 4475205451776.0000 - mape: 49.9652 - val_loss: 4265132949504.0000 - val_mape: 46.9450
Epoch 5/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 3758292205568.0000 - mape: 48.4714 - val_loss: 4230997606400.0000 - val_mape: 48.2474
Epoch 6/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

# Predict on the validation split (the rows held out by train_test_split)
y_pred = model.predict(X_val)

# The model was fitted on the unscaled target, so predictions are already in
# the target's own units and no inverse transform is needed. The true values
# are reshaped to a column to match the (n, 1) prediction array.
y_true = np.asarray(y_val).reshape(-1, 1)

# Compute MAPE (sklearn returns a fraction; convert to percent)
real_mape = mean_absolute_percentage_error(y_true, y_pred) * 100
print(f"Real Validation MAPE: {real_mape:.2f}%")


## 2. Overkill NN (using BatchNormalization+Dropout)

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def build_better_nn(input_dim):
    """Build and compile a three-stage fully connected regression network.

    Each stage is Dense(relu) -> BatchNormalization -> Dropout, with
    512, 256 and 128 units and dropout rates 0.3, 0.3 and 0.2 respectively,
    followed by a single-unit output layer without activation.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    Model
        Keras functional model compiled with Adam (learning rate 0.001),
        MSE loss and the MAPE metric.
    """
    inputs = Input(shape=(input_dim,))

    # (units, dropout rate) for each hidden stage, in order
    stage_specs = ((512, 0.3), (256, 0.3), (128, 0.2))

    hidden = inputs
    for units, drop_rate in stage_specs:
        hidden = Dense(units, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(drop_rate)(hidden)

    outputs = Dense(1)(hidden)

    network = Model(inputs, outputs)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mape'],
    )
    return network


In [11]:
# Callbacks: stop once val_loss has not improved for 15 epochs (restoring the
# best weights), and halve the learning rate after 5 epochs without improvement.
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)

model = build_better_nn(X_train.shape[1])

history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=200,
    verbose=1,
    callbacks=[early_stop, lr_scheduler],
    validation_data=(X_val, y_val),
)


Epoch 1/200
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step - loss: 5915786870784.0000 - mape: 99.9997 - val_loss: 5886010982400.0000 - val_mape: 99.9985 - learning_rate: 0.0010
Epoch 2/200
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 6683652259840.0000 - mape: 99.9909 - val_loss: 5884581773312.0000 - val_mape: 99.9847 - learning_rate: 0.0010
Epoch 3/200
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 5648730292224.0000 - mape: 99.9685 - val_loss: 5882362462208.0000 - val_mape: 99.9501 - learning_rate: 0.0010
Epoch 4/200
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 6393287409664.0000 - mape: 99.9340 - val_loss: 5879887298560.0000 - val_mape: 99.9100 - learning_rate: 0.0010
Epoch 5/200
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 6773626372096.0000 - mape: 99.8887 - val_loss: 5876894138368.0000 - val_map

## 3. A bit more balanced NN

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

def build_tuned_nn(input_dim):
    """Build and compile a two-hidden-layer regression network.

    Hidden layers are Dense(relu) -> Dropout with 128 and 64 units and
    dropout rates 0.2 and 0.1, followed by a single-unit output layer
    without activation.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.

    Returns
    -------
    Model
        Keras functional model compiled with Adam (learning rate 0.001),
        MSE loss and the MAPE metric.
    """
    inputs = Input(shape=(input_dim,))

    hidden = inputs
    # (units, dropout rate) for each hidden layer, in order
    for units, drop_rate in ((128, 0.2), (64, 0.1)):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(drop_rate)(hidden)

    outputs = Dense(1)(hidden)

    network = Model(inputs, outputs)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mape'],
    )
    return network

In [11]:
model = build_tuned_nn(X_train.shape[1])

# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    # `callbacks` is documented as a list of Callback instances; pass a list
    # rather than a bare callback, matching the other training cells.
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 5437347332096.0000 - mape: 93.1649 - val_loss: 4591176908800.0000 - val_mape: 30.3239
Epoch 2/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 3878629146624.0000 - mape: 36.7078 - val_loss: 4326619086848.0000 - val_mape: 47.8752
Epoch 3/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 3807391252480.0000 - mape: 47.9186 - val_loss: 4288770998272.0000 - val_mape: 49.5586
Epoch 4/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 4164289822720.0000 - mape: 49.5149 - val_loss: 4252754509824.0000 - val_mape: 48.7693
Epoch 5/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 5207862280192.0000 - mape: 49.5958 - val_loss: 4215910957056.0000 - val_mape: 47.3944
Epoch 6/100
[1m1200/1200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/s