In [1]:
# Imports
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
def remove_outlier(col):
    q1, q3 = col.quantile([0.25, 0.75])
    IQR = q3 - q1
    lwr_bound = q1 - (1.5 * IQR)
    upr_bound = q3 + (1.5 * IQR)
    return lwr_bound, upr_bound


In [3]:
ytr= pd.read_csv('Final_Dataset_after_temperature.csv')
temp=pd.read_csv('Final_Dataset_after_temperature.csv')

In [4]:
low, high = remove_outlier(ytr["Production_in_tons"])
print(low, high)

-12340.5 20879.5


In [5]:
ytr=ytr[ytr['Production_in_tons']<=high]
len(ytr)

143414

In [6]:
threshold = 20879.5
count_above_threshold = ytr[ytr['Production_in_tons'] > threshold].shape[0]

print(f"Number of values above {threshold}: {count_above_threshold}")


Number of values above 20879.5: 0


In [7]:
X=ytr.drop(["Yield_ton_per_hec", "Production_in_tons"],axis=1)
y=ytr["Production_in_tons"]

In [8]:
X_encoded=pd.get_dummies(X)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [10]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [11]:
params = {
    'objective': 'reg:squarederror',
    'max_depth': 10,  # Reduced max_depth to avoid overfitting
    'learning_rate': 0.1,  # Reduced learning rate to improve accuracy
    'n_estimators': 1000,
    'subsample': 0.8,  # To avoid overfitting
    'colsample_bytree': 0.8,  # To avoid overfitting
    'eval_metric': 'rmse',
    'early_stopping_rounds': 10  # Stops if no improvement over 10 rounds
}


In [12]:
model_xgb = xgb.train(params, dtrain, num_boost_round=1000)


Parameters: { "early_stopping_rounds", "n_estimators" } are not used.



In [13]:
y_pred_xgb = model_xgb.predict(dtest)


In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,accuracy_score

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'XGBoost Model - Mean Squared Error: {mse_xgb}')
print(f'XGBoost Model - Mean Absolute Error: {mae_xgb}')
print(f'XGBoost Model - R² Score: {r2_xgb}')


XGBoost Model - Mean Squared Error: 2630684.7686622743
XGBoost Model - Mean Absolute Error: 681.5307730484164
XGBoost Model - R² Score: 0.8502684248224954


In [15]:
# west bengal,rabi,Sesamum,152.54000000000002,22.28,244.0,95.0,0.38934426229508196
custom_input = {
    'State_Name': 'west bengal',
    'Crop_Type': 'rabi',
    'Crop': 'Sesamum',
    'rainfall': 152.54000000000002,
    'temperature': 22.28,
    'Area_in_hectares': 244.0,
}
custom_input_df = pd.DataFrame([custom_input])

In [16]:
custom_input_encoded = pd.get_dummies(custom_input_df)

# # Align the custom input with the training set to ensure it has the same columns
custom_input_encoded = custom_input_encoded.reindex(columns=X_encoded.columns, fill_value=0)
print(custom_input_encoded)


   rainfall  temperature  Area_in_hectares  \
0    152.54        22.28             244.0   

   State_Name_andaman and nicobar islands  State_Name_andhra pradesh  \
0                                       0                          0   

   State_Name_arunachal pradesh  State_Name_assam  State_Name_bihar  \
0                             0                 0                 0   

   State_Name_chandigarh  State_Name_chhattisgarh  ...  Crop_Turmeric  \
0                      0                        0  ...              0   

   Crop_Turnip  Crop_Urad  Crop_Varagu  Crop_Water Melon  Crop_Wheat  \
0            0          0            0                 0           0   

   Crop_Yam  Crop_other fibres  Crop_other misc. pulses  Crop_other oilseeds  
0         0                  0                        0                    0  

[1 rows x 154 columns]


In [17]:
custom_dmatrix = xgb.DMatrix(custom_input_encoded)

In [18]:
y_pred_custom = model_xgb.predict(custom_dmatrix)
print(f'Predicted Production in tons: {y_pred_custom[0]}')


Predicted Production in tons: 96.93362426757812


In [19]:
def pred(model,input_data):
    custom_input_df = pd.DataFrame([input_data])
    custom_input_encoded = pd.get_dummies(custom_input_df)

# # Align the custom input with the training set to ensure it has the same columns
    custom_input_encoded = custom_input_encoded.reindex(columns=X_encoded.columns, fill_value=0)
    custom_dmatrix = xgb.DMatrix(custom_input_encoded)
    y_pred_custom = model_xgb.predict(custom_dmatrix)
    print(f'Predicted Production in tons: {y_pred_custom[0]}')
custom_input = {
    'State_Name': 'andhra pradesh',
    'Crop_Type': 'kharif',
    'Crop': 'Arhar/Tur',
    'rainfall': 654.34,
    'temperature': 29.27,
    'Area_in_hectares': 1400,
}
pred(model_xgb,custom_input)

Predicted Production in tons: 699.2440795898438


In [20]:
!pip install lightgbm



In [26]:

import lightgbm as lgb

# Train LightGBM model
model_lgb = lgb.train(params, train_data, valid_sets=[train_data, test_data], num_boost_round=1000)

# Predictions
y_pred_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

# Evaluate LightGBM model
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
r2_lgb = r2_score(y_test, y_pred_lgb)

print(f'LightGBM Model - Mean Squared Error: {mse_lgb}')
print(f'LightGBM Model - Mean Absolute Error: {mae_lgb}')
print(f'LightGBM Model - R² Score: {r2_lgb}')

# Prediction function
def pred(model, input_data):
    custom_input_df = pd.DataFrame([input_data])
    custom_input_encoded = pd.get_dummies(custom_input_df)
    
    # Align the custom input with the training set to ensure it has the same columns
    custom_input_encoded = custom_input_encoded.reindex(columns=X_encoded.columns, fill_value=0)
    
    # Predict using LightGBM
    y_pred_custom = model.predict(custom_input_encoded, num_iteration=model.best_iteration)
    print(f'Predicted Production in tons: {y_pred_custom[0]}')

# Custom input example
custom_input = {
    'State_Name': 'west bengal',
    'Crop_Type': 'rabi',
    'Crop': 'Sesamum',
    'rainfall': 152.54000000000002,
    'temperature': 22.28,
    'Area_in_hectares': 244.0,
}

# Predict on custom input using LightGBM
pred(model_lgb, custom_input)




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 733
[LightGBM] [Info] Number of data points in the train set: 114731, number of used features: 131
[LightGBM] [Info] Start training from score 2416.933517
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[968]	training's rmse: 1386.2	valid_1's rmse: 1532.83
LightGBM Model - Mean Squared Error: 2349574.2922685314
LightGBM Model - Mean Absolute Error: 697.6537405736528
LightGBM Model - R² Score: 0.8662684849326003
Predicted Production in tons: 201.41211843406572
