In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv('data_final_yield_31_10_23.csv')

In [3]:
# Splitting the data
X = df.drop(['id', 'productivity'], axis=1)  # Features excluding 'id' and 'class'
y = df['productivity']  # Target variable

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [6]:
X_train

Unnamed: 0,red_4_month,red_5_month,red_6_month,red_8_month,red_9_month,elevation_contour,district_name,soil_name
0,0.0890,0.0730,0.1160,0.123000,0.0965,2312.0,23.0,30.0
1,0.0450,0.1540,0.2170,0.174000,0.1660,2146.0,5.0,30.0
2,0.0650,0.2190,0.2524,0.226533,0.1030,1801.0,18.0,36.0
3,0.0226,0.1360,0.0700,0.139000,0.0540,1618.0,33.0,46.0
4,0.0610,0.0974,0.0430,0.039000,0.1136,1621.0,13.0,46.0
...,...,...,...,...,...,...,...,...
8703,0.0450,0.0370,0.1160,0.235000,0.1880,3102.0,19.0,26.0
8704,0.0202,0.0430,0.0390,0.034000,0.0330,2208.0,23.0,10.0
8705,0.0566,0.1920,0.2480,0.101733,0.1070,1681.0,15.0,39.0
8706,0.0644,0.1246,0.1550,0.194000,0.1600,2192.0,23.0,30.0


In [7]:
def linear_regression():
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test, y_pred_test

def ridge_regression():
    model = Ridge(alpha=1)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def lasso_regression():
    model = Lasso(alpha=0.1)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def elastic_net_regression():
    model = ElasticNet(alpha=0.1, l1_ratio=0.5)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def svr():
    model = SVR(kernel='linear', C=1)
    model.fit(X_train, y_train.ravel())
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def decision_tree_regression():
    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def random_forest_regression():
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train, y_train.ravel())
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def gradient_boosting_regression():
    model = GradientBoostingRegressor(n_estimators=100)
    model.fit(X_train, y_train.ravel())
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def xgboost_regression():
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def lightgbm_regression():
    model = lgb.LGBMRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

def catboost_regression():
    model = CatBoostRegressor(verbose=0, n_estimators=100)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    return model, y_pred_train, y_pred_test

In [8]:
regressors = {
    "Linear Regression": linear_regression,
    "Ridge Regression": ridge_regression,
    "Lasso Regression": lasso_regression,
    "Elastic Net Regression": elastic_net_regression,
    "SVR": svr,
    "Decision Tree Regression": decision_tree_regression,
    "Random Forest Regression": random_forest_regression,
    "Gradient Boosting Regression": gradient_boosting_regression,
    "XGBoost Regression": xgboost_regression,
    "LightGBM Regression": lightgbm_regression,
    "CatBoost Regression": catboost_regression
}

In [9]:
results = {}
# Loop over each regressor and store the results
for name, func in regressors.items():
    returned_values = func()
    model = returned_values[0]
    y_pred_train = returned_values[1]
    y_pred_test = returned_values[2]
    
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    
    results[name] = {
        "Model": model,
        "Training MAE": train_mae,
        "Test MAE": test_mae
    }

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1617
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 8
[LightGBM] [Info] Start training from score 4.587332


In [10]:
results

{'Linear Regression': {'Model': LinearRegression(),
  'Training MAE': 2.8662459991504012,
  'Test MAE': 2.8674856111723432},
 'Ridge Regression': {'Model': Ridge(alpha=1),
  'Training MAE': 2.866490176035559,
  'Test MAE': 2.86786164854202},
 'Lasso Regression': {'Model': Lasso(alpha=0.1),
  'Training MAE': 2.880580055800497,
  'Test MAE': 2.8914127023133354},
 'Elastic Net Regression': {'Model': ElasticNet(alpha=0.1),
  'Training MAE': 2.881129256537723,
  'Test MAE': 2.891984518621914},
 'SVR': {'Model': SVR(C=1, kernel='linear'),
  'Training MAE': 73.80468375313436,
  'Test MAE': 73.19879775987567},
 'Decision Tree Regression': {'Model': DecisionTreeRegressor(),
  'Training MAE': 0.00033394579696836884,
  'Test MAE': 1.998235415709692},
 'Random Forest Regression': {'Model': RandomForestRegressor(),
  'Training MAE': 0.6343882929145614,
  'Test MAE': 1.6342878139335475},
 'Gradient Boosting Regression': {'Model': GradientBoostingRegressor(),
  'Training MAE': 2.082739385676072,
  'T