In [13]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [14]:
data = pd.read_csv('../India_95/Data_new.csv')

In [15]:
data.head()

Unnamed: 0,number,data_0,data_1,data_2,data_3,data_4,data_5,data_6,data_7,data_8,...,data_503,data_504,data_505,data_506,data_507,data_508,data_509,data_510,data_511,label
0,1,1.477283,0.447356,0.073814,1.297098,0.656682,0.650417,0.841603,0.30997,2.813105,...,0.108073,0.101143,2.041315,1.009901,0.673432,2.166417,1.044953,0.426222,3.917333,12.2
1,10,0.90476,0.298885,0.189516,2.444686,0.302006,1.07269,0.979308,0.682768,3.41947,...,0.449496,0.217842,1.449206,1.406352,0.064069,1.235531,0.763827,0.25797,2.00771,11.3
2,11,0.868473,0.243895,0.386576,1.392882,0.452042,1.437542,0.186288,0.395023,2.674868,...,0.332258,0.453314,0.249696,1.020971,0.435663,1.162609,0.818375,0.471542,2.835151,13.2
3,12,1.069151,0.486807,0.0,1.246638,0.457728,0.751029,0.140204,0.699481,1.768866,...,0.087432,0.241764,0.271259,1.057166,0.591804,1.222538,0.574695,0.43706,3.138011,10.6
4,13,0.735834,0.450854,0.382536,1.35563,0.46307,1.522966,0.071118,0.439999,3.768094,...,0.173838,0.760434,0.8919,1.248412,0.070847,0.817019,0.529763,0.251784,2.262109,10.6


In [16]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.3)

In [17]:
# Separate features and target variable in train data
X_train = train_data.drop(columns=['number','label'])
y_train = train_data['label']

# Separate features and target variable in test data
X_test = test_data.drop(columns=['number','label'])
y_test = test_data['label']

In [18]:
algorithms = {
    'Linear Regression': LinearRegression(),
    'SVM Regression': SVR(kernel='poly'),  # Adjust kernel as needed
    'RandomForest': RandomForestRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'knn': KNeighborsRegressor(),
    'LGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(),
    'Kernel Ridge Regressor': KernelRidge(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'XG Boost': XGBRegressor()
}

In [19]:
# Metric tables
metric_table_train = pd.DataFrame()
metric_table_test = pd.DataFrame()

In [20]:
# Run the algorithms ... create metrics and plots
for algorithm_name, algorithm in algorithms.items():

    # Train model
    algorithm.fit(X_train, y_train)

    # Train predictions
    y_train_pred = algorithm.predict(X_train)

    # Test predictions
    y_test_pred = algorithm.predict(X_test)

    # Train metrics
    r2_train = algorithm.score(X_train, y_train)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)

    # Test metrics
    r2_test = algorithm.score(X_test, y_test)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # Additional metrics using statsmodels for all algorithms
    residuals_train = y_train - y_train_pred
    residuals_test = y_test - y_test_pred

    durbin_watson_stat_train = sm.stats.durbin_watson(residuals_train)
    jb_stat_train, jb_p_value_train, _, _ = sm.stats.jarque_bera(residuals_train)

    durbin_watson_stat_test = sm.stats.durbin_watson(residuals_test)
    jb_stat_test, jb_p_value_test, _, _ = sm.stats.jarque_bera(residuals_test)

    # Update metric tables
    metric_table_train.at[algorithm_name, 'MAE'] = mae_train
    metric_table_train.at[algorithm_name, 'R-squared'] = r2_train
    metric_table_train.at[algorithm_name, 'MSE'] = mse_train
    metric_table_train.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_train
    metric_table_train.at[algorithm_name, 'Jarque-Bera'] = jb_stat_train
    metric_table_train.at[algorithm_name, 'JB P-value'] = jb_p_value_train

    metric_table_test.at[algorithm_name, 'MAE'] = mae_test
    metric_table_test.at[algorithm_name, 'R-squared'] = r2_test
    metric_table_test.at[algorithm_name, 'MSE'] = mse_test
    metric_table_test.at[algorithm_name, 'Durbin-Watson'] = durbin_watson_stat_test
    metric_table_test.at[algorithm_name, 'Jarque-Bera'] = jb_stat_test
    metric_table_test.at[algorithm_name, 'JB P-value'] = jb_p_value_test


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11623
[LightGBM] [Info] Number of data points in the train set: 66, number of used features: 512
[LightGBM] [Info] Start training from score 11.401515
Learning rate set to 0.026648
0:	learn: 2.0960878	total: 5.14ms	remaining: 5.14s
1:	learn: 2.0835202	total: 9.41ms	remaining: 4.7s
2:	learn: 2.0724721	total: 13ms	remaining: 4.33s
3:	learn: 2.0609848	total: 16.7ms	remaining: 4.15s
4:	learn: 2.0476763	total: 20.1ms	remaining: 4.01s
5:	learn: 2.0351602	total: 23.6ms	remaining: 3.9s
6:	learn: 2.0223660	total: 27.3ms	remaining: 3.87s
7:	learn: 2.0079931	total: 30.8ms	remaining: 3.81s
8:	learn: 1.9945094	total: 34.2ms	remaining: 3.77s
9:	learn: 1.9798698	total: 37.6ms	remaining: 3.72s
10:	learn: 1.9693106	total: 41ms	remaining: 3.68s
11:	learn: 1.9546607	total: 44.3ms	remaining: 3.65s
12:	learn: 1.9409259

In [21]:
# Display metrics in tables
print("Metrics - Train Data:\n")
print(metric_table_train.to_string())
print("-------------------------------------------------")

print("Metrics - Test Data:\n")
print(metric_table_test.to_string())

Metrics - Train Data:

                                 MAE  R-squared           MSE  Durbin-Watson  Jarque-Bera    JB P-value
Linear Regression       5.342528e-15   1.000000  4.619617e-29       1.784735     3.811179  1.487350e-01
SVM Regression          6.084120e-01   0.724047  1.230539e+00       2.269258    42.527368  5.825046e-10
RandomForest            6.298333e-01   0.864295  6.051391e-01       2.470521     1.146900  5.635778e-01
Gradient Boost          4.293676e-03   0.999994  2.743890e-05       2.216872     1.052993  5.906708e-01
knn                     1.620000e+00   0.135165  3.856509e+00       2.376085     1.361187  5.063164e-01
LGBM                    3.446505e-01   0.955807  1.970672e-01       2.270278     1.408758  4.944154e-01
CatBoost                6.718554e-04   1.000000  6.065093e-07       2.398706    11.259165  3.590074e-03
Kernel Ridge Regressor  1.100226e-01   0.994674  2.375029e-02       2.148958     5.429656  6.621633e-02
Elastic Net             1.737801e+00   0.

In [22]:
import pycaret

In [23]:
from pycaret.regression import *
s = setup(data, target='label', ignore_features=['number'], 
          #log_experiment=True, experiment_name="resnet_2", log_data=True, 
          pca=True, pca_method="linear", 
          remove_multicollinearity=True, multicollinearity_threshold=0.5, 
          #feature_selection=True, feature_selection_method="sequential", n_features_to_select='auto', 
          normalize=True, 
          session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Regression
3,Original data shape,"(95, 514)"
4,Transformed data shape,"(95, 67)"
5,Transformed train set shape,"(66, 67)"
6,Transformed test set shape,"(29, 67)"
7,Ignore features,1
8,Numeric features,512
9,Preprocess,True


In [24]:
best_r = compare_models(sort="RMSE")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,1.7633,4.4606,2.0506,-0.1791,0.168,0.1617,1.105
dummy,Dummy Regressor,1.7746,4.5818,2.0761,-0.1992,0.17,0.1622,0.03
llar,Lasso Least Angle Regression,1.7691,4.7046,2.1058,-0.2295,0.1722,0.162,0.028
lasso,Lasso Regression,1.7691,4.7046,2.1058,-0.2295,0.1722,0.162,0.133
rf,Random Forest Regressor,1.8149,4.7981,2.1296,-0.2728,0.174,0.1667,0.039
et,Extra Trees Regressor,1.8181,4.7959,2.1337,-0.2901,0.1746,0.1673,0.036
en,Elastic Net,1.7775,4.9602,2.1516,-0.2846,0.1753,0.1637,0.029
br,Bayesian Ridge,1.7728,4.9248,2.1665,-0.3618,0.1772,0.1653,0.028
gbr,Gradient Boosting Regressor,1.8263,5.0998,2.2118,-0.5088,0.1808,0.1684,0.053
xgboost,Extreme Gradient Boosting,1.8389,5.3403,2.2138,-0.4336,0.1808,0.1678,0.035
