In [1]:
import os
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'


In [3]:
import sys
sys.argv.append("-Xfrozen_modules=off")


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load Data
df = pd.read_csv("NFHS_5_India_Districts_Factsheet_Data.csv")

# Replace non-numeric values with NaN
df.replace(r'[*()\-,]', '', regex=True, inplace=True)

# Convert all possible columns to numeric
df = df.apply(pd.to_numeric, errors='coerce')


In [5]:
df.head()

Unnamed: 0,District Names,State/UT,Number of Households surveyed,Number of Women age 15-49 years interviewed,Number of Men age 15-54 years interviewed,Female population age 6 years and above who ever attended school (%),Population below age 15 years (%),"Sex ratio of the total population (females per 1,000 males)","Sex ratio at birth for children born in the last five years (females per 1,000 males)",Children under age 5 years whose birth was registered with the civil authority (%),...,Men age 15 years and above wih Mildly elevated blood pressure (Systolic 140-159 mm of Hg and/or Diastolic 90-99 mm of Hg) (%),Men age 15 years and above wih Moderately or severely elevated blood pressure (Systolic ≥160 mm of Hg and/or Diastolic ≥100 mm of Hg) (%),Men age 15 years and above wih Elevated blood pressure (Systolic ≥140 mm of Hg and/or Diastolic ≥90 mm of Hg) or taking medicine to control blood pressure (%),Women (age 30-49 years) Ever undergone a screening test for cervical cancer (%),Women (age 30-49 years) Ever undergone a breast examination for breast cancer (%),Women (age 30-49 years) Ever undergone an oral cavity examination for oral cancer (%),Women age 15 years and above who use any kind of tobacco (%),Men age 15 years and above who use any kind of tobacco (%),Women age 15 years and above who consume alcohol (%),Men age 15 years and above who consume alcohol (%)
0,,,882,764,125,78.0,23.0,973,927,98.0,...,32.9,11.1,47.0,13.4,13.2,5.4,63.5,76.8,29.6,64.5
1,,,874,789,108,82.7,19.8,950,844,100.0,...,22.6,6.0,32.2,1.7,0.3,15.8,46.8,70.5,5.1,45.3
2,,,868,844,134,84.7,21.0,967,935,96.5,...,17.9,6.1,26.9,1.3,0.7,8.0,19.6,50.8,1.7,32.8
3,,,874,780,100,60.0,20.7,1140,1163,95.0,...,14.4,5.5,22.9,1.0,0.2,3.8,7.1,21.3,0.6,28.3
4,,,902,853,134,56.0,20.6,1114,898,95.4,...,14.8,6.4,25.1,4.9,0.6,7.3,11.4,21.5,0.8,32.3


In [7]:

# Apply KNN Imputer for missing values
imputer = KNNImputer(n_neighbors=5)
df.iloc[:, 2:] = imputer.fit_transform(df.iloc[:, 2:])


In [9]:
# Normalize numeric columns
scaler = MinMaxScaler()
df.iloc[:, 2:] = scaler.fit_transform(df.iloc[:, 2:])

 0.86872587 0.86357786 0.83912484 0.82110682 0.78249678 0.83912484
 0.8970399  0.84555985 0.86100386 0.83912484 0.89060489 0.96010296
 0.93564994 0.94851995 0.997426   0.92921493 0.98841699 0.99485199
 0.91634492 0.97168597 0.92535393 0.98198198 0.38996139 0.42342342
 0.96653797 0.94337194 0.998713   0.96138996 0.96267696 0.94208494
 0.88030888 0.90604891 0.9021879  0.9034749  0.9047619  0.90733591
 0.90862291 0.90990991 0.90862291 0.90733591 0.89317889 0.90604891
 0.89060489 0.9047619  0.8996139  0.87644788 0.88803089 0.86615187
 0.91119691 0.90733591 0.9047619  0.88545689 0.91248391 0.8996139
 0.87902188 0.90604891 0.90862291 0.91119691 0.91248391 0.90862291
 0.8957529  0.8996139  0.91119691 0.92792793 0.92792793 0.94079794
 0.8983269  0.90733591 0.94208494 0.94465894 0.92149292 0.88931789
 0.87387387 0.92406692 0.89317889 0.9009009  0.9021879  0.8996139
 0.93693694 0.94208494 0.91634492 0.91377091 0.96911197 0.8957529
 0.96911197 0.98584299 0.97683398 0.97297297 0.95881596 0.9639639

In [11]:
# Rename target variable
anaemia_col = 'All women age 15-49 years who are anaemic22 (%)'
df.rename(columns={anaemia_col: 'anaemia_rate'}, inplace=True)


In [13]:
print(df.columns.tolist())

['District Names', 'State/UT', 'Number of Households surveyed', 'Number of Women age 15-49 years interviewed', 'Number of Men age 15-54 years interviewed', 'Female population age 6 years and above who ever attended school (%)', 'Population below age 15 years (%)', ' Sex ratio of the total population (females per 1,000 males)', 'Sex ratio at birth for children born in the last five years (females per 1,000 males)', 'Children under age 5 years whose birth was registered with the civil authority (%)', 'Deaths in the last 3 years registered with the civil authority (%)', 'Population living in households with electricity (%)', 'Population living in households with an improved drinking-water source1 (%)', 'Population living in households that use an improved sanitation facility2 (%)', 'Households using clean fuel for cooking3 (%)', 'Households using iodized salt (%)', 'Households with any usual member covered under a health insurance/financing scheme (%)', 'Children age 5 years who attended 

In [15]:
df = df.drop(columns=[col for col in df.columns if "anaemic22" in col], errors='ignore')


In [17]:
print(df.columns.tolist())


['District Names', 'State/UT', 'Number of Households surveyed', 'Number of Women age 15-49 years interviewed', 'Number of Men age 15-54 years interviewed', 'Female population age 6 years and above who ever attended school (%)', 'Population below age 15 years (%)', ' Sex ratio of the total population (females per 1,000 males)', 'Sex ratio at birth for children born in the last five years (females per 1,000 males)', 'Children under age 5 years whose birth was registered with the civil authority (%)', 'Deaths in the last 3 years registered with the civil authority (%)', 'Population living in households with electricity (%)', 'Population living in households with an improved drinking-water source1 (%)', 'Population living in households that use an improved sanitation facility2 (%)', 'Households using clean fuel for cooking3 (%)', 'Households using iodized salt (%)', 'Households with any usual member covered under a health insurance/financing scheme (%)', 'Children age 5 years who attended 

In [19]:
# Check correlation of anaemia-related columns with the target
anaemia_cols = [
    'Non-pregnant women age 15-49 years who are anaemic (<12.0 g/dl)22 (%)',
    'Pregnant women age 15-49 years who are anaemic (<11.0 g/dl)22 (%)',

]

# Compute correlation with target
correlation_with_target = df[anaemia_cols + ['anaemia_rate']].corr()

print(correlation_with_target['anaemia_rate'].sort_values(ascending=False))


anaemia_rate                                                             1.000000
Non-pregnant women age 15-49 years who are anaemic (<12.0 g/dl)22 (%)    0.999306
Pregnant women age 15-49 years who are anaemic (<11.0 g/dl)22 (%)        0.687800
Name: anaemia_rate, dtype: float64


In [21]:
# Feature Selection using Random Forest Importance
X = df.drop(columns=['District Names', 'State/UT', 'anaemia_rate','Non-pregnant women age 15-49 years who are anaemic (<12.0 g/dl)22 (%)','Pregnant women age 15-49 years who are anaemic (<11.0 g/dl)22 (%)', 'All women age 15-19 years who are anaemic22 (%)'], errors='ignore')
y = df['anaemia_rate']


In [23]:
# Split data into training and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
df.head()

Unnamed: 0,District Names,State/UT,Number of Households surveyed,Number of Women age 15-49 years interviewed,Number of Men age 15-54 years interviewed,Female population age 6 years and above who ever attended school (%),Population below age 15 years (%),"Sex ratio of the total population (females per 1,000 males)","Sex ratio at birth for children born in the last five years (females per 1,000 males)",Children under age 5 years whose birth was registered with the civil authority (%),...,Men age 15 years and above wih Mildly elevated blood pressure (Systolic 140-159 mm of Hg and/or Diastolic 90-99 mm of Hg) (%),Men age 15 years and above wih Moderately or severely elevated blood pressure (Systolic ≥160 mm of Hg and/or Diastolic ≥100 mm of Hg) (%),Men age 15 years and above wih Elevated blood pressure (Systolic ≥140 mm of Hg and/or Diastolic ≥90 mm of Hg) or taking medicine to control blood pressure (%),Women (age 30-49 years) Ever undergone a screening test for cervical cancer (%),Women (age 30-49 years) Ever undergone a breast examination for breast cancer (%),Women (age 30-49 years) Ever undergone an oral cavity examination for oral cancer (%),Women age 15 years and above who use any kind of tobacco (%),Men age 15 years and above who use any kind of tobacco (%),Women age 15 years and above who consume alcohol (%),Men age 15 years and above who consume alcohol (%)
0,,,0.861004,0.390036,0.482143,0.605948,0.202312,0.377816,0.796795,0.958678,...,1.0,0.550802,0.934343,0.577586,0.90411,0.341772,0.899291,0.948509,0.691589,0.942899
1,,,0.850708,0.407829,0.40625,0.693309,0.109827,0.337955,0.76657,1.0,...,0.626812,0.278075,0.560606,0.073276,0.020548,1.0,0.662411,0.863144,0.119159,0.661786
2,,,0.842986,0.446975,0.522321,0.730483,0.144509,0.367418,0.799709,0.927686,...,0.456522,0.283422,0.426768,0.056034,0.047945,0.506329,0.276596,0.596206,0.03972,0.47877
3,,,0.850708,0.401423,0.370536,0.271375,0.135838,0.667244,0.882739,0.896694,...,0.32971,0.251337,0.325758,0.043103,0.013699,0.240506,0.099291,0.196477,0.014019,0.412884
4,,,0.886744,0.453381,0.522321,0.197026,0.132948,0.622184,0.786235,0.904959,...,0.344203,0.299465,0.381313,0.211207,0.041096,0.462025,0.160284,0.199187,0.018692,0.471449


In [27]:
from sklearn.feature_selection import SelectFromModel

In [53]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Fit a RandomForest model for feature selection
feature_selector = RandomForestRegressor(n_estimators=100, random_state=42)
feature_selector.fit(X_train, y_train)

# Get feature importances
importances = feature_selector.feature_importances_

# Set a threshold
threshold = np.percentile(importances, 50)

# Select features with importance above the threshold
selector = SelectFromModel(feature_selector, threshold=threshold, prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_temp)

# Get selected feature names and their importance values
selected_features = X_train.columns[selector.get_support()]
selected_importances = importances[selector.get_support()]

# Create a DataFrame for better visualization
selected_features_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': selected_importances
}).sort_values(by='Importance', ascending=False)


print(f"Selected {len(selected_features_df)} Features:")
print(selected_features_df)

Selected 72 Features:
                                              Feature  Importance
56  Children age 6-59 months who are anaemic (11.0...    0.428486
6   Population living in households that use an im...    0.041926
13  Women age 15-19 years who were already mothers...    0.039453
38  Children age 12-23 months who have received 3 ...    0.034972
57  Women  age 15 years and above with high (141-1...    0.023805
..                                                ...         ...
50  Non-breastfeeding children age 6-23 months rec...    0.002675
62  Women age 15 years and above wih Moderately or...    0.002623
63  Women age 15 years and above wih Elevated bloo...    0.002593
29  Average out-of-pocket expenditure per delivery...    0.002549
27  Mothers who consumed iron folic acid for 100 d...    0.002462

[72 rows x 2 columns]


In [55]:
X_train.columns = X_train.columns.astype(str)
X_temp.columns = X_temp.columns.astype(str)

In [57]:
X_train.columns = X_train.columns.str.replace(r'[\[\]<>]', '', regex=True)
X_temp.columns = X_temp.columns.str.replace(r'[\[\]<>]', '', regex=True)

In [59]:
print(X_train.columns)


Index(['Number of Households surveyed',
       'Number of Women age 15-49 years interviewed',
       'Number of Men age 15-54 years interviewed',
       'Female population age 6 years and above who ever attended school (%)',
       'Population below age 15 years (%)',
       ' Sex ratio of the total population (females per 1,000 males)',
       'Sex ratio at birth for children born in the last five years (females per 1,000 males)',
       'Children under age 5 years whose birth was registered with the civil authority (%)',
       'Deaths in the last 3 years registered with the civil authority (%)',
       'Population living in households with electricity (%)',
       ...
       'Men age 15 years and above wih Mildly elevated blood pressure (Systolic 140-159 mm of Hg and/or Diastolic 90-99 mm of Hg) (%)',
       'Men age 15 years and above wih Moderately or severely elevated blood pressure (Systolic ≥160 mm of Hg and/or Diastolic ≥100 mm of Hg) (%)',
       'Men age 15 years and above w

In [37]:
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable


In [38]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [63]:
import optuna
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Split data into train, validation, and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=42)

# Define models (Linear Regression is included without tuning)
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Extra Trees': ExtraTreesRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Hyperparameter optimization using Optuna
best_models = {}
best_params = {}

# Main guard to avoid multiprocessing issues in Colab or Jupyter
if __name__ == "__main__":
    for name, model in models.items():
        if name == 'LinearRegression':
            best_models[name] = model.fit(X_train, y_train)
            print(f"Skipping hyperparameter tuning for {name}")
        else:
            def objective(trial):
                params = {}
                if name == 'DecisionTreeRegressor':
                    params = {
                        'max_depth': trial.suggest_int('max_depth', 5, 20),
                        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
                    }
                elif name == 'Random Forest':
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
                        'max_depth': trial.suggest_int('max_depth', 10, 30),
                        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
                    }
                elif name == 'Gradient Boosting':
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
                        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                        'max_depth': trial.suggest_int('max_depth', 3, 7)
                    }
                elif name == 'AdaBoost':
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
                        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1)
                    }
                elif name == 'Extra Trees':
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
                        'max_depth': trial.suggest_int('max_depth', 10, 30),
                        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4)
                    }
                elif name == 'XGBoost':
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
                        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                        'max_depth': trial.suggest_int('max_depth', 3, 7)
                    }
                
                model.set_params(**params)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                return r2_score(y_val, y_pred)

            # Use TPESampler for stability and set n_jobs=1 for safe execution
            study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
            study.optimize(objective, n_trials=20, n_jobs=1)

            best_models[name] = model.set_params(**study.best_params)
            best_params[name] = study.best_params

            print(f"Best Params for {name}: {study.best_params}")

# Evaluate on test set
print("\nFinal Model Performance After Optuna Hyperparameter Tuning:")
print(f"{'Model':<20}{'R2 Score':<15}{'RMSE':<15}")
print("=" * 50)
for name, model in best_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name:<20}{r2:.4f}{rmse:.4f}")


[I 2025-03-29 00:39:34,952] A new study created in memory with name: no-name-37ac679a-dca1-4346-9daf-f46423534844
[I 2025-03-29 00:39:34,972] Trial 0 finished with value: 0.2340012160433329 and parameters: {'max_depth': 6, 'min_samples_split': 7}. Best is trial 0 with value: 0.2340012160433329.
[I 2025-03-29 00:39:35,005] Trial 1 finished with value: 0.28914863783816835 and parameters: {'max_depth': 17, 'min_samples_split': 10}. Best is trial 1 with value: 0.28914863783816835.
[I 2025-03-29 00:39:35,032] Trial 2 finished with value: 0.21133552456611393 and parameters: {'max_depth': 6, 'min_samples_split': 6}. Best is trial 1 with value: 0.28914863783816835.
[I 2025-03-29 00:39:35,059] Trial 3 finished with value: 0.17265550815984199 and parameters: {'max_depth': 18, 'min_samples_split': 7}. Best is trial 1 with value: 0.28914863783816835.
[I 2025-03-29 00:39:35,090] Trial 4 finished with value: 0.28504930853121724 and parameters: {'max_depth': 10, 'min_samples_split': 5}. Best is trial

Skipping hyperparameter tuning for LinearRegression


[I 2025-03-29 00:39:35,172] Trial 7 finished with value: 0.31008835808246615 and parameters: {'max_depth': 20, 'min_samples_split': 3}. Best is trial 7 with value: 0.31008835808246615.
[I 2025-03-29 00:39:35,200] Trial 8 finished with value: 0.23340064995418452 and parameters: {'max_depth': 8, 'min_samples_split': 2}. Best is trial 7 with value: 0.31008835808246615.
[I 2025-03-29 00:39:35,225] Trial 9 finished with value: 0.29225153441818974 and parameters: {'max_depth': 20, 'min_samples_split': 6}. Best is trial 7 with value: 0.31008835808246615.
[I 2025-03-29 00:39:35,301] Trial 10 finished with value: 0.16677704281730787 and parameters: {'max_depth': 14, 'min_samples_split': 4}. Best is trial 7 with value: 0.31008835808246615.
[I 2025-03-29 00:39:35,337] Trial 11 finished with value: 0.23533061912031872 and parameters: {'max_depth': 20, 'min_samples_split': 8}. Best is trial 7 with value: 0.31008835808246615.
[I 2025-03-29 00:39:35,367] Trial 12 finished with value: 0.24738614872192

Best Params for DecisionTreeRegressor: {'max_depth': 20, 'min_samples_split': 3}


[I 2025-03-29 00:39:37,786] Trial 0 finished with value: 0.5726457997603451 and parameters: {'n_estimators': 200, 'max_depth': 17, 'min_samples_split': 7}. Best is trial 0 with value: 0.5726457997603451.
[I 2025-03-29 00:39:39,084] Trial 1 finished with value: 0.5605632152257953 and parameters: {'n_estimators': 100, 'max_depth': 14, 'min_samples_split': 3}. Best is trial 0 with value: 0.5726457997603451.
[I 2025-03-29 00:39:41,741] Trial 2 finished with value: 0.570665107269049 and parameters: {'n_estimators': 250, 'max_depth': 19, 'min_samples_split': 8}. Best is trial 0 with value: 0.5726457997603451.
[I 2025-03-29 00:39:45,348] Trial 3 finished with value: 0.5688203644699806 and parameters: {'n_estimators': 300, 'max_depth': 12, 'min_samples_split': 7}. Best is trial 0 with value: 0.5726457997603451.
[I 2025-03-29 00:39:48,010] Trial 4 finished with value: 0.5670213126905063 and parameters: {'n_estimators': 200, 'max_depth': 25, 'min_samples_split': 4}. Best is trial 0 with value: 0

Best Params for Random Forest: {'n_estimators': 250, 'max_depth': 22, 'min_samples_split': 2}


[I 2025-03-29 00:40:29,561] Trial 0 finished with value: 0.5682982536305633 and parameters: {'n_estimators': 300, 'learning_rate': 0.07436320715443351, 'max_depth': 4}. Best is trial 0 with value: 0.5682982536305633.
[I 2025-03-29 00:40:32,458] Trial 1 finished with value: 0.5838791136831292 and parameters: {'n_estimators': 250, 'learning_rate': 0.058355750417975215, 'max_depth': 4}. Best is trial 1 with value: 0.5838791136831292.
[I 2025-03-29 00:40:34,927] Trial 2 finished with value: 0.48695456825441497 and parameters: {'n_estimators': 150, 'learning_rate': 0.014620386913034383, 'max_depth': 7}. Best is trial 1 with value: 0.5838791136831292.
[I 2025-03-29 00:40:37,258] Trial 3 finished with value: 0.5862977635735153 and parameters: {'n_estimators': 200, 'learning_rate': 0.07463160445383774, 'max_depth': 5}. Best is trial 3 with value: 0.5862977635735153.
[I 2025-03-29 00:40:38,856] Trial 4 finished with value: 0.583369974012861 and parameters: {'n_estimators': 200, 'learning_rate':

Best Params for Gradient Boosting: {'n_estimators': 200, 'learning_rate': 0.08079452903025511, 'max_depth': 3}


[I 2025-03-29 00:41:12,580] Trial 0 finished with value: 0.4908263912275108 and parameters: {'n_estimators': 150, 'learning_rate': 0.044648401377417976}. Best is trial 0 with value: 0.4908263912275108.
[I 2025-03-29 00:41:13,312] Trial 1 finished with value: 0.4808506558006531 and parameters: {'n_estimators': 100, 'learning_rate': 0.0873985203284593}. Best is trial 0 with value: 0.4908263912275108.
[I 2025-03-29 00:41:14,617] Trial 2 finished with value: 0.4965695064572213 and parameters: {'n_estimators': 200, 'learning_rate': 0.08285235233321196}. Best is trial 2 with value: 0.4965695064572213.
[I 2025-03-29 00:41:16,079] Trial 3 finished with value: 0.4767636522812191 and parameters: {'n_estimators': 200, 'learning_rate': 0.019590517072036948}. Best is trial 2 with value: 0.4965695064572213.
[I 2025-03-29 00:41:17,556] Trial 4 finished with value: 0.5045457653338148 and parameters: {'n_estimators': 200, 'learning_rate': 0.03620245426519985}. Best is trial 4 with value: 0.504545765333

Best Params for AdaBoost: {'n_estimators': 200, 'learning_rate': 0.0877084036119575}


[I 2025-03-29 00:41:34,259] Trial 0 finished with value: 0.6381207397739209 and parameters: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.6381207397739209.
[I 2025-03-29 00:41:35,594] Trial 1 finished with value: 0.6519555930100571 and parameters: {'n_estimators': 300, 'max_depth': 26, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.6519555930100571.
[I 2025-03-29 00:41:36,267] Trial 2 finished with value: 0.6401358272367059 and parameters: {'n_estimators': 200, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.6519555930100571.
[I 2025-03-29 00:41:36,740] Trial 3 finished with value: 0.6347230167655702 and parameters: {'n_estimators': 150, 'max_depth': 16, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.6519555930100571.
[I 2025-03-29 00:41:37,456] Trial 4 finished with value: 0.6388455192708141 and parameters:

Best Params for Extra Trees: {'n_estimators': 150, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 1}


[I 2025-03-29 00:41:48,912] Trial 0 finished with value: 0.5655796290445918 and parameters: {'n_estimators': 250, 'learning_rate': 0.09774085254686468, 'max_depth': 5}. Best is trial 0 with value: 0.5655796290445918.
[I 2025-03-29 00:41:49,588] Trial 1 finished with value: 0.5534957624576473 and parameters: {'n_estimators': 150, 'learning_rate': 0.016680642018692536, 'max_depth': 5}. Best is trial 0 with value: 0.5655796290445918.
[I 2025-03-29 00:41:50,421] Trial 2 finished with value: 0.5447968773202079 and parameters: {'n_estimators': 100, 'learning_rate': 0.020430998577915763, 'max_depth': 6}. Best is trial 0 with value: 0.5655796290445918.
[I 2025-03-29 00:41:50,858] Trial 3 finished with value: 0.5789444882334782 and parameters: {'n_estimators': 100, 'learning_rate': 0.05202017401769758, 'max_depth': 5}. Best is trial 3 with value: 0.5789444882334782.
[I 2025-03-29 00:41:51,941] Trial 4 finished with value: 0.532573662991252 and parameters: {'n_estimators': 150, 'learning_rate': 

Best Params for XGBoost: {'n_estimators': 250, 'learning_rate': 0.08470065594868373, 'max_depth': 3}

Final Model Performance After Optuna Hyperparameter Tuning:
Model               R2 Score       RMSE           
LinearRegression    0.49520.0996
DecisionTreeRegressor0.30510.1168
Random Forest       0.56610.0923
Gradient Boosting   0.63070.0852
AdaBoost            0.53170.0959
Extra Trees         0.60010.0886
XGBoost             0.57330.0915


In [61]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Split the training set into train and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define hyperparameter grids (excluding Linear Regression)
param_grids = {
    'DecisionTreeRegressor': {
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20],
        'min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7]
    },
    'AdaBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1]
    },
    'Extra Trees': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'n_estimators': [100, 150, 200, 250, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7, 10, 15],
    }
}

# Define models (Linear Regression is included but not tuned)
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Extra Trees': ExtraTreesRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Dictionary to store best models and parameters
best_models = {}
best_params = {}

# Perform GridSearchCV for all models except Linear Regression
for name, model in models.items():
    if name == 'LinearRegression':
        best_models[name] = model.fit(X_train, y_train)
        print(f"Skipping hyperparameter tuning for {name}")
    else:
        print(f"Training {name} with GridSearchCV...")
        grid_search = GridSearchCV(model, param_grids[name], cv=3, scoring='r2', verbose=1, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Store best model and parameters
        best_models[name] = grid_search.best_estimator_
        best_params[name] = grid_search.best_params_

        # Validate on validation set
        val_pred = grid_search.best_estimator_.predict(X_val)
        val_r2 = r2_score(y_val, val_pred)
        print(f"Best Params for {name}: {grid_search.best_params_}")
        print(f"Best R2 Score on Validation for {name}: {val_r2}")

# Evaluate on test set
results = []
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    r2_score_best = r2_score(y_test, y_pred)
    rmse_best = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append((name, r2_score_best, rmse_best))

# Print final scores
print("\nFinal Model Performance After Hyperparameter Tuning:")
print(f"{'Model':<20}{'R2 Score':<15}{'RMSE':<15}")
print("=" * 50)
for model, r2, rmse in results:
    print(f"{model:<20}{r2:.4f}{rmse:.4f}")


Skipping hyperparameter tuning for LinearRegression
Training DecisionTreeRegressor with GridSearchCV...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Params for DecisionTreeRegressor: {'max_depth': 5, 'min_samples_split': 5}
Best R2 Score on Validation for DecisionTreeRegressor: 0.5461101201394132
Training Random Forest with GridSearchCV...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Params for Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best R2 Score on Validation for Random Forest: 0.6768543384033578
Training Gradient Boosting with GridSearchCV...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Params for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}
Best R2 Score on Validation for Gradient Boosting: 0.7414961909905804
Training AdaBoost with GridSearchCV...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Params for AdaBoost: {'learning_rate':

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import r2_score, mean_squared_error

# Define the ANN model
def create_ann(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.2))  # Adding dropout to reduce overfitting
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))  # Output layer for regression (no activation)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
    return model

# Initialize the model
input_dim = X_train.shape[1]  # Number of features
ann_model = create_ann(input_dim)

# Train the model
history = ann_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=1)

# Predict on test data
y_pred = ann_model.predict(X_test)

# Calculate R2 Score and RMSE
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print model performance
print("\nANN Model Performance on Test Data:")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# Plotting the training and validation loss
import matplotlib.pyplot as plt

plt.plot(history.history['mse'], label='Training MSE')
plt.plot(history.history['val_mse'], label='Validation MSE')
plt.title('Model MSE')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.show()
