In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Read the feature tables (CSV) and the target workbooks (Excel), train first, then test.
train_features = pd.read_csv("/content/final_train1.csv")
train_target = pd.read_excel("/content/Target_train.xlsx")
test_features = pd.read_csv("/content/final_test1.csv")
test_target = pd.read_excel("/content/Target_test.xlsx")

# Attach the target to a copy of each feature table (pandas aligns on the row index).
# Note that the target column is spelled "log_TotalExpense" in the train workbook
# and "log_totalexpense" in the test workbook.
final_train = train_features.assign(log_TotalExpense=train_target["log_TotalExpense"])
final_test = test_features.assign(log_totalexpense=test_target["log_totalexpense"])

# Assuming your DataFrame is named 'df'
def delete_unnamed_columns(df):
  """Return ``df`` without the columns whose label starts with ``Unnamed:``.

  Only a leading ``Unnamed:`` counts; a label that merely contains the text
  somewhere else is kept. Labels are compared through ``str()``, so frames
  with integer or mixed-type column labels are handled instead of failing
  on the ``.str`` accessor.

  Args:
    df: The pandas DataFrame to filter. It is not modified.

  Returns:
    A new DataFrame holding every column of ``df`` whose label does not
    start with ``Unnamed:``, in the original order.
  """
  # One boolean per column, in column order; True means "keep".
  keep = [not str(label).startswith('Unnamed:') for label in df.columns]
  return df.loc[:, keep]

# Strip the 'Unnamed:' columns from both merged tables.
final_train = delete_unnamed_columns(final_train)
final_test = delete_unnamed_columns(final_test)

# The target column is spelled differently in the two tables
# ("log_TotalExpense" in train, "log_totalexpense" in test).
y_train = final_train["log_TotalExpense"]
y_test = final_test["log_totalexpense"]

# Drop BOTH spellings from BOTH feature matrices. Dropping only the lowercase
# name left "log_TotalExpense" inside X_train, i.e. the label leaked into the
# training features. errors="ignore" keeps this safe when a spelling is absent.
target_columns = ["log_TotalExpense", "log_totalexpense"]
X_train = final_train.drop(columns=target_columns, errors="ignore")
X_test = final_test.drop(columns=target_columns, errors="ignore")

print(X_train.isnull().sum())  # Shows missing values per column
print(y_train.isnull().sum())  # Shows missing values in the target variable


HH Size (For FDQ)                                                0
Male_Count                                                       0
Female_Count                                                     0
Other_Count                                                      0
Age_0_18                                                         0
                                                                ..
Marital Status_head_3                                            0
Marital Status_head_4                                            0
Whether used internet from any location during last 30 days_1    0
Whether used internet from any location during last 30 days_2    0
log_TotalExpense                                                 0
Length: 250, dtype: int64
0


In [None]:
# Write both feature matrices to CSV (relative paths, row index omitted).
for frame, csv_name in ((X_train, "X_train_cleaned.csv"), (X_test, "X_test_cleaned.csv")):
    frame.to_csv(csv_name, index=False)

print("CSV files saved successfully!")


CSV files saved successfully!


In [None]:

# Report the (rows, columns) shape of each merged table.
for label, table in (("Train", final_train), ("Test", final_test)):
    print(f"Final {label} Shape:", table.shape)


Final Train Shape: (209396, 250)
Final Test Shape: (52350, 250)


In [None]:
# Display the dimensions of the training feature matrix as (rows, columns).
X_train.shape

(209396, 249)

In [None]:
# Keep only the columns present in both frames so train and test inputs match.
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

# Seed the Python, NumPy and backend random generators so weight
# initialisation and batch shuffling are repeatable between runs.
keras.utils.set_random_seed(42)

# Build the ANN: three ReLU hidden layers (128 -> 64 -> 32) and one linear
# output unit for regression. The input width is declared with keras.Input
# rather than the `input_shape` layer argument, which Keras warns about when
# used inside a Sequential model.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(128, activation='relu'),  # Hidden Layer 1
    keras.layers.Dense(64, activation='relu'),   # Hidden Layer 2
    keras.layers.Dense(32, activation='relu'),   # Hidden Layer 3
    keras.layers.Dense(1),                       # Output Layer (Regression)
])

# Mean squared error on the (log-scale) target, optimised with Adam.
model.compile(optimizer='adam', loss='mse')

# NOTE(review): the test split is passed as validation data purely to monitor
# val_loss per epoch. Do not use it for early stopping or tuning, or the test
# metrics stop being an unbiased estimate.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    verbose=1,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - loss: 0.8685 - val_loss: 0.1251
Epoch 2/50


KeyboardInterrupt: 

In [None]:
# Predict on the test features and flatten the prediction array to 1-D so it
# lines up element-wise with y_test.
y_pred = model.predict(X_test).flatten()

# R² (higher is better), RMSE (lower is better) and MAPE on the test split.
# r2_score, mean_squared_error and mean_absolute_percentage_error come from the
# sklearn.metrics import in the first cell; the duplicate imports that used to
# sit here were removed.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print Accuracy Metrics
print(f"ANN R² Score: {r2:.4f}")
print(f"ANN RMSE: {rmse:.2f}")
# ".4f" = four decimals; the previous "4f" set a minimum width of 4 instead.
print(f"ANN MAPE : {mape:.4f}")


[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
ANN R² Score: 0.6893
ANN RMSE: 0.33
ANN MAPE : 0.026011


In [None]:
# Run the model on the training features and rebind y_pred to those
# predictions as a flat 1-D array.
y_pred = model.predict(X_train).flatten()

[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step


In [None]:
# Score y_pred against the training target: R² (higher is better),
# RMSE (lower is better) and MAPE.
r2 = r2_score(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mape = mean_absolute_percentage_error(y_train, y_pred)

# Print Accuracy Metrics
print(f"ANN R² Score: {r2:.4f}")
print(f"ANN RMSE: {rmse:.2f}")
# ".4f" = four decimals; the previous "4f" set a minimum width of 4 instead.
print(f"ANN MAPE : {mape:.4f}")

ANN R² Score: 0.7093
ANN RMSE: 0.32
ANN MAPE : 0.025184


In [None]:
# Feature subset applied to both splits. The list is written once and reused
# so the train and test selections cannot drift apart.
selected_features = [
    "State_22",
    "State_11",
    "State_21",
    "State_16",
    "State_12",
    "State_20",
    "State_19",
    "State_23",
    "Is_HH_Have_Mobile_handset_0",
    "Is_HH_Have_Motorcar_jeep_van_0",
    "Is_HH_Have_Refrigerator_0",
    "Whether used internet from any location during last 30 days_1",
    "Is_HH_Have_Motorcycle_scooter_0",
    "Is_HH_Have_Washing_machine_0",
    "Sector_1",
    "Is_online_Clothing_Purchased_Last365_0",
    "Marital Status_head_2",
    "HH Size (For FDQ)",
    "Is_HH_Have_Television_0",
    "Is_online_Mobile_Handset_Purchased_Last365_0",
    "Is_online_Personal_Goods_Purchased_Last365_0",
    "State_13",
    "Household Type_9",
]

# Selecting with a list of labels raises KeyError if any name is missing,
# so a typo in the list fails loudly here rather than downstream.
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
# Keep only the columns present in both frames so train and test inputs match.
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

# Seed the Python, NumPy and backend random generators so weight
# initialisation and batch shuffling are repeatable between runs.
keras.utils.set_random_seed(42)

# Build the ANN: three ReLU hidden layers (128 -> 64 -> 32) and one linear
# output unit for regression. The input width is declared with keras.Input
# rather than the `input_shape` layer argument, which Keras warns about when
# used inside a Sequential model.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(128, activation='relu'),  # Hidden Layer 1
    keras.layers.Dense(64, activation='relu'),   # Hidden Layer 2
    keras.layers.Dense(32, activation='relu'),   # Hidden Layer 3
    keras.layers.Dense(1),                       # Output Layer (Regression)
])

# Mean squared error on the (log-scale) target, optimised with Adam.
model.compile(optimizer='adam', loss='mse')

# NOTE(review): the test split is passed as validation data purely to monitor
# val_loss per epoch. Do not use it for early stopping or tuning, or the test
# metrics stop being an unbiased estimate.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    verbose=1,
)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 2.2615 - val_loss: 0.1532
Epoch 2/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 0.1414 - val_loss: 0.1444
Epoch 3/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - loss: 0.1394 - val_loss: 0.1418
Epoch 4/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - loss: 0.1366 - val_loss: 0.1318
Epoch 5/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 0.1340 - val_loss: 0.1313
Epoch 6/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 0.1333 - val_loss: 0.1329
Epoch 7/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - loss: 0.1319 - val_loss: 0.1318
Epoch 8/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - loss: 0.1333 - val_loss: 0.1300
Epoch 9/50
[1m6544/6544[0

In [None]:
# Predict on the test features and flatten the prediction array to 1-D so it
# lines up element-wise with y_test.
y_pred = model.predict(X_test).flatten()

# R² (higher is better), RMSE (lower is better) and MAPE on the test split.
# r2_score, mean_squared_error and mean_absolute_percentage_error come from the
# sklearn.metrics import in the first cell; the duplicate imports that used to
# sit here were removed.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print Accuracy Metrics
print(f"ANN R² Score: {r2:.4f}")
print(f"ANN RMSE: {rmse:.2f}")
# ".4f" = four decimals; the previous "4f" set a minimum width of 4 instead.
print(f"ANN MAPE : {mape:.4f}")


[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
ANN R² Score: 0.6353
ANN RMSE: 0.36
ANN MAPE : 0.028114


In [None]:
# Run the model on the training features and rebind y_pred to those
# predictions as a flat 1-D array.
y_pred = model.predict(X_train).flatten()

[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step


In [None]:
# Score y_pred against the training target: R² (higher is better),
# RMSE (lower is better) and MAPE.
r2 = r2_score(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mape = mean_absolute_percentage_error(y_train, y_pred)

# Print Accuracy Metrics
print(f"ANN R² Score: {r2:.4f}")
print(f"ANN RMSE: {rmse:.2f}")
# ".4f" = four decimals; the previous "4f" set a minimum width of 4 instead.
print(f"ANN MAPE : {mape:.4f}")

ANN R² Score: 0.6403
ANN RMSE: 0.35
ANN MAPE : 0.027826


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Keep only the columns present in both frames so train and test inputs match.
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

# Verify Columns Match
print("Train Columns:", X_train.columns.tolist())
print("Test Columns:", X_test.columns.tolist())

# Train XGBoost Model
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Train Gradient Boosting Model
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)

# Test-set metrics for both models. MAPE is computed inline, in percent,
# instead of through a local function named mean_absolute_percentage_error:
# that name would shadow the scikit-learn function imported in the first cell
# (which returns a fraction) and change the ANN cells' output on re-run.
r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb) ** 0.5
mape_xgb = np.mean(np.abs((y_test - y_pred_xgb) / y_test)) * 100

# These three values were printed below but never computed in this cell,
# which raised NameError on a fresh kernel.
r2_gbr = r2_score(y_test, y_pred_gbr)
rmse_gbr = mean_squared_error(y_test, y_pred_gbr) ** 0.5
mape_gbr = np.mean(np.abs((y_test - y_pred_gbr) / y_test)) * 100

# Print Results
print(f"XGBoost - R²: {r2_xgb:.4f}, RMSE: {rmse_xgb:.4f}, MAPE: {mape_xgb:.2f}%")
print(f"Gradient Boosting - R²: {r2_gbr:.4f}, RMSE: {rmse_gbr:.4f}, MAPE: {mape_gbr:.2f}%")

Train Columns: ['HH Size (For FDQ)', 'Male_Count', 'Female_Count', 'Other_Count', 'Age_0_18', 'Age_18_60', 'Age_60_above', 'Highest educational level attained_head', 'Total year of education completed_head', 'Highest educational level attained_median', 'Total year of education completed_median', 'No. of days stayed away from home during last 30 days_avg', 'No. of meals usually taken in a day_avg', 'No. of meals taken during last 30 days from school, balwadi etc._avg', 'No. of meals taken during last 30 days from employer as perquisites or part of wage_avg', 'No. of meals taken during last 30 days others_avg', 'No. of meals taken during last 30 days on payment_avg', 'No. of meals taken during last 30 days at home_avg', 'Sector_1', 'Sector_2', 'State_1', 'State_2', 'State_3', 'State_4', 'State_5', 'State_6', 'State_7', 'State_8', 'State_9', 'State_10', 'State_11', 'State_12', 'State_13', 'State_14', 'State_15', 'State_16', 'State_17', 'State_18', 'State_19', 'State_20', 'State_21', 'Stat

In [None]:
# Rebind y_pred_gbr to the gradient-boosting predictions for the training
# features, so it can be compared against y_train.
y_pred_gbr = gbr.predict(X_train)

In [None]:
# Training-set metrics for both boosted models.
# XGBoost is scored on its own training predictions; previously the "XGBoost"
# numbers were computed from y_pred_gbr, so both printed lines were identical.
y_pred_xgb_train = xgb.predict(X_train)

# MAPE is computed inline, in percent, rather than through a local function
# named mean_absolute_percentage_error, which would shadow the scikit-learn
# function of the same name imported in the first cell.
r2_xgb = r2_score(y_train, y_pred_xgb_train)
rmse_xgb = mean_squared_error(y_train, y_pred_xgb_train) ** 0.5
mape_xgb = np.mean(np.abs((y_train - y_pred_xgb_train) / y_train)) * 100

# y_pred_gbr is expected to hold the gradient-boosting predictions for X_train
# (set by the preceding cell); a length mismatch with y_train raises here.
r2_gbr = r2_score(y_train, y_pred_gbr)
rmse_gbr = mean_squared_error(y_train, y_pred_gbr) ** 0.5
mape_gbr = np.mean(np.abs((y_train - y_pred_gbr) / y_train)) * 100

# Print Results
print(f"XGBoost - R²: {r2_xgb:.4f}, RMSE: {rmse_xgb:.4f}, MAPE: {mape_xgb:.2f}%")
print(f"Gradient Boosting - R²: {r2_gbr:.4f}, RMSE: {rmse_gbr:.4f}, MAPE: {mape_gbr:.2f}%")

XGBoost - R²: 0.7001, RMSE: 0.3221, MAPE: 2.55%
Gradient Boosting - R²: 0.7001, RMSE: 0.3221, MAPE: 2.55%


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

# Keep only the columns present in both frames so train and test inputs match.
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

# Train Ridge Regression Model (L2 Regularization)
ridge = Ridge(alpha=1.0)  # alpha controls regularization strength
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

# Test-set metrics. MAPE is computed inline, in percent, rather than through a
# local function named mean_absolute_percentage_error, which would shadow the
# scikit-learn function of the same name imported in the first cell.
r2_ridge = r2_score(y_test, y_pred_ridge)
rmse_ridge = mean_squared_error(y_test, y_pred_ridge) ** 0.5
mape_ridge = np.mean(np.abs((y_test - y_pred_ridge) / y_test)) * 100

# Print Results
print(f"Ridge Regression - R²: {r2_ridge:.4f}, RMSE: {rmse_ridge:.4f}, MAPE: {mape_ridge:.2f}%")


Ridge Regression - R²: 0.6510, RMSE: 0.3490, MAPE: 2.76%


In [None]:
# Rebind y_pred_ridge to the Ridge predictions for the training features.
y_pred_ridge = ridge.predict(X_train)

# Training-set metrics. MAPE is computed inline, in percent, rather than
# through a local function named mean_absolute_percentage_error, which would
# shadow the scikit-learn function of the same name imported in the first cell.
r2_ridge = r2_score(y_train, y_pred_ridge)
rmse_ridge = mean_squared_error(y_train, y_pred_ridge) ** 0.5
mape_ridge = np.mean(np.abs((y_train - y_pred_ridge) / y_train)) * 100

# Print Results
print(f"Ridge Regression - R²: {r2_ridge:.4f}, RMSE: {rmse_ridge:.4f}, MAPE: {mape_ridge:.2f}%")


Ridge Regression - R²: 0.6541, RMSE: 0.3460, MAPE: 2.74%


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Keep only the columns present in both frames so train and test inputs match.
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

# Define Base Models
# n_jobs=-1 lets the random forest build its trees on all available cores;
# with a fixed random_state the fitted forest is the same as with one core.
base_models = [
    ('RandomForest', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    ('GradientBoost', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)),
    ('XGBoost', XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42))
]

# Define Meta-Model (Ridge Regression)
meta_model = Ridge(alpha=1.0)

# Create Stacking Regressor: the meta-model is fitted on 5-fold
# cross-validated predictions of the base models.
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)

# Train Stacking Model
stacking_model.fit(X_train, y_train)

# Make Predictions
y_pred_stack = stacking_model.predict(X_test)

# Test-set metrics. MAPE is computed inline, in percent, rather than through a
# local function named mean_absolute_percentage_error, which would shadow the
# scikit-learn function of the same name imported in the first cell.
r2_stack = r2_score(y_test, y_pred_stack)
rmse_stack = mean_squared_error(y_test, y_pred_stack) ** 0.5
mape_stack = np.mean(np.abs((y_test - y_pred_stack) / y_test)) * 100

# Print Results
print(f"Stacking Regressor - R²: {r2_stack:.4f}, RMSE: {rmse_stack:.4f}, MAPE: {mape_stack:.2f}%")
