In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Load train & test feature datasets
train_features = pd.read_csv("/content/final_train1.csv")
test_features = pd.read_csv("/content/final_test1.csv")

# Load target variables
train_target = pd.read_excel("/content/Target_train.xlsx")
test_target = pd.read_excel("/content/Target_test.xlsx") # Fixed path from // to /

# Merge target with feature datasets
final_train = train_features.copy()
final_train["log_TotalExpense"] = train_target["log_TotalExpense"]

final_test = test_features.copy()
final_test["log_totalexpense"] = test_target["log_totalexpense"]

# Separate Features (X) and Target Variable (y)
X_train = final_train
y_train = final_train["log_TotalExpense"]

X_test = final_test
y_test = final_test["log_totalexpense"]

print(X_train.isnull().sum())  # Shows missing values per column
print(y_train.isnull().sum())  # Shows missing values in the target variable


HH Size (For FDQ)         0
Male_Count                0
Female_Count              0
Other_Count               0
Age_0_18                  0
                      ...  
Unnamed: 408         209396
Unnamed: 409         209396
Unnamed: 410         209396
Unnamed: 411         209396
log_TotalExpense          0
Length: 413, dtype: int64
0


In [4]:
X_train.columns

Index(['HH Size (For FDQ)', 'Male_Count', 'Female_Count', 'Other_Count',
       'Age_0_18', 'Age_18_60', 'Age_60_above',
       'Highest educational level attained_head',
       'Total year of education completed_head',
       'Highest educational level attained_median',
       ...
       'Marital Status_head_3', 'Marital Status_head_4',
       'Whether used internet from any location during last 30 days_1',
       'Whether used internet from any location during last 30 days_2',
       'Unnamed: 407', 'Unnamed: 408', 'Unnamed: 409', 'Unnamed: 410',
       'Unnamed: 411', 'log_TotalExpense'],
      dtype='object', length=413)

In [5]:
# List of columns to drop
columns_to_drop = ['Unnamed: 407', 'Unnamed: 408', 'Unnamed: 409', 'Unnamed: 410', 'Unnamed: 411', 'log_TotalExpense']

# Drop from both train and test sets (if they exist)
X_train.drop(columns=[col for col in columns_to_drop if col in X_train.columns], inplace=True)
X_test.drop(columns=[col for col in columns_to_drop if col in X_test.columns], inplace=True)

In [10]:
# Ensure Train & Test Have the Same Features
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

# Build the ANN Model
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # Input Layer
    keras.layers.Dense(64, activation='relu'),  # Hidden Layer 1
    keras.layers.Dense(32, activation='relu'),  # Hidden Layer 2
    keras.layers.Dense(1)  # Output Layer (Regression)
])

# Compile the Model
model.compile(optimizer='adam', loss='mse')

# Train the Model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 4ms/step - loss: 1.0380 - val_loss: 0.1283
Epoch 2/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - loss: 0.1211 - val_loss: 0.1138
Epoch 3/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4ms/step - loss: 0.1129 - val_loss: 0.1130
Epoch 4/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - loss: 0.1103 - val_loss: 0.1293
Epoch 5/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - loss: 0.1067 - val_loss: 0.1103
Epoch 6/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4ms/step - loss: 0.1035 - val_loss: 0.1125
Epoch 7/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4ms/step - loss: 0.1015 - val_loss: 0.1040
Epoch 8/50
[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - loss: 0.1017 - val_loss: 0.1008
Epoch 9/50
[1m6

In [7]:
# Make Predictions
y_pred = model.predict(X_test).flatten()

import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

# Calculate R² Score (Higher is better)
r2 = r2_score(y_test, y_pred)

# Calculate RMSE (Lower is better)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print Accuracy Metrics
print(f"ANN R² Score: {r2:.4f}")
print(f"ANN RMSE: {rmse:.2f}")
print(f"ANN MAPE : {mape:4f}")


[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
ANN R² Score: 0.7258
ANN RMSE: 0.31
ANN MAPE : 0.023968


In [8]:
# Make Predictions
y_pred = model.predict(X_train).flatten()

[1m6544/6544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step


In [9]:
# Calculate R² Score (Higher is better)
r2 = r2_score(y_train, y_pred)

# Calculate RMSE (Lower is better)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mape = mean_absolute_percentage_error(y_train, y_pred)

# Print Accuracy Metrics
print(f"ANN R² Score: {r2:.4f}")
print(f"ANN RMSE: {rmse:.2f}")
print(f"ANN MAPE : {mape:4f}")

ANN R² Score: 0.7509
ANN RMSE: 0.29
ANN MAPE : 0.022829
