In [None]:
!pip install lightgbm



In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error


In [2]:
# Load train & test feature datasets
train_features = pd.read_csv("/content/final_train1.csv")
test_features = pd.read_csv("/content/final_test1.csv")

# Load target variables
train_target = pd.read_excel("/content/Target_train.xlsx")
test_target = pd.read_excel("/content/Target_test.xlsx") # Fixed path from // to /


# Merge target with feature datasets
final_train = train_features.copy()
final_train["log_TotalExpense"] = train_target["log_TotalExpense"]

final_test = test_features.copy()
final_test["log_totalexpense"] = test_target["log_totalexpense"]

# Assuming your DataFrame is named 'df'
def delete_unnamed_columns(df):
  """Deletes all columns with 'Unnamed' in their name from a DataFrame.

  Args:
    df: The pandas DataFrame to modify.

  Returns:
    The DataFrame with 'Unnamed' columns removed.
  """
  df = df.loc[:, ~df.columns.str.contains('^Unnamed:')]
  return df

# Example usage:
final_train = delete_unnamed_columns(final_train)
final_test = delete_unnamed_columns(final_test)


# Separate Features (X) and Target Variable (y)
X_train = final_train
y_train = final_train["log_TotalExpense"]

X_test = final_test
y_test = final_test["log_totalexpense"]

# List of columns to drop
columns_to_drop = ['Unnamed: 407', 'Unnamed: 408', 'Unnamed: 409', 'Unnamed: 410', 'Unnamed: 411', 'log_TotalExpense']

# Drop from both train and test sets (if they exist)
X_train.drop(columns=[col for col in columns_to_drop if col in X_train.columns], inplace=True)
X_test.drop(columns=[col for col in columns_to_drop if col in X_test.columns], inplace=True)

In [3]:
# Rename columns by replacing special characters with underscores (_)
X_train.columns = X_train.columns.str.replace(r"[^\w]", "_", regex=True)
X_test.columns = X_test.columns.str.replace(r"[^\w]", "_", regex=True)

print("Updated Feature Names:", X_train.columns.tolist())


Updated Feature Names: ['HH_Size__For_FDQ_', 'Male_Count', 'Female_Count', 'Other_Count', 'Age_0_18', 'Age_18_60', 'Age_60_above', 'Highest_educational_level_attained_head', 'Total_year_of_education_completed_head', 'Highest_educational_level_attained_median', 'Total_year_of_education_completed_median', 'No__of_days_stayed_away_from_home_during_last_30_days_avg', 'No__of_meals_usually_taken_in_a_day_avg', 'No__of_meals_taken_during_last_30_days_from_school__balwadi_etc__avg', 'No__of_meals_taken_during_last_30_days_from_employer_as_perquisites_or_part_of_wage_avg', 'No__of_meals_taken_during_last_30_days_others_avg', 'No__of_meals_taken_during_last_30_days_on_payment_avg', 'No__of_meals_taken_during_last_30_days_at_home_avg', 'Sector_1', 'Sector_2', 'State_1', 'State_2', 'State_3', 'State_4', 'State_5', 'State_6', 'State_7', 'State_8', 'State_9', 'State_10', 'State_11', 'State_12', 'State_13', 'State_14', 'State_15', 'State_16', 'State_17', 'State_18', 'State_19', 'State_20', 'State_21

In [4]:
# Ensure both X_train and X_test have the same columns
X_train, X_test = X_train.align(X_test, join="inner", axis=1)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=42)
lgb_model.fit(X_train, y_train)  # Train with corrected features

y_pred = lgb_model.predict(X_test)  # Make predictions with aligned test set

print("Model trained and tested successfully!")


X_train shape: (209396, 249)
X_test shape: (52350, 249)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.193593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 988
[LightGBM] [Info] Number of data points in the train set: 209396, number of used features: 249
[LightGBM] [Info] Start training from score 9.764769
Model trained and tested successfully!


In [5]:
# Find extra/missing columns
train_cols = set(X_train.columns)
test_cols = set(X_test.columns)

extra_cols = test_cols - train_cols
missing_cols = train_cols - test_cols

print("Extra columns in X_test:", extra_cols)
print("Missing columns in X_test:", missing_cols)


Extra columns in X_test: set()
Missing columns in X_test: set()


In [6]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_squared_error

# Function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


In [7]:
# Define the model
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=42)

# Train the model
lgb_model.fit(X_train, y_train)

# Make predictions on test data
y_pred = lgb_model.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.192336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 988
[LightGBM] [Info] Number of data points in the train set: 209396, number of used features: 249
[LightGBM] [Info] Start training from score 9.764769


In [8]:
# Calculate metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

# Print results
print(f"✅ LightGBM R² Score: {r2:.4f}")
print(f"✅ LightGBM RMSE: {rmse:.4f}")
print(f"✅ LightGBM MAPE: {mape:.2f}")


✅ LightGBM R² Score: 0.6890
✅ LightGBM RMSE: 0.3295
✅ LightGBM MAPE: 2.60


In [9]:
# Make predictions on test data
y_pred = lgb_model.predict(X_train)


In [10]:
# Calculate metrics
r2 = r2_score(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mape = mean_absolute_percentage_error(y_train, y_pred)

# Print results
print(f"✅ LightGBM R² Score: {r2:.4f}")
print(f"✅ LightGBM RMSE: {rmse:.4f}")
print(f"✅ LightGBM MAPE: {mape:.2f}")


✅ LightGBM R² Score: 0.6953
✅ LightGBM RMSE: 0.3248
✅ LightGBM MAPE: 2.57
