<a href="https://colab.research.google.com/github/Pbhacks/AiGen/blob/main/Best_Priyant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Please note that kindly give it sometime after running the code for trials to complete and the submission file with parameters will auto download
# Scroll at bottom most of this page to check status of code compilation
# Install required packages
!pip install pandas numpy scikit-learn lightgbm xgboost catboost optuna joblib

# Import libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectFromModel
import optuna
import warnings
from joblib import Parallel, delayed
import time
import os
from google.colab import files

# Set output paths
OUTPUT_DIR = '/content'
SUBMISSION_PATH = os.path.join(OUTPUT_DIR, 'submission.csv')
PERFORMANCE_PATH = os.path.join(OUTPUT_DIR, 'model_performance.txt')

# Track execution time
start_time = time.time()
warnings.filterwarnings('ignore')

print("Loading data...")
# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Fix column names (strip spaces)
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

# Ensure correct target column name
target_col = "Fat_Content"
if target_col not in train_df.columns:
    raise KeyError(f"Column '{target_col}' not found in training dataset. Check column names.")

# Save test IDs for submission
test_ids = test_df["Id"]

print("Preprocessing data...")
# Feature Engineering - Optimized to create fewer but more meaningful features
def create_features(df, is_train=True):
    df_new = df.copy()

    # Create numerical feature interactions - only the most important ones
    numeric_cols = df_new.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if 'Id' in numeric_cols:
        numeric_cols.remove('Id')

    # For training data, remove target from numeric cols for feature engineering
    if is_train and target_col in numeric_cols:
        numeric_cols.remove(target_col)

    # Reduced set of transformations - focus on the most useful ones
    for col in numeric_cols:
        df_new[f'{col}_log'] = np.log1p(df_new[col].clip(lower=0))

    # Create fewer interactions - only between pairs likely to be informative
    # Limit to top 5 numeric columns if there are many
    top_numeric = numeric_cols[:min(5, len(numeric_cols))]
    for i in range(len(top_numeric)):
        for j in range(i+1, len(top_numeric)):
            col1, col2 = top_numeric[i], top_numeric[j]
            df_new[f'{col1}_times_{col2}'] = df_new[col1] * df_new[col2]
            df_new[f'{col1}_div_{col2}'] = df_new[col1] / (df_new[col2] + 1e-5)

    return df_new

# Create new features
train_df = create_features(train_df, is_train=True)
test_df = create_features(test_df, is_train=False)

# Handle missing values in training and test data with more efficient methods
for col in train_df.columns:
    if col != target_col and col != 'Id':
        if train_df[col].dtype == "object":
            most_frequent = train_df[col].mode()[0]
            train_df[col] = train_df[col].fillna(most_frequent)
        else:
            train_df[col] = train_df[col].fillna(train_df[col].median())

for col in test_df.columns:
    if col != 'Id':
        if test_df[col].dtype == "object":
            most_frequent = test_df[col].mode()[0] if not test_df[col].mode().empty else "Unknown"
            test_df[col] = test_df[col].fillna(most_frequent)
        else:
            test_df[col] = test_df[col].fillna(test_df[col].median())

# Advanced categorical encoding - focus on the most effective techniques
categorical_cols = [col for col in train_df.select_dtypes(include=['object']).columns
                    if col != target_col and col in test_df.columns]

# Target encoding for categorical features
for col in categorical_cols:
    # Calculate target mean for each category
    target_means = train_df.groupby(col)[target_col].mean()

    # Map these means to both train and test data
    train_df[f'{col}_target_mean'] = train_df[col].map(target_means)
    test_df[f'{col}_target_mean'] = test_df[col].map(target_means)

    # Fill missing values in the new column
    train_df[f'{col}_target_mean'] = train_df[f'{col}_target_mean'].fillna(train_df[target_col].mean())
    test_df[f'{col}_target_mean'] = test_df[f'{col}_target_mean'].fillna(train_df[target_col].mean())

# Label encoding for categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    label_encoders[col] = le
    if col in test_df.columns:
        # Handle unseen categories in test set
        test_categories = set(test_df[col].astype(str).unique())
        train_categories = set(le.classes_)
        new_categories = test_categories - train_categories

        if new_categories:
            test_df[col] = test_df[col].apply(lambda x: str(x) if str(x) in train_categories else 'Unknown')
            # Re-fit with the new 'Unknown' category if needed
            if 'Unknown' not in train_categories:
                train_df.loc[len(train_df)] = train_df.iloc[0].copy()
                train_df.loc[len(train_df)-1, col] = 'Unknown'
                le = LabelEncoder()
                train_df[col] = le.fit_transform(train_df[col].astype(str))
                label_encoders[col] = le

        test_df[col] = test_df[col].astype(str).map(
            lambda x: le.transform([x])[0] if x in le.classes_ else le.transform(['Unknown'])[0]
        )

# Ensure both datasets have the same columns (except target and Id)
train_features = [col for col in train_df.columns if col != target_col and col != 'Id']
test_features = [col for col in test_df.columns if col != 'Id']

# Find columns in train but not in test
for col in train_features:
    if col not in test_features:
        test_df[col] = 0  # Add missing columns to test

# Find columns in test but not in train
X_cols = [col for col in train_df.columns if col != target_col and col != 'Id']
for col in test_features:
    if col not in X_cols and col != 'Id':
        train_df[col] = 0  # Add missing columns to train

# Scale numeric features
numeric_cols = [col for col in train_df.select_dtypes(include=['int64', 'float64']).columns
                if col != target_col and col != 'Id']
common_numeric_cols = [col for col in numeric_cols if col in test_df.columns]

scaler = StandardScaler()
train_df[common_numeric_cols] = scaler.fit_transform(train_df[common_numeric_cols])
test_df[common_numeric_cols] = scaler.transform(test_df[common_numeric_cols])

# Split data
X = train_df.drop(columns=[target_col, 'Id'] if 'Id' in train_df.columns else [target_col])
y = train_df[target_col]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure X_train and test_df have the same columns for prediction
common_columns = [col for col in X_train.columns if col in test_df.columns]
test_df_features = test_df[common_columns]

print("Running feature selection...")
# Feature selection using LightGBM importance - with faster params
initial_model = lgb.LGBMRegressor(random_state=42, n_estimators=100, verbose=-1)
initial_model.fit(X_train, y_train)
feature_selector = SelectFromModel(initial_model, threshold='median')
feature_selector.fit(X_train, y_train)
X_train_selected = feature_selector.transform(X_train)
X_valid_selected = feature_selector.transform(X_valid)
test_df_selected = feature_selector.transform(test_df_features)

# Get selected feature names
selected_features_mask = feature_selector.get_support()
selected_features = [feature for feature, selected in zip(X_train.columns, selected_features_mask) if selected]
print(f"Selected {len(selected_features)} features out of {len(X_train.columns)}")

print("Optimizing LightGBM model...")
# Use faster objective function with fewer folds and early stopping
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-1, 10.0, log=True),
        'random_state': 42
    }

    # Use 3-fold CV instead of 5-fold for faster execution
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in kf.split(X_train_selected):
        X_fold_train, X_fold_val = X_train_selected[train_idx], X_train_selected[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            eval_metric='mae',
            callbacks=[lgb.early_stopping(20, verbose=False)]  # More aggressive early stopping
        )

        preds = model.predict(X_fold_val)
        mae = mean_absolute_error(y_fold_val, preds)
        cv_scores.append(mae)

    return np.mean(cv_scores)

# Run Optuna tuning with fewer trials
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)  # Further reduced for Colab

# Get best parameters
best_params = study.best_params
print("\n========== LightGBM Hyperparameter Tuning Results ==========")
print(f"Best Parameters: {best_params}")
print(f"Best Trial MAE: {study.best_trial.value:.4f}")

print("Training ensemble model...")
# Train models in parallel using joblib
def train_model(model_type, params, X_train, y_train):
    if model_type == 'lgb':
        model = lgb.LGBMRegressor(**params, random_state=42)
    elif model_type == 'xgb':
        model = xgb.XGBRegressor(**params, random_state=42)
    elif model_type == 'cb':
        model = cb.CatBoostRegressor(**params, random_seed=42, verbose=0)
    elif model_type == 'rf':
        model = RandomForestRegressor(**params, random_state=42)

    model.fit(X_train, y_train)
    return model

# 1. LightGBM parameters
lgb_params = best_params

# 2. XGBoost parameters
xgb_params = {
    'n_estimators': min(best_params['n_estimators'], 500),  # Cap at 500 for speed
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['learning_rate'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'reg_lambda': best_params['reg_lambda'],
    'reg_alpha': best_params['reg_alpha']
}

# 3. CatBoost parameters
cb_params = {
    'iterations': min(best_params['n_estimators'], 300),  # Cap at 300 for Colab
    'depth': best_params['max_depth'],
    'learning_rate': best_params['learning_rate']
}

# 4. Random Forest parameters
rf_params = {
    'n_estimators': 80,  # Further reduced for Colab
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 4
}

# Train models one by one (more reliable in Colab environment)
print("Training LightGBM model...")
lgb_model = train_model('lgb', lgb_params, X_train_selected, y_train)

print("Training XGBoost model...")
xgb_model = train_model('xgb', xgb_params, X_train_selected, y_train)

print("Training CatBoost model...")
cb_model = train_model('cb', cb_params, X_train_selected, y_train)

print("Training Random Forest model...")
rf_model = train_model('rf', rf_params, X_train_selected, y_train)

# Create a voting ensemble
ensemble = VotingRegressor([
    ('lgb', lgb_model),
    ('xgb', xgb_model),
    ('cb', cb_model),
    ('rf', rf_model)
])
print("Training ensemble model...")
ensemble.fit(X_train_selected, y_train)

# Evaluate models
print("\n========== Model Evaluation ==========")
lgb_preds = lgb_model.predict(X_valid_selected)
xgb_preds = xgb_model.predict(X_valid_selected)
cb_preds = cb_model.predict(X_valid_selected)
rf_preds = rf_model.predict(X_valid_selected)
ensemble_preds = ensemble.predict(X_valid_selected)

# Calculate MAE for each model
lgb_mae = mean_absolute_error(y_valid, lgb_preds)
xgb_mae = mean_absolute_error(y_valid, xgb_preds)
cb_mae = mean_absolute_error(y_valid, cb_preds)
rf_mae = mean_absolute_error(y_valid, rf_preds)
ensemble_mae = mean_absolute_error(y_valid, ensemble_preds)

print(f"LightGBM MAE: {lgb_mae:.4f}")
print(f"XGBoost MAE: {xgb_mae:.4f}")
print(f"CatBoost MAE: {cb_mae:.4f}")
print(f"Random Forest MAE: {rf_mae:.4f}")
print(f"Ensemble MAE: {ensemble_mae:.4f}")

# Choose the best model
best_model_name = "Ensemble"
best_model = ensemble
best_mae = ensemble_mae

if lgb_mae < best_mae:
    best_model_name = "LightGBM"
    best_model = lgb_model
    best_mae = lgb_mae

if xgb_mae < best_mae:
    best_model_name = "XGBoost"
    best_model = xgb_model
    best_mae = xgb_mae

if cb_mae < best_mae:
    best_model_name = "CatBoost"
    best_model = cb_model
    best_mae = cb_mae

if rf_mae < best_mae:
    best_model_name = "Random Forest"
    best_model = rf_model
    best_mae = rf_mae

print(f"\nBest Model: {best_model_name} with MAE: {best_mae:.4f}")

# Final prediction using the best model
print("\nGenerating final predictions...")
test_preds = best_model.predict(test_df_selected)

# Create a weighted prediction by averaging with the original LightGBM model
weighted_preds = 0.7 * test_preds + 0.3 * lgb_model.predict(test_df_selected)

# Save submission file
submission = pd.DataFrame({"Id": test_ids, "Fat_Content": weighted_preds})
submission.to_csv(SUBMISSION_PATH, index=False)

# Calculate execution time
end_time = time.time()
execution_time = end_time - start_time
minutes, seconds = divmod(execution_time, 60)

# Save performance metrics
with open(PERFORMANCE_PATH, "w") as f:
    f.write(f"Best Model: {best_model_name}\n")
    f.write(f"LightGBM Best Parameters: {best_params}\n")
    f.write(f"LightGBM MAE: {lgb_mae:.4f}\n")
    f.write(f"XGBoost MAE: {xgb_mae:.4f}\n")
    f.write(f"CatBoost MAE: {cb_mae:.4f}\n")
    f.write(f"Random Forest MAE: {rf_mae:.4f}\n")
    f.write(f"Ensemble MAE: {ensemble_mae:.4f}\n")
    f.write(f"Best Model MAE: {best_mae:.4f}\n")
    f.write(f"Total execution time: {int(minutes)} minutes and {seconds:.2f} seconds\n")

print("\n✅ Submission file saved as 'submission.csv'")
print(f"✅ Model achieved validation MAE of {best_mae:.4f}")
print(f"✅ Model performance metrics saved to 'model_performance.txt'")
print(f"⏱️ Total execution time: {int(minutes)} minutes and {seconds:.2f} seconds")

# Print feature importance for the best model if it's a tree-based model
if best_model_name in ["LightGBM", "XGBoost", "CatBoost", "Random Forest"]:
    importance = None
    if best_model_name == "LightGBM":
        importance = best_model.feature_importances_
    elif best_model_name == "XGBoost":
        importance = best_model.feature_importances_
    elif best_model_name == "CatBoost":
        importance = best_model.feature_importances_
    else:  # Random Forest
        importance = best_model.feature_importances_

    if importance is not None:
        print("\nTop 10 Feature Importance:")
        feature_importance = sorted(zip(selected_features, importance), key=lambda x: x[1], reverse=True)
        for feature, importance in feature_importance[:10]:
            print(f"{feature}: {importance:.4f}")

# Download the results
files.download(SUBMISSION_PATH)
files.download(PERFORMANCE_PATH)

Loading data...
Preprocessing data...
Running feature selection...


[I 2025-02-25 04:46:08,859] A new study created in memory with name: no-name-2f078db5-ca01-44ca-9338-d115bdcbb285


Selected 37 features out of 71
Optimizing LightGBM model...


[I 2025-02-25 04:46:14,393] Trial 0 finished with value: 3.468801393229427 and parameters: {'num_leaves': 145, 'max_depth': 6, 'learning_rate': 0.011479376336363895, 'n_estimators': 687, 'subsample': 0.6120777962188625, 'colsample_bytree': 0.6519284241138991, 'reg_lambda': 4.412414704580997, 'reg_alpha': 0.6740782634109747}. Best is trial 0 with value: 3.468801393229427.
[I 2025-02-25 04:46:15,764] Trial 1 finished with value: 5.28440067380212 and parameters: {'num_leaves': 66, 'max_depth': 8, 'learning_rate': 0.025714404267908407, 'n_estimators': 101, 'subsample': 0.660157237637398, 'colsample_bytree': 0.7156375745166882, 'reg_lambda': 0.1255045058976736, 'reg_alpha': 1.947606679789437}. Best is trial 0 with value: 3.468801393229427.
[I 2025-02-25 04:46:19,261] Trial 2 finished with value: 3.817155339465001 and parameters: {'num_leaves': 123, 'max_depth': 7, 'learning_rate': 0.034124862190485744, 'n_estimators': 849, 'subsample': 0.6182021443858142, 'colsample_bytree': 0.6277603871366


Best Parameters: {'num_leaves': 91, 'max_depth': 3, 'learning_rate': 0.07255572138573113, 'n_estimators': 329, 'subsample': 0.7209294246375855, 'colsample_bytree': 0.9813313724209274, 'reg_lambda': 3.311208917905888, 'reg_alpha': 0.13612481590450956}
Best Trial MAE: 2.3362
Training ensemble model...
Training LightGBM model...
Training XGBoost model...
Training CatBoost model...
Training Random Forest model...
Training ensemble model...

LightGBM MAE: 2.1204
XGBoost MAE: 1.9470
CatBoost MAE: 2.2915
Random Forest MAE: 1.2039
Ensemble MAE: 1.6335

Best Model: Random Forest with MAE: 1.2039

Generating final predictions...

✅ Submission file saved as 'submission.csv'
✅ Model achieved validation MAE of 1.2039
✅ Model performance metrics saved to 'model_performance.txt'
⏱️ Total execution time: 1 minutes and 3.57 seconds

Top 10 Feature Importance:
Heart_Risk_Index_target_mean: 0.9842
Iron_Concentration_target_mean: 0.0061
Energy_Content_times_Cholesterol_Level: 0.0010
Energy_Content: 0.000

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>