In [None]:
import pandas as pd
import numpy as np
from scipy import stats 
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import time
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Single place to point the notebook at the competition data folder;
# change DATA_DIR instead of editing every path below.
DATA_DIR = 'C:/Users/shiva/Desktop/Kaggle'
train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first ten rows of the test data.
test_df.head(10)

In [None]:
# Column dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test data.
test_df.info()

In [None]:
# Summary statistics of the training columns.
train_df.describe()

In [None]:
# Summary statistics of the test columns.
test_df.describe()

In [None]:
# Candidate predictor columns: everything except the target and the 'id' column.
df_all_feats = [col for col in train_df.columns if col not in ('Calories', 'id')]
df_all_feats

In [None]:
# Numeric predictors: every candidate feature except the categorical 'Sex' column.
# Filtering by name (rather than slicing off the first element) keeps this
# correct even if the column order of the CSV changes.
num_feats = [col for col in df_all_feats if col != 'Sex']

In [None]:
# Distribution of the raw target: density histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(train_df['Calories'], bins=50, kde=True, stat="density", linewidth=0, color='g', ax=ax)
fig.savefig("density_histplot.png")
plt.show()

In [None]:
# Number of training rows per value of 'Sex'.
gender_dist = train_df['Sex'].value_counts()
gender_dist

In [None]:
# Share of each 'Sex' value in the training set.
fig, ax = plt.subplots()
ax.pie(gender_dist, labels=gender_dist.index, colors=["#65a479", "#a564c9"], autopct="%.2f%%")
fig.savefig("pieplot.png")
plt.show()

In [None]:
# Share of each 'Sex' value in the test set.
# Uses its own variable and output file so the training-set counts in
# `gender_dist` and the training chart in "pieplot.png" are not overwritten.
gender_dist_test = test_df['Sex'].value_counts()
plt.pie(gender_dist_test, labels=gender_dist_test.index, colors=["#d5695d", "#5d8ca8"], autopct="%.2f%%")
plt.savefig("pieplot_test.png")
plt.show()

In [None]:
# Box plot of every numeric feature in the training data (one panel per feature).
plt.figure(figsize=(8, 12))
palette = plt.get_cmap('Set2').colors
n_rows = (len(num_feats) + 1) // 2  # two panels per row
for i, col in enumerate(num_feats, 1):
    plt.subplot(n_rows, 2, i)
    sns.boxplot(train_df[col], color=palette[i % len(palette)])
    plt.title(f'Boxplot of {col}')
# Apply the layout before saving so the PNG matches what is displayed.
plt.tight_layout()
plt.savefig("boxplot.png")
plt.show()

In [None]:
# For each numeric feature: density histogram + KDE with a fitted normal curve
# (left panel) and a normal probability (Q-Q) plot (right panel).
fig, axes = plt.subplots(len(num_feats), 2, figsize=(10, 10), squeeze=False)
for (hist_ax, qq_ax), col in zip(axes, num_feats):
    values = train_df[col]

    # sns.distplot is deprecated; histplot plus an explicit normal fit draws
    # the same kind of picture (histogram, KDE and fitted normal density).
    sns.histplot(values, kde=True, stat="density", ax=hist_ax)
    mu, sigma = stats.norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    hist_ax.plot(grid, stats.norm.pdf(grid, mu, sigma), color='k')

    stats.probplot(values, plot=qq_ax)

plt.tight_layout()
plt.savefig("distplot.png")
plt.show()

In [None]:
# Overlay the train and test KDE of every numeric feature to compare their distributions.
plt.figure(figsize=(8, 12))
palette1 = plt.get_cmap('Set2').colors
palette2 = plt.get_cmap('Set3').colors
for i, col in enumerate(num_feats, 1):
    plt.subplot(3, 2, i)
    # Train draws from palette1 and test from palette2 so the two curves get
    # different colours (indexing one shared palette made them identical).
    sns.kdeplot(train_df[col], color=palette1[i % len(palette1)], fill=True, label='Train')
    sns.kdeplot(test_df[col], color=palette2[i % len(palette2)], fill=True, label='Test')
    plt.title(f'kdeplot of {col}')
    plt.xlabel(col)
    plt.ylabel('Density')
    plt.legend()

# Apply the layout before saving so the PNG matches what is displayed.
plt.tight_layout()
plt.savefig("kdeplot.png")
plt.show()

In [None]:
# Each numeric feature against the target: translucent scatter plus a red regression line.
palette = plt.get_cmap('Set3').colors
plt.figure(figsize=(8, 12))
for panel, feature in enumerate(num_feats, start=1):
    plt.subplot(3, 2, panel)
    panel_color = palette[panel % len(palette)]
    sns.scatterplot(x=train_df[feature], y=train_df['Calories'], alpha=0.2, color=panel_color)
    sns.regplot(x=train_df[feature], y=train_df["Calories"], scatter=False, color="red")
    plt.title(f'{feature} vs Calories')
plt.tight_layout()
plt.savefig("scatterplot.png")
plt.show()

In [None]:
# Pairwise scatter plots (lower triangle only) of the numeric features.
pair_grid = sns.pairplot(train_df[num_feats], corner=True, plot_kws={'alpha': 0.5})
# plt.title would only label a single axes of the grid; a figure-level
# title labels the whole plot. y > 1 keeps it clear of the top panel.
pair_grid.figure.suptitle('Pairwise Scatter Plots', y=1.02)
pair_grid.savefig("pairplot.png")
plt.show()

In [None]:
# Correlation heatmap of the numeric features together with the target.
df_feats = num_feats + ['Calories']
N = len(df_feats)
# rowvar=False makes np.corrcoef treat every column as one variable, giving an
# (N, N) matrix. (The previous np.zeros pre-allocation was overwritten right
# away and has been removed.)
feats_corr_mat = np.corrcoef(train_df[df_feats], rowvar=False)

plt.figure(figsize=(10, 10))
sns.heatmap(feats_corr_mat, annot=True, fmt='.2f', xticklabels=df_feats, yticklabels=df_feats)

plt.tight_layout()
plt.savefig("heatmap.png")
plt.show()

In [None]:
def impute_outliers(df):
    """Clip values lying more than 1.5 * IQR outside the quartiles.

    Parameters
    ----------
    df : pandas.Series or numpy.ndarray
        One-dimensional numeric data (despite the name, a single column
        rather than a whole DataFrame).

    Returns
    -------
    pandas.Series or numpy.ndarray
        A new object of the same kind as the input in which every value below
        ``q1 - 1.5 * IQR`` or above ``q3 + 1.5 * IQR`` is replaced by that
        bound. A Series keeps its index; the input itself is not modified.
    """
    q1 = np.percentile(df, 25)
    q3 = np.percentile(df, 75)
    IQR = q3 - q1
    lower, upper = q1 - 1.5 * IQR, q3 + 1.5 * IQR

    # Vectorised clipping works for any index (the former element-wise loop
    # looked values up by label 0..n-1) and avoids a Python-level pass over
    # every row.
    if isinstance(df, pd.Series):
        return df.clip(lower=lower, upper=upper)
    return pd.Series(np.asarray(df)).clip(lower=lower, upper=upper).to_numpy()

In [None]:
# Clip outliers in the listed training columns, one column at a time.
# Only train_df is processed here; test_df is left as loaded.
have_outliers = ['Height', 'Weight', 'Heart_Rate', 'Body_Temp']
for feat in have_outliers:
    train_df[feat] = impute_outliers(train_df[feat])

In [None]:
# Box plots of the numeric training features after outlier clipping.
plt.figure(figsize=(8, 12))
palette = plt.get_cmap('Set2').colors
n_rows = (len(num_feats) + 1) // 2  # two panels per row
for i, col in enumerate(num_feats, 1):
    plt.subplot(n_rows, 2, i)
    sns.boxplot(train_df[col], color=palette[i % len(palette)])
    plt.title(f'Boxplot of {col}')
# Apply the layout before saving so the PNG matches what is displayed.
plt.tight_layout()
plt.savefig("boxplot_after.png")
plt.show()

In [None]:
# Encode 'Sex' as a binary indicator: male -> 1, female -> 0.
label2code = {
    'male': 1,
    'female': 0,
}

# Series.map turns every label missing from label2code into NaN, which also
# happens when this cell is run a second time on already-encoded data.
# Validate before assigning so the frames are never left half-encoded.
for frame_name, frame in (('train_df', train_df), ('test_df', test_df)):
    encoded_sex = frame['Sex'].map(label2code)
    if encoded_sex.isna().any():
        raise ValueError(
            f"{frame_name}['Sex'] contains values not covered by label2code "
            "(was this cell already run?)"
        )
    frame['Sex'] = encoded_sex

In [None]:
# Complementary indicator: 1 where Sex is 0 and 0 where Sex is 1.
train_df['Sex_Reversed'] = 1 - train_df['Sex']
test_df['Sex_Reversed'] = 1 - test_df['Sex']

In [None]:
# Check the encoded 'Sex' and the new 'Sex_Reversed' columns.
train_df.head()

### Feature Generation

In [None]:
# Manually add some feature columns
def feats_generate(df):
    """Add hand-crafted body, energy and ratio features to ``df``.

    The frame is modified in place and the same object is returned. All
    computations are column-wise, so the frame may carry any index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Sex' (rows equal to 1 use the first BEE formula, all
        other rows the second), 'Age', 'Height', 'Weight', 'Duration',
        'Heart_Rate' and 'Body_Temp'.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself with the new columns added: BMI, BEE, BFR, MHR, TDEE,
        the pairwise ratio columns, three squared terms and 'Age_Range'.
    """
    # The 10000 factor presumably converts Height from cm to m -- confirm units.
    df['BMI'] = df['Weight'] / (df['Height']**2) * 10000

    # BEE: one linear formula per sex, selected row-wise without a Python loop.
    bee_sex_1 = 66.4730 + 13.7516 * df['Weight'] + 5.0033 * df['Height'] - 6.7750 * df['Age']
    bee_other = 655.0955 + 9.5634 * df['Weight'] + 1.8496 * df['Height'] - 4.6756 * df['Age']
    df['BEE'] = np.where(df['Sex'] == 1, bee_sex_1, bee_other)

    df['BFR'] = 1.2 * df['BMI'] + 0.23 * df['Age'] - 5.4 - 10.8 * df['Sex']
    df['MHR'] = 206.3 - 0.711 * df['Age']

    # Since some data cannot be calculated, I simplify the formula for TDEE:
    # BEE is scaled by a factor chosen from the heart-rate intensity band.
    # NOTE(review): intensities below 0.5 (and NaN) fall through to the 1.9
    # default, exactly as in the former if/elif chain -- confirm this is intended.
    intensity = df['Heart_Rate'] / df['MHR']
    activity_factor = np.select(
        [
            (intensity >= 0.5) & (intensity < 0.6),
            (intensity >= 0.6) & (intensity < 0.7),
            (intensity >= 0.7) & (intensity < 0.8),
            (intensity >= 0.8) & (intensity < 0.9),
        ],
        [1.2, 1.375, 1.55, 1.725],
        default=1.9,
    )
    df['TDEE'] = df['BEE'] * activity_factor

    df['Heart_Rate_to_Weight'] = df['Heart_Rate'] / df['Weight']
    df['Heart_Rate_to_Height'] = df['Heart_Rate'] / df['Height']
    df['Heart_Rate_to_Age'] = df['Heart_Rate'] / df['Age']

    df['Body_Temp_to_Weight'] = df['Body_Temp'] / df['Weight']
    df['Body_Temp_to_Height'] = df['Body_Temp'] / df['Height']
    df['Body_Temp_to_Age'] = df['Body_Temp'] / df['Age']

    df['Duration_to_Weight'] = df['Duration'] / df['Weight']
    df['Duration_to_Height'] = df['Duration'] / df['Height']
    df['Duration_to_Age'] = df['Duration'] / df['Age']

    df['Heart_Rate_to_Duration'] = df['Heart_Rate'] / df['Duration']
    df['Temp_to_Duration'] = df['Body_Temp'] / df['Duration']
    df['Heart_Rate_to_Temp'] = df['Heart_Rate'] / df['Body_Temp']

    # NOTE(review): an earlier note here said these squared terms increased the
    # error and would not be generated, yet the code still creates them. They
    # are kept so the feature set stays unchanged -- remove them if the note
    # reflects the intended behaviour.
    df['Duration^2'] = df['Duration']**2
    df['Body_Temp^2'] = df['Body_Temp']**2
    df['Heart_Rate^2'] = df['Heart_Rate']**2

    # Get age bins: (0, 18], (18, 30], (30, 45], (45, 60], (60, 100] -> labels 1..5
    bins = [0, 18, 30, 45, 60, 100]
    labels = [1, 2, 3, 4, 5]
    df['Age_Range'] = pd.cut(df['Age'], bins=bins, labels=labels)

    return df

In [None]:
# Add the hand-crafted features to the training data and inspect the result.
train_df = feats_generate(train_df)
train_df.head()

In [None]:
# Add the same hand-crafted features to the test data.
test_df = feats_generate(test_df)

In [None]:
def judge_unique_val(df):
    """Add per-value Heart_Rate / Body_Temp columns for 'Duration' and 'Age'.

    For every distinct value ``v`` of 'Duration' (and then of 'Age') two
    columns are created, ``Heart_Rate_<col>_<int(v)>`` and
    ``Body_Temp_<col>_<int(v)>``. Each holds the row's Heart_Rate / Body_Temp
    where the key column equals ``v`` and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Duration', 'Age', 'Heart_Rate' and 'Body_Temp'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra columns; ``df`` is not modified.
        The set of columns depends on the values present in ``df``, so two
        frames with different values get different columns.
    """
    new_df = df.copy()

    derived = {}
    for key_col in ('Duration', 'Age'):
        for value in new_df[key_col].unique():
            if pd.isna(value):
                # A missing value cannot be converted by int() and never
                # compares equal to anything, so it defines no column.
                continue
            is_value = new_df[key_col] == value
            suffix = f'{key_col}_{int(value)}'
            derived[f'Heart_Rate_{suffix}'] = np.where(is_value, new_df['Heart_Rate'], 0)
            derived[f'Body_Temp_{suffix}'] = np.where(is_value, new_df['Body_Temp'], 0)

    # Columns that already exist (e.g. the function is applied twice) are
    # overwritten in place, as plain column assignment would do.
    for name in [name for name in derived if name in new_df.columns]:
        new_df[name] = derived.pop(name)

    # Attach all new columns in a single concat: inserting hundreds of columns
    # one at a time fragments the frame's internal storage.
    if derived:
        new_df = pd.concat([new_df, pd.DataFrame(derived, index=new_df.index)], axis=1)
    return new_df

In [None]:
# Add the per-Duration and per-Age columns to both frames.
# NOTE(review): each frame derives its columns from its own unique values, so
# train and test may end up with different column sets and orders.
train_df = judge_unique_val(train_df)
test_df = judge_unique_val(test_df) 

In [None]:
# Inspect the first three rows with the new per-value columns.
train_df.head(3)

In [None]:
def add_feature_cross_terms(df, list1, list2):
    """Return a copy of ``df`` extended with pairwise product columns.

    For every name ``a`` in ``list1`` and ``b`` in ``list2`` (in that nesting
    order) a column ``"<a>_x_<b>"`` holding ``df[a] * df[b]`` is added.
    The input frame is left untouched.
    """
    result = df.copy()
    name_pairs = [(left, right) for left in list1 for right in list2]
    for left, right in name_pairs:
        result[f"{left}_x_{right}"] = result[left] * result[right]
    return result

In [None]:
# Multiply each effort signal by both sex indicators: the '<feature>_x_Sex'
# column is non-zero only where Sex == 1, the '<feature>_x_Sex_Reversed'
# column only where Sex_Reversed == 1.
list1 = ['Duration', 'Heart_Rate', 'Body_Temp']
list2 = ['Sex', 'Sex_Reversed']
train_df = add_feature_cross_terms(train_df, list1, list2)
test_df = add_feature_cross_terms(test_df, list1, list2)

In [None]:
# 'Sex_Reversed' was only needed to build the cross terms; drop the helper column.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises because the column is already gone.
train_df = train_df.drop(columns=['Sex_Reversed'], errors='ignore')
test_df = test_df.drop(columns=['Sex_Reversed'], errors='ignore')

In [None]:
def add_categorical_aggregations(df, categorical_cols=None, numerical_cols=None):
    """Attach per-group min/max of numeric columns as new features.

    The statistics are computed from ``df`` itself and merged back onto every
    row, so calling this separately on two frames yields frame-specific values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    categorical_cols : list of str, optional
        Grouping columns. Defaults to ``['Sex']``.
    numerical_cols : list of str, optional
        Columns to aggregate. Defaults to
        ``['Height', 'Weight', 'Heart_Rate', 'Body_Temp']``.

    Returns
    -------
    pandas.DataFrame
        Result of left-merging the aggregates onto ``df``. Added columns are
        named ``<cat>_<num>_<agg>`` for single grouping columns,
        ``<cat1>_<cat2>_<num>_<agg>`` for pairs (two or more grouping columns)
        and ``all_cat_<num>_<agg>`` for the full combination (three or more).
    """
    if categorical_cols is None:
        categorical_cols = ['Sex']
    if numerical_cols is None:
        numerical_cols = ['Height', 'Weight', 'Heart_Rate', 'Body_Temp']
    categorical_cols = list(categorical_cols)
    agg_spec = {num_col: ['min', 'max'] for num_col in numerical_cols}

    def _merge_group_stats(frame, keys, prefix):
        # Aggregate per group, flatten the (column, agg) labels, merge back by key.
        group_stats = frame.groupby(keys).agg(agg_spec)
        group_stats.columns = [f"{prefix}_{num_col}_{agg}" for num_col, agg in group_stats.columns]
        return frame.merge(group_stats, on=keys, how='left')

    n_cat = len(categorical_cols)

    # Order 1: each grouping column on its own.
    if n_cat >= 1:
        for cat_col in categorical_cols:
            df = _merge_group_stats(df, cat_col, cat_col)

    # Order 2: every unordered pair of grouping columns.
    if n_cat >= 2:
        for j in range(n_cat):
            for k in range(j + 1, n_cat):
                cat_col1 = categorical_cols[j]
                cat_col2 = categorical_cols[k]
                df = _merge_group_stats(df, [cat_col1, cat_col2], f"{cat_col1}_{cat_col2}")

    # Order 3: all grouping columns together.
    if n_cat >= 3:
        df = _merge_group_stats(df, categorical_cols, "all_cat")

    return df

In [None]:
# Add the per-Sex min/max aggregates to both frames.
# NOTE(review): each call aggregates over the frame it receives, so the train
# and test values of these columns come from different data -- confirm intended.
train_df = add_categorical_aggregations(train_df)
test_df = add_categorical_aggregations(test_df)

In [None]:
# Inspect the training frame with the aggregate columns.
train_df.head()

In [None]:
# Model inputs: every column built so far except the target and 'id'.
# 'id' is presumably just a row identifier; it is likewise excluded from
# df_all_feats at the top of the notebook, so it is left out here as well.
# NOTE(review): this list is fixed at this point, so columns created by later
# cells are not part of it -- confirm that is intended.
df_feats = [col for col in train_df.columns if col not in ('Calories', 'id')]

In [None]:
def add_log_interactions(df, numerical_features):
    """Return a copy of ``df`` with log1p product/ratio columns per feature pair.

    For every pair ``(a, b)`` with ``a`` listed before ``b`` in
    ``numerical_features`` two columns are added:
    ``"<a>_m_<b>" = log1p(a * b)`` and ``"<a>_d_<b>" = log1p(a / (b + 1e-5))``.
    The small constant guards the division against a zero denominator.
    """
    out = df.copy()
    for pos, first in enumerate(numerical_features):
        for second in numerical_features[pos + 1:]:
            product = out[first] * out[second]
            out[f"{first}_m_{second}"] = np.log1p(product)
            ratio = out[first] / (out[second] + 1e-5)
            out[f"{first}_d_{second}"] = np.log1p(ratio)
    return out

In [None]:
# Add log1p product and ratio columns for every pair of the base numeric features.
# NOTE(review): df_feats was built in an earlier cell, so these new columns are
# not in that list -- confirm whether the models are meant to use them.
numerical_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
train_df = add_log_interactions(train_df, numerical_features)
test_df = add_log_interactions(test_df, numerical_features)

In [None]:
# Make train and test share the same feature columns in the same order
# (test_df's order), with 'Calories' kept as the last training column.
columns_match = train_df.columns.equals(test_df.columns.append(pd.Index(['Calories'])))
if not columns_match:
    train_without_calories = train_df.drop(columns=['Calories'])
    # Keep only the columns present in both frames.
    common_columns = [col for col in test_df.columns if col in train_without_calories.columns]
    train_without_calories = train_without_calories[common_columns]
    test_df = test_df[common_columns]
    train_df = pd.concat([train_without_calories, train_df['Calories']], axis=1)

train_without_calories = train_df.drop(columns=['Calories'])
columns_match_after_drop = train_without_calories.columns.equals(test_df.columns)
# This flag used to be computed and then ignored; enforce it so a mismatch
# stops here instead of surfacing later during training.
assert columns_match_after_drop, "train/test feature columns still differ after alignment"

# Cast the categorical features to pandas 'category' (via int32 codes).
df_cat_feats = ['Sex', 'Age_Range']
for col in df_cat_feats:
    train_df[col] = train_df[col].astype('int32').astype('category')
    test_df[col] = test_df[col].astype('int32').astype('category')

In [None]:
def target_transformation(df):
    """Return a copy of ``df`` whose 'Calories' column holds log1p(Calories).

    The input frame is not modified.
    """
    log_target = np.log1p(df['Calories'])
    transformed = df.copy()
    transformed['Calories'] = log_target
    return transformed

In [None]:
# Training frame with the target on the log1p scale; train_df keeps the raw target.
trans_train_df = target_transformation(train_df)

In [None]:
# Inspect the frame with the transformed target.
trans_train_df.head()

In [None]:
# Distribution of the log1p-transformed target: density histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(trans_train_df['Calories'], bins=50, kde=True, stat="density", linewidth=0, color='g', ax=ax)
fig.savefig("density_histplot_after.png")
plt.show()

In [None]:
# Set up stratified (not hierarchical) cross-validation: 'Duration' is cut into
# 10 quantile bins that serve as strata for a 5-fold StratifiedKFold splitter.
# NOTE(review): `skf` and `duration_bins` appear unused by the CV helpers in
# this file, which split with plain KFold -- confirm which splitter is intended.
bins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
duration_bins = bins.fit_transform(train_df[['Duration']]).astype(int).flatten()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Hyper-parameters handed to xgb.train() through xgb_cv().
# NOTE(review): 'n_estimators', 'early_stopping_rounds' and 'enable_categorical'
# look like scikit-learn-wrapper options. xgb_cv passes num_boost_round and
# early_stopping_rounds to xgb.train itself and enable_categorical to DMatrix,
# so confirm whether these keys have any effect here.
params = {
    'max_depth': 9,
    'colsample_bytree': 0.65,
    'subsample': 0.88,
    'n_estimators': 3500,
    'learning_rate': 0.009,
    'gamma': 0.012,
    'max_delta_step': 2,
    'eval_metric': 'rmse',
    'enable_categorical': True,
    'random_state': 42,
    'early_stopping_rounds': 200,
    'tree_method': 'gpu_hist'  # replaced by 'auto' in a later cell before training
}

In [None]:
def xgb_cv(params, train_df, test_df, feat_cols, cat_feat_cols, target_col, stratified=False, nfold=5, num_boost_round=10000):
    """Cross-validated training with the native XGBoost API.

    Parameters
    ----------
    params : dict
        Passed unchanged to ``xgb.train``.
    train_df, test_df : pandas.DataFrame
        Training frame (including ``target_col``) and test frame.
    feat_cols : list of str
        Feature columns, present in both frames.
    cat_feat_cols : list of str
        Accepted for signature parity with ``cb_cv``; not used here
        (categoricals are taken from the column dtypes via
        ``enable_categorical=True``).
    target_col : str
        Name of the target column in ``train_df``.
    stratified : bool
        Currently ignored; folds always come from a shuffled ``KFold``.
    nfold : int
        Number of folds.
    num_boost_round : int
        Upper bound on boosting rounds; training stops after 200 rounds
        without improvement on the validation fold.

    Returns
    -------
    cv : float
        RMSE between the target and the out-of-fold predictions.
    oof : numpy.ndarray
        Out-of-fold prediction for every training row.
    pred : numpy.ndarray
        Test prediction averaged over the ``nfold`` fold models.
    """
    # I kept the original KFold code so that you could make comparisons and verifications
    folds = KFold(n_splits=nfold, shuffle=True, random_state=42)

    target = train_df[target_col]
    # Column positions let rows and columns both be selected positionally;
    # get_loc raises KeyError for a feature missing from train_df.
    feat_pos = [train_df.columns.get_loc(col) for col in feat_cols]

    oof = np.zeros(train_df.shape[0], dtype=np.float64)
    pred = np.zeros(test_df.shape[0], dtype=np.float64)

    # The test matrix is the same for every fold, so build it only once.
    tst_data = xgb.DMatrix(test_df[feat_cols], enable_categorical=True)

    for i, (trn_idx, val_idx) in enumerate(folds.split(train_df.index)):
        print(f'fold={i}', '- ' * 20)
        # KFold yields positional indices, hence .iloc (label-based .loc is
        # only equivalent when the index happens to be 0..n-1).
        trn_data = xgb.DMatrix(
            train_df.iloc[trn_idx, feat_pos],
            label=target.iloc[trn_idx],
            enable_categorical=True
        )
        val_data = xgb.DMatrix(
            train_df.iloc[val_idx, feat_pos],
            label=target.iloc[val_idx],
            enable_categorical=True
        )

        model = xgb.train(
            params,
            trn_data,
            num_boost_round,
            evals=[(trn_data, 'train'), (val_data, 'valid')],
            verbose_eval=1000,
            early_stopping_rounds=200
        )

        # Predict with the trees up to and including the best iteration.
        best_range = (0, model.best_iteration + 1)
        oof[val_idx] = model.predict(val_data, iteration_range=best_range)
        pred += model.predict(tst_data, iteration_range=best_range) / nfold

    cv = mean_squared_error(target, oof)**0.5
    return cv, oof, pred

In [None]:
# Replace the 'gpu_hist' tree method from the params cell with 'auto'.
# NOTE(review): 'gpu_id' = -1 presumably means "do not use a GPU" -- confirm
# that the installed XGBoost version still accepts this key.
params['tree_method'] = 'auto'  
params['gpu_id'] = -1          


In [None]:
# Run the XGBoost cross-validation on the log1p-transformed target.
# NOTE: this rebinds the name `xgb_cv` from the function to the returned RMSE,
# so the cell cannot be re-run without first re-running the cell that defines
# the function.
xgb_cv, xgb_oof, xgb_pred = xgb_cv(params, trans_train_df, test_df, df_feats, df_cat_feats, 'Calories')

In [None]:
# Out-of-fold RMSE of the XGBoost run (on the log1p scale of the target).
print(xgb_cv) 

In [None]:
# CatBoost hyper-parameters handed to cb.train() through cb_cv().
# NOTE(review): this rebinds `params`, replacing the XGBoost settings; re-running
# the XGBoost cells afterwards would pick up these values instead.
# NOTE(review): 'task_type': 'GPU' presumably requires a usable GPU, while the
# XGBoost settings were moved away from 'gpu_hist' -- confirm the environment.
params = {
    'iterations': 3000,
    'learning_rate': 0.02,
    'depth': 12,
    'loss_function': 'RMSE',  # was listed twice; a dict keeps only one entry per key
    'l2_leaf_reg': 3,
    'random_seed': 42,
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 200,
    'verbose': 1000,
    'task_type': 'GPU'
}

In [None]:
def cb_cv(params, train_df, test_df, feat_cols, cat_feat_cols, target_col, stratified=False, nfold=5, num_boost_round=10000):
    """Cross-validated training with CatBoost.

    Parameters
    ----------
    params : dict
        Passed unchanged to ``cb.train``.
    train_df, test_df : pandas.DataFrame
        Training frame (including ``target_col``) and test frame.
    feat_cols : list of str
        Feature columns, present in both frames.
    cat_feat_cols : list of str
        Columns declared to CatBoost as categorical features.
    target_col : str
        Name of the target column in ``train_df``.
    stratified : bool
        Currently ignored; folds always come from a shuffled ``KFold``.
    nfold : int
        Number of folds.
    num_boost_round : int
        Currently ignored; this function does not pass it to ``cb.train``.

    Returns
    -------
    cv : float
        RMSE between the target and the out-of-fold predictions.
    oof : numpy.ndarray
        Out-of-fold prediction for every training row.
    pred : numpy.ndarray
        Test prediction averaged over the ``nfold`` fold models.
    """
    # if use KFold, the code changes are the same as those in XGBoost
    folds = KFold(n_splits=nfold, shuffle=True, random_state=42)

    oof = np.zeros(train_df.shape[0])
    pred = np.zeros(test_df.shape[0])

    target = train_df[target_col]
    # Column positions let rows and columns both be selected positionally;
    # get_loc raises KeyError for a feature missing from train_df.
    feat_pos = [train_df.columns.get_loc(col) for col in feat_cols]

    # The test pool is the same for every fold, so build it only once.
    tst_data = cb.Pool(test_df[feat_cols], cat_features=cat_feat_cols)

    # To stratify instead, iterate over skf.split(train_df, duration_bins).
    for i, (trn_idx, val_idx) in enumerate(folds.split(train_df.index)):
        print(f'fold={i}', '- ' * 20)
        # Training and validation data converted to Pool format.
        # Features and labels are both selected positionally with .iloc;
        # mixing label-based .loc for features with .iloc for labels pairs
        # the wrong rows whenever the index is not 0..n-1.
        trn_data = cb.Pool(
            train_df.iloc[trn_idx, feat_pos],
            label=target.iloc[trn_idx],
            feature_names=feat_cols,
            cat_features=cat_feat_cols
        )
        val_data = cb.Pool(
            train_df.iloc[val_idx, feat_pos],
            label=target.iloc[val_idx],
            feature_names=feat_cols,
            cat_features=cat_feat_cols
        )

        model = cb.train(
            trn_data,
            params=params,
            eval_set=val_data
        )
        oof[val_idx] = model.predict(val_data)
        pred += model.predict(tst_data) / nfold

    cv = mean_squared_error(target, oof)**0.5
    return cv, oof, pred


In [None]:
# Run the CatBoost cross-validation on the log1p-transformed target.
# NOTE: this rebinds the name `cb_cv` from the function to the returned RMSE,
# so the cell cannot be re-run without first re-running the cell that defines
# the function.
cb_cv, cb_oof, cb_pred = cb_cv(params, trans_train_df, test_df, df_feats, df_cat_feats, 'Calories')

In [None]:
# Out-of-fold RMSE of the CatBoost run (on the log1p scale of the target).
print(cb_cv) 

In [None]:
# weighted average
# Each model is weighted by (1 - its CV RMSE), normalised so the weights sum to 1.
total_cv = (1 - xgb_cv) + (1 - cb_cv)
weight_xgb, weight_cb = ((1 - score) / total_cv for score in (xgb_cv, cb_cv))
xgb_cb = weight_xgb * xgb_pred + weight_cb * cb_pred
# inverse transform to original space (undo the log1p applied to the target)
inv_pred = np.expm1(xgb_cb)

In [None]:
# Fill the sample-submission template with the blended predictions and save it.
inv_pred_df = pd.read_csv('C:/Users/shiva/Desktop/Kaggle/sample_submission.csv').assign(Calories=inv_pred)
inv_pred_df.to_csv('submission.csv', header=True, index=False)