In [None]:
import pandas as pd

# Kaggle input locations for the Playground Series S4E12 competition files.
train_file_path, test_file_path, submission_file_path = (
    "/kaggle/input/playground-series-s4e12/train.csv",
    "/kaggle/input/playground-series-s4e12/test.csv",
    "/kaggle/input/playground-series-s4e12/sample_submission.csv",
)

# Read the three CSV files, in the same order as the paths above.
df_train, df_test, df_submission = map(
    pd.read_csv, (train_file_path, test_file_path, submission_file_path)
)

In [None]:
target = "premium_amount"

# Lower-case the headers and replace spaces with underscores on both frames,
# so train and test share one naming scheme (and `target` matches its column).
for frame in (df_train, df_test):
    frame.columns = [name.lower().replace(' ', '_') for name in frame.columns]

# Drop the `id` column from both frames.
df_train, df_test = (frame.drop(columns=['id']) for frame in (df_train, df_test))

# Split the remaining predictors by dtype; the target is excluded from both lists.
object_columns = df_train.select_dtypes(include=['object']).columns
numeric_columns = df_train.select_dtypes(include=['int', 'float']).columns
cat_cols = [col for col in object_columns if col != target]
num_cols = [col for col in numeric_columns if col != target]

cat_cols, num_cols

In [None]:
# For every categorical column: its dtype and the number of missing values.
for col in cat_cols:
    column = df_train[col]
    print(col, column.dtype, column.isna().sum())

In [None]:
from sklearn.impute import SimpleImputer

# Categorical gaps are filled with the most frequent value, numeric gaps with the mean.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

# Work on a copy so the raw training frame stays available.
df_train_imputed = df_train.copy()
# `fit_transform` takes (X, y=None); the column-name list that used to be passed
# as the second argument landed in the ignored `y` slot, so it is dropped here.
df_train_imputed[cat_cols] = cat_imputer.fit_transform(df_train[cat_cols])
df_train_imputed[num_cols] = num_imputer.fit_transform(df_train[num_cols])
# Remaining NaN count per column. The target is in neither column list, so it is not imputed.
print(df_train_imputed.isna().sum())

In [None]:
# The test-set imputation that used to open this cell was repeated verbatim by the
# following cell (which rebuilds `test_cat_cols`, `test_num_cols` and
# `df_test_imputed` from scratch), so only the training-frame step is kept here.

# Numeric columns with at most 10 distinct values (counted after imputation) are
# cast to int64; note that this truncates the imputed mean to an integer.
for col in num_cols:
    if df_train_imputed[col].nunique() <= 10:
        df_train_imputed[col] = df_train_imputed[col].astype('int64')

In [None]:

# Column groups of the test frame (used again when one-hot encoding).
test_cat_cols = df_test.select_dtypes(include=['object']).columns
test_num_cols = df_test.select_dtypes(exclude=['object']).columns

# Fill test-set gaps with the statistics learned on the TRAINING set
# (`cat_imputer` / `num_imputer`). Fitting fresh imputers on the test data would
# give train and test different fill values for the same feature.
df_test_imputed = df_test.copy()
df_test_imputed[cat_cols] = cat_imputer.transform(df_test[cat_cols])
df_test_imputed[num_cols] = num_imputer.transform(df_test[num_cols])
print(df_test_imputed.isna().sum())

# Same low-cardinality rule as for the training frame: numeric columns with at
# most 10 distinct values are cast to int64 (truncating the imputed mean).
for col in test_num_cols:
    if df_test_imputed[col].nunique() <= 10:
        df_test_imputed[col] = df_test_imputed[col].astype('int64')

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# One boxplot per numeric column of the imputed training frame, three per row.
n_num_cols = len(num_cols)
n_grid_cols = 3
# Ceiling division: `n_num_cols // 3` rows left the last one or two columns
# without an axis (and produced a zero-row grid for fewer than three columns).
n_grid_rows = max(1, -(-n_num_cols // n_grid_cols))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(12, 12), squeeze=False)
flat_axes = axes.flatten()

for col, ax in zip(num_cols, flat_axes):
    sns.boxplot(df_train_imputed[col], ax=ax)
    ax.set_title(str(col))

# Hide the unused panels of the last row.
for ax in flat_axes[n_num_cols:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Flag multivariate outliers over the numeric predictors with Local Outlier Factor
# (35 neighbours) and keep only the rows labelled 1.
# NOTE(review): per scikit-learn's documentation fit_predict returns 1 for inliers
# and -1 for outliers — confirm against the installed version.
lof = LocalOutlierFactor(n_neighbors=35)
outlier_labels = lof.fit_predict(df_train_imputed[num_cols])
# The label is stored as a helper column; it is dropped again after one-hot encoding.
df_train_imputed['outlier'] = outlier_labels

df_clean = df_train_imputed[df_train_imputed['outlier'] == 1]
# Shape after vs. before filtering.
print(df_clean.shape, df_train_imputed.shape)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One boxplot per numeric column of the outlier-filtered frame, three per row.
n_num_cols = len(num_cols)
n_grid_cols = 3
# Ceiling division: `n_num_cols // 3` rows left the last one or two columns
# without an axis (and produced a zero-row grid for fewer than three columns).
n_grid_rows = max(1, -(-n_num_cols // n_grid_cols))
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(12, 12), squeeze=False)
flat_axes = axes.flatten()

for col, ax in zip(num_cols, flat_axes):
    sns.boxplot(df_clean[col], ax=ax)
    ax.set_title(str(col))

# Hide the unused panels of the last row.
for ax in flat_axes[n_num_cols:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Number of distinct (non-null) values per numeric column of the raw training frame.
for col in num_cols:
    print(col, df_train[col].nunique())

In [None]:
# Display the imputed (and possibly int-cast) `insurance_duration` column.
df_train_imputed['insurance_duration']

In [None]:
def check_train_test_cols(train_df, test_df):
    """Return the set of column labels found in ``train_df`` but not in ``test_df``.

    The check is one-directional: columns that exist only in ``test_df`` are
    not reported.
    """
    return set(train_df.columns) - set(test_df.columns)

In [None]:
# Remove `policy_start_date` from both frames.
# NOTE: the cell reassigns `df_clean` / `df_test_imputed`, so running it a second
# time raises a KeyError because the column is already gone.
df_clean = df_clean.drop(columns=['policy_start_date'])
df_test_imputed = df_test_imputed.drop(columns=['policy_start_date'])

In [None]:
# Working copy of the filtered training frame plus the column lists that go with it.
df_train_clean = df_clean.copy()
# `policy_start_date` was dropped from the frame, so it is removed from the list too.
train_cat_cols = [col for col in cat_cols if col != 'policy_start_date']
train_num_cols = num_cols

# Print the distinct levels of every remaining categorical column.
for col in train_cat_cols:
    print(col, df_clean[col].unique())

In [None]:
# One-hot encode the categorical columns of train and test independently.
# NOTE(review): the two frames are encoded separately, so a level that appears in
# only one of them yields a column the other lacks; the check on the last line
# only reports train-only columns (the target is expected to be among them).
df_train_encoded = pd.get_dummies(data=df_train_clean, columns=train_cat_cols)
df_test_encoded = pd.get_dummies(data=df_test_imputed, columns=[col for col in test_cat_cols if col!= 'policy_start_date'])
# The LOF helper column is not a feature.
df_train_encoded = df_train_encoded.drop(columns=['outlier'])
check_train_test_cols(df_train_encoded, df_test_encoded)

In [None]:
# Pairwise correlation matrix over every column of the encoded training frame
# (target included), using the default settings of DataFrame.corr.
all_corr = df_train_encoded.corr()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Large square canvas so the annotated cells stay legible.
plt.figure(figsize=(20, 20))

# Draw the full correlation matrix (no mask is applied, so both triangles are
# shown) on a fixed [-1, 1] colour scale with the coefficient printed in each cell.
sns.heatmap(data=all_corr, vmin=-1.0, vmax=1.0, cmap='coolwarm', annot=True, annot_kws={"size": 8}, linewidths=.5)

# Set the title
plt.title('Correlation Matrix Heatmap', fontsize=20)

# Display the heatmap
plt.show()

In [None]:
def find_collinear_columns(corr_matrix, threshold):
    """List column pairs whose absolute correlation is strictly above ``threshold``.

    Only the upper triangle of ``corr_matrix`` is scanned, so every unordered
    pair is reported at most once and the diagonal is skipped.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix; cells are addressed by position.
    threshold : float
        Cut-off compared against ``abs(correlation)`` with ``>``.

    Returns
    -------
    list[tuple]
        ``(earlier_column, later_column)`` tuples in row-major scan order.
    """
    labels = corr_matrix.columns
    size = len(labels)
    return [
        (labels[row], labels[col])
        for row in range(size)
        for col in range(row + 1, size)
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]


# Column pairs (target included, since `all_corr` covers every column) whose
# absolute correlation exceeds 0.8.
collinear_columns = find_collinear_columns(all_corr, 0.8)
print("Column pairs with collinearity exceeding threshold:", collinear_columns)

In [None]:
# First member of every collinear pair; a column that takes part in several
# pairs appears once per pair.
collinear_cols_to_remove = [pair[0] for pair in collinear_columns]
collinear_cols_to_remove

In [None]:
# Correlation of every predictor (all encoded columns except the target) with the target.
predictor_cols = [col for col in df_train_encoded.columns if col!= target]
X_corr_y = df_train_encoded[predictor_cols].corrwith(df_train_encoded[target])

In [None]:
# Predictors ranked by absolute correlation with the target, strongest first.
abs(X_corr_y).sort_values(ascending=False)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Standardise every predictor, then keep as many principal components as are
# needed to explain 95% of the variance.
scaler = StandardScaler()
X = df_train_encoded.drop(columns=target)
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)

# Absolute loadings: one row per component, one column per original feature.
feature_importance = np.abs(pca.components_)

# Get top features for each component
n_features = X.shape[1]  # Number of original features (not used below)
selected_features = []

for i in range(pca.n_components_):
    # Indices of ALL features, ordered by descending |loading| on component i
    top_features_idx = np.argsort(feature_importance[i])[::-1]
    # Get the corresponding feature names
    top_features = X.columns[top_features_idx]
    # NOTE(review): this appends the complete ranking of component i, not just
    # its top feature(s). After the de-duplication below only the ordering of
    # the first component survives, so `selected_features` ends up being the
    # features with the largest |loading| on PC1 — confirm this is intended.
    selected_features.extend(top_features)

# Remove duplicates while preserving order
selected_features = list(dict.fromkeys(selected_features))
# Take only as many features as we have components
selected_features = selected_features[:X_reduced.shape[1]]

# NOTE(review): `X_reduced` holds principal-component scores; labelling its
# columns with original feature names does not make them those features.
df_train_reduced = pd.DataFrame(X_reduced, columns=selected_features, index=X.index)
df_train_reduced[target] = df_train_encoded.loc[:, target]
print(df_train_reduced.shape, df_train_encoded.shape)
df_train_reduced.head(5)

In [None]:
# Select from the encoded test frame the columns NAMED like the reduced training
# frame's columns, then standardise them.
# NOTE(review): the training frame holds PCA scores while this frame holds scaled
# original features, so the two are not in the same feature space. `fit_transform`
# also re-fits `scaler` on the test subset, replacing the statistics learned on
# the training data.
df_test_reduced = df_test_encoded[[col for col in df_train_reduced.columns if col!= target]]
df_test_reduced_scaled = pd.DataFrame(scaler.fit_transform(df_test_reduced), columns=df_test_reduced.columns, index=df_test_reduced.index)
check_train_test_cols(df_train_reduced, df_test_reduced_scaled)

In [None]:
# Persist the PCA-based train frame and the scaled test frame.
# to_csv is called with its default index handling, so the row index is written
# as an unnamed first column.
df_train_final = df_train_reduced.copy()
df_test_final = df_test_reduced_scaled.copy()
df_train_final.to_csv('df_train_preprocessed.csv')
df_test_final.to_csv('df_test_preprocessed.csv')

In [None]:
# Unscaled variant: the selected ORIGINAL feature columns (plus the target for
# train). These two files are what the modelling cells below read back.
df_train_not_scaled = df_train_encoded[selected_features + [target]]
df_test_not_scaled = df_test_encoded[selected_features]
check_train_test_cols(df_train_not_scaled, df_test_not_scaled)
# The row index is written as an unnamed first column and comes back as
# 'Unnamed: 0' when the file is read without `index_col`.
df_train_not_scaled.to_csv('df_train.csv')
df_test_not_scaled.to_csv('df_test.csv')

In [None]:
import pandas as pd

# Reload the frames written by the preprocessing section (this rebinds
# `df_train` / `df_test`, which earlier held the raw competition data).
# NOTE(review): no `index_col` is given, so the saved row index comes back as an
# 'Unnamed: 0' column; it is not dropped from `df_train` here and therefore
# takes part in the feature matrix built from it below.
df_train = pd.read_csv('/kaggle/working/df_train.csv')
df_test = pd.read_csv('/kaggle/working/df_test.csv')
df_train.shape, df_test.shape

In [None]:
def get_top_n_imp_cols(corr_matrix, n):
    """Return the labels of the ``n`` entries with the largest absolute value.

    Parameters
    ----------
    corr_matrix : pandas.Series
        Despite the name this is a one-dimensional Series (e.g. the result of
        ``X.corrwith(y)``), indexed by column label.
    n : int
        Used as a plain slice bound on the ranked labels. A negative value
        follows Python slicing rules, so ``n=-1`` returns every label EXCEPT
        the lowest-ranked one; it does not mean "all columns".

    Returns
    -------
    list
        Labels ordered by descending absolute value.
    """
    ranked = corr_matrix.abs().sort_values(ascending=False)
    return ranked.iloc[:n].index.tolist()


def prepare_train_validate(
    X, y,
    corr,
    top_n_cols=-1,
    target="premium_amount",
    norm=True,
    scale=False,
):
    """Select the top-correlated predictors, optionally rescale, and split 80/20.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor frame.
    y : array-like
        Target values aligned with ``X``.
    corr : pandas.Series
        Per-column correlation with the target, passed to ``get_top_n_imp_cols``.
    top_n_cols : int
        Slice bound forwarded to ``get_top_n_imp_cols``; with the default ``-1``
        the lowest-ranked column is dropped and all others are kept.
    target : str
        Accepted for interface compatibility; not used by the function.
    norm : bool
        Apply ``MinMaxScaler`` to the selected columns.
    scale : bool
        Apply ``StandardScaler`` (after the min-max step when both are set).

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``. The feature parts are NumPy arrays
        when any scaler ran, otherwise DataFrames. The fitted scalers and the
        selected column order are not returned, so a caller that scores new data
        has to rebuild the same transformation.
    """
    # Imported here because the notebook imports these names only in a later
    # cell than the first call of this helper, which breaks Restart & Run All.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    predictor_cols = get_top_n_imp_cols(corr, n=top_n_cols)
    X = X.loc[:, predictor_cols]
    # NOTE(review): both scalers are fitted on the full matrix before the split,
    # so the validation rows contribute to the scaling statistics.
    if norm:
        min_max_scaler = MinMaxScaler()
        X = min_max_scaler.fit_transform(X=X)
    if scale:
        standard_scaler = StandardScaler()
        X = standard_scaler.fit_transform(X=X)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    return (X_train, X_val, y_train, y_val)



def train_lin_reg(model, X_train, y_train, cv):
    """Cross-validate ``model`` and record the mean train / validation scores.

    The means are appended to the module-level lists ``train_mean_rmlse_scores``
    and ``val_mean_rmlse_scores`` and printed; nothing is returned.

    NOTE(review): ``cross_validate`` and ``scorer`` are read from the notebook
    namespace and are not defined in any visible cell — they must exist before
    this function is called.
    """
    cv_scores = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring=scorer,
        cv=cv,
        n_jobs=-1,
        return_train_score=True,
    )
    train_mean, val_mean = (
        np.mean(cv_scores[key]) for key in ('train_score', 'test_score')
    )
    train_mean_rmlse_scores.append(train_mean)
    val_mean_rmlse_scores.append(val_mean)
    print('rmlse_mean_train_scores', train_mean)
    print('rmlse_mean_validation_scores', val_mean)

In [None]:
# Accumulators filled by train_lin_reg, one entry per feature count tried.
train_mean_rmlse_scores = []
val_mean_rmlse_scores = []


target = 'premium_amount'
X = df_train.drop(columns=target)
y = df_train.loc[:, target]
corr = X.corrwith(y)

# Ten distinct feature counts drawn from 2..26. The draw is seeded so a re-run
# evaluates the same counts, and sorted so the scores are collected in
# increasing-x order for the line plot in the next cell.
rng = np.random.default_rng(0)
n_top_cols_array = np.sort(rng.choice(np.arange(2, 27), size=10, replace=False))
print(n_top_cols_array)
for n_top_col in n_top_cols_array:
    print('number of top cols = ', n_top_col)
    X_train, X_test, y_train,  y_test = prepare_train_validate(X, y, corr, n_top_col)
    # NOTE(review): `lin_reg` is not defined in any visible cell — it must
    # already exist in the notebook namespace.
    train_lin_reg(lin_reg, X_train, y_train, cv=100)

print(train_mean_rmlse_scores, val_mean_rmlse_scores)

In [None]:
# Plot the mean cross-validated scores against the number of features used.
# The feature counts may have been collected in arbitrary order, so every series
# is re-ordered by x first; otherwise the connecting line zig-zags.
plot_order = np.argsort(n_top_cols_array)
sorted_counts = np.asarray(n_top_cols_array)[plot_order]

fig, ax = plt.subplots()
ax.plot(sorted_counts, np.asarray(train_mean_rmlse_scores)[plot_order], label='Train RMSLE', marker='o')  # Plot train scores
ax.plot(sorted_counts, np.asarray(val_mean_rmlse_scores)[plot_order], label='Validation RMSLE', marker='o')
ax.set_xlabel('Number of top-correlated features')
ax.set_ylabel('Mean cross-validated score')
ax.legend()
plt.show()

In [None]:
# Columns of the reloaded training frame (includes 'Unnamed: 0' and the target).
df_train.columns

In [None]:
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Build the train / hold-out matrices used by the XGBoost section.
target='premium_amount'
X = df_train.drop(columns=target)
y = df_train.loc[:, target]
print(X.shape, y.shape)

corr = X.corrwith(y)
# top_n_cols=-1 keeps every column except the one least correlated with the
# target. `norm` keeps its default (True) and scale=True, so the features are
# min-max scaled and then standardised; the results are NumPy arrays whose
# columns are ordered by descending |correlation|.
# NOTE(review): anything scored with the trained model needs the same column
# selection, ordering and scaling.
X_train, X_test, y_train, y_test = prepare_train_validate(X, y, corr, top_n_cols=-1, scale=True)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
# val_dmatrix = xgb.DMatrix(data=X_val, label=y_val)

#print(train_dmatrix.num_row(), train_dmatrix.num_col())
#print(val_dmatrix.num_row(), val_dmatrix.num_col())
print(X_train.shape, X_test.shape)

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import optuna

# Define RMSLE function
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

# Custom evaluation function for RMSLE
def rmsle_eval(y_pred, dtrain):
    """Custom XGBoost metric: return ``('rmsle', score)`` for the given predictions.

    ``dtrain`` is the DMatrix being evaluated; its labels are the ground truth.
    """
    return 'rmsle', rmsle(dtrain.get_label(), y_pred)

# Objective function for Optuna
def objective(trial):
    """Optuna objective: 5-fold CV RMSLE of an XGBoost regressor for one trial.

    Samples the hyper-parameters below, runs ``xgb.cv`` on the module-level
    ``X_train`` / ``y_train`` and returns the smallest mean test RMSLE seen over
    the boosting rounds.
    """
    # The search ranges are deliberately narrow; the trailing notes are the
    # author's record of where earlier trials clustered.
    num_boost_round = trial.suggest_int("num_boost_round", low=1700, high=1900, step=2)  # centered around 1750-1835
    learning_rate = trial.suggest_float("learning_rate", low=0.0205, high=0.04, log=True)  # centered around 0.0207-0.0210
    max_depth = trial.suggest_int("max_depth", low=2, high=8)  # centered around 5
    reg_alpha = trial.suggest_float("reg_alpha", low=7.70, high=7.85)  # centered around 7.73-7.81
    reg_lambda = trial.suggest_float("reg_lambda", low=5.35, high=5.42)  # centered around 5.37-5.41
    subsample = trial.suggest_float("subsample", low=0.8, high=0.9)  # centered around 0.810-0.811
    colsample_bytree = trial.suggest_float("colsample_bytree", low=0.7, high=0.9)  # centered around 0.790-0.796
    gamma = trial.suggest_float("gamma", low=4.60, high=4.72)  # centered around 4.63-4.71 (an older note said 4.28-4.90)
    # The DMatrix is rebuilt on every trial from the module-level training split.
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Set parameters for XGBoost
    params = {
        'objective': 'reg:squaredlogerror',  # Use squared log error objective
        'eval_metric': 'rmsle',              # Track RMSLE
        
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'gamma': gamma,
        'random_state': 42,
        'tree_method': 'hist',  # Use 'hist' for faster training
        'device': 'cuda',       # Use GPU
    }

    # 5-fold cross-validation, stopping after 10 rounds without improvement.
    # NOTE(review): the built-in 'rmsle' eval_metric and the custom metric below
    # report under the same name — confirm which one ends up in the
    # 'test-rmsle-mean' column.
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        nfold=5,
        early_stopping_rounds=10,
        as_pandas=True,
        seed=42,
        custom_metric=rmsle_eval  # Use custom RMSLE evaluation function
    )

    # Return the best RMSLE from cross-validation
    return cv_results['test-rmsle-mean'].min()


# Create the study, or reopen it from the SQLite file when it already exists.
# Because of load_if_exists=True, trials from earlier runs are kept and new
# trials are appended to them.
xgb_reg_study = optuna.create_study(
    storage="sqlite:///xgb_reg_study.db",
    direction="minimize",  # Minimize the RMSLE
    load_if_exists=True,
    study_name="xgb_reg_study",
)

# Run 100 additional trials of `objective`.
xgb_reg_study.optimize(objective, n_trials=100, show_progress_bar=True)

# Best parameters and score over ALL trials stored in the study.
print("Best parameters:", xgb_reg_study.best_params)
print("Best validation score (RMSLE):", xgb_reg_study.best_value)

[I 2024-12-20 08:14:37,190] Using an existing study with name 'xgb_reg_study' instead of creating a new one.


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-12-20 08:17:12,055] Trial 227 finished with value: 1.09733 and parameters: {'num_boost_round': 1846, 'learning_rate': 0.022069421526422394, 'max_depth': 4, 'reg_alpha': 7.807668719443181, 'reg_lambda': 5.373259758944974, 'subsample': 0.8175300294302966, 'colsample_bytree': 0.8312194865894137, 'gamma': 4.688020645182235}. Best is trial 176 with value: 1.0958708.
[I 2024-12-20 08:19:49,591] Trial 228 finished with value: 1.097441194283634 and parameters: {'num_boost_round': 1842, 'learning_rate': 0.02149688831523411, 'max_depth': 4, 'reg_alpha': 7.805184350613749, 'reg_lambda': 5.3683064205396125, 'subsample': 0.8192169580070954, 'colsample_bytree': 0.8453456598543642, 'gamma': 4.680092943073093}. Best is trial 176 with value: 1.0958708.
[I 2024-12-20 08:22:30,861] Trial 229 finished with value: 1.0958735855303048 and parameters: {'num_boost_round': 1834, 'learning_rate': 0.021116624585106793, 'max_depth': 4, 'reg_alpha': 7.81436384562176, 'reg_lambda': 5.371989843356312, 'subsam

In [None]:
# Re-print the best trial (repeats the last two lines of the previous cell).
print("Best parameters:", xgb_reg_study.best_params)
print("Best validation score (RMSLE):", xgb_reg_study.best_value)

In [None]:
# Tabular view of every trial stored in the study.
trials_df = xgb_reg_study.trials_dataframe()

# Headline numbers.
print(f"Total number of trials: {len(trials_df)}")
print(f"Best RMSLE achieved: {xgb_reg_study.best_value:.6f}")
print("-" * 50)

# Parameters of the single best trial.
best_params = xgb_reg_study.best_params

# Column names of the tuned hyper-parameters inside `trials_df`.
params = [
    'params_' + name
    for name in (
        'num_boost_round',
        'learning_rate',
        'max_depth',
        'reg_alpha',
        'reg_lambda',
        'subsample',
        'colsample_bytree',
        'gamma',
    )
]

# Best 10% of the trials by objective value (never fewer than one).
n_best_trials = max(int(len(trials_df) * 0.1), 1)
top_trials = trials_df.nsmallest(n_best_trials, 'value')

print(f"\nAnalyzing top {n_best_trials} trials:")
print(f"RMSLE range in top trials: {top_trials['value'].min():.6f} to {top_trials['value'].max():.6f}")
print("-" * 50)

# Per hyper-parameter: the best trial's value plus range and median among the top trials.
for param in params:
    if param not in trials_df.columns:
        continue
    param_name = param.replace('params_', '')
    print(f"\n{param_name}:")
    print(f"Best value: {best_params[param_name]:.6f}")
    print(f"Range in top trials: {top_trials[param].min():.6f} to {top_trials[param].max():.6f}")
    print(f"Median in top trials: {top_trials[param].median():.6f}")

# Full configuration of the best trial.
print("\n" + "="*50)
print("Best Trial Configuration:")
print(f"RMSLE: {xgb_reg_study.best_value:.6f}")
for param_name, value in best_params.items():
    print(f"{param_name}: {value:.6f}")

In [None]:
# Get the best parameters from the study
best_params = xgb_reg_study.best_params

# Create final model parameters by combining best parameters with other required settings
final_params = {
    'objective': 'reg:squaredlogerror',
    'eval_metric': 'rmsle',
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth'],
    'reg_alpha': best_params['reg_alpha'],
    'reg_lambda': best_params['reg_lambda'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'gamma': best_params['gamma'],
    'random_state': 42,
    'tree_method': 'hist',
    'device': 'cuda'
}

# DMatrix for the 80% training split and the 20% hold-out split
# (`X_test` / `y_test` here are the hold-out rows, not the competition test set).
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train the final model
# NOTE(review): early stopping monitors the hold-out split, so the hold-out
# scores reported below are not independent of model selection.
final_model = xgb.train(
    final_params,
    dtrain,
    num_boost_round=best_params['num_boost_round'],
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=10,
    verbose_eval=1000,  # Print evaluation every 1000 rounds
    custom_metric=rmsle_eval
)

# Make predictions
train_predictions = final_model.predict(dtrain)
test_predictions = final_model.predict(dtest)

# Calculate RMSLE on both training and test sets
train_rmsle = rmsle(y_train, train_predictions)
test_rmsle = rmsle(y_test, test_predictions)

print(f"Training RMSLE: {train_rmsle:.4f}")
print(f"Test RMSLE: {test_rmsle:.4f}")




# Additional hold-out metrics on the original (non-log) scale
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

mae = mean_absolute_error(y_test, test_predictions)
r2 = r2_score(y_test, test_predictions)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

print("\nAdditional Metrics:")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
import tensorflow as tf
import keras
from keras import layers, initializers, regularizers, activations, Model, losses
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Distribution strategy; the replica count is printed below and later used to
# size the global batch.
strategy = tf.distribute.MirroredStrategy()
print(f'Number of devices: {strategy.num_replicas_in_sync}')

# File used both to detect a previously saved model and as the checkpoint target.
MODEL_PATH = 'best_ann_model.keras'

# Custom LR Logger Callback (currently commented out in the callback list below)
class LRLogger(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's learning rate after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # Read the current value from the optimizer attached to the model.
        lr = tf.keras.backend.get_value(self.model.optimizer.learning_rate)
        print(f'\nCurrent learning rate: {lr:.6f}')

# Decide what to do with the ANN: '1' load only, '2' resume training, '3' train
# from scratch. The prompt is shown only when a saved model file exists.
# NOTE(review): `input()` blocks a non-interactive "Run All".
if os.path.exists(MODEL_PATH):
    while True:
        choice = input("\nSaved model found. Choose an option:\n1: Load saved model\n2: Resume training saved model\n3: Train new model\nYour choice (1, 2 or 3): ")
        if choice in ['1', '2', '3']:
            break
        print("Invalid choice. Please enter 1, 2 or 3.")
else:
    print("\nNo saved model found.")
    choice = '3'

if choice == '1':
    # Load-only path: no datasets are built and no training happens.
    print("Loading saved model...")
    with strategy.scope():
        ann_model = keras.models.load_model(MODEL_PATH)
    print("Model loaded successfully!")

else:
    # Common data preparation code for both resume training and new training
    print("Preparing data...")
    # Targets for the ANN are the ABSOLUTE errors of the XGBoost model.
    # NOTE(review): taking abs() discards the sign of the residual, so the
    # network learns error magnitude only; the blending cell later ADDS its
    # output to the base prediction — confirm this is intended.
    y_train_pred = final_model.predict(dtrain)
    train_residual_errors = abs(y_train - y_train_pred)

    y_test_pred = final_model.predict(dtest)
    test_residual_errors = abs(y_test - y_test_pred)

    # Create train/validation split manually
    X_train_main, X_val, residual_train, residual_val = train_test_split(
        X_train, train_residual_errors, test_size=0.2, random_state=42
    )

    # Global batch: 1024 samples per replica.
    GLOBAL_BATCH_SIZE = 512 * 2 * strategy.num_replicas_in_sync

    # Training dataset (shuffled with a 1000-element buffer)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train_main, residual_train))
    train_dataset = train_dataset.shuffle(1000).batch(GLOBAL_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

    # Validation dataset
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, residual_val))
    val_dataset = val_dataset.batch(GLOBAL_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

    # Hold-out dataset (built here but not used by the training call below)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, test_residual_errors))
    test_dataset = test_dataset.batch(GLOBAL_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

    if choice == '2':
        print("Loading saved model for continued training...")
        with strategy.scope():
            ann_model = keras.models.load_model(MODEL_PATH)

            # Get current learning rate before asking
            current_lr = float(tf.keras.backend.get_value(ann_model.optimizer.learning_rate))

            # Ask about learning rate
            while True:
                lr_choice = input(f"\nCurrent learning rate is: {current_lr:.6f}\nChoose learning rate option:\n1: Reset to new learning rate\n2: Continue with current learning rate\nYour choice (1 or 2): ")
                if lr_choice in ['1', '2']:
                    break
                print("Invalid choice. Please enter 1 or 2.")

            if lr_choice == '1':
                # Ask for new learning rate
                while True:
                    try:
                        new_lr = float(input(f"Current learning rate is {current_lr:.6f}\nEnter new learning rate (e.g., 0.001): "))
                        if new_lr > 0:
                            break
                        print("Learning rate must be positive.")
                    except ValueError:
                        print("Please enter a valid number.")

                # Recompile with a NEW Adam optimizer at the requested rate.
                ann_model.compile(
                    optimizer=tf.keras.optimizers.Adam(learning_rate=new_lr),
                    loss=losses.MeanSquaredLogarithmicError()
                )
                print(f"Learning rate changed from {current_lr:.6f} to {new_lr:.6f}")
            else:
                # Keep the existing learning rate. A new Adam instance is still
                # created here, so the saved optimizer object is replaced.
                ann_model.compile(
                    optimizer=tf.keras.optimizers.Adam(learning_rate=current_lr),
                    loss=losses.MeanSquaredLogarithmicError()
                )
                print(f"Continuing with current learning rate: {current_lr:.6f}")

        print("Model loaded successfully!")

    else:  # choice == '3'
        print("Creating new model...")
        if os.path.exists(MODEL_PATH):
            confirm = input("Warning: Existing model will be overwritten. Continue? (y/n): ")
            if confirm.lower() != 'y':
                print("Training cancelled.")
                # NOTE(review): exit() inside a notebook terminates the kernel
                # session rather than just ending this cell.
                exit()

        # Ask for initial learning rate for new model
        while True:
            try:
                initial_lr = float(input("Enter initial learning rate (e.g., 0.001): "))
                if initial_lr > 0:
                    break
                print("Learning rate must be positive.")
            except ValueError:
                print("Please enter a valid number.")

        # Create and compile model within strategy scope
        with strategy.scope():
            # Five Dense -> BatchNorm -> Dropout stages; the width starts at 520
            # and is halved after each stage (never below 5), followed by a
            # single linear output unit.
            input_layer = layers.Input(shape=(X_train.shape[1],))
            x = input_layer
            units = 520
            for _ in range(5):
                if units < 5:
                    units = 5
                # The initializer and regularizer are passed as classes, i.e.
                # with their default arguments.
                x = layers.Dense(
                    units,
                    kernel_initializer=initializers.HeUniform,
                    kernel_regularizer=regularizers.L1,
                    activation=activations.gelu,
                )(x)
                x = layers.BatchNormalization()(x)
                x = layers.Dropout(rate=0.7)(x)
                units //= 2
            output_layer = layers.Dense(1, activation=activations.linear)(x)

            # Create model
            ann_model = Model(inputs=input_layer, outputs=output_layer)

            # Compile model with MSLE and user-specified learning rate
            ann_model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=initial_lr),
                loss=losses.MeanSquaredLogarithmicError()
            )
            print(f"Model created with initial learning rate: {initial_lr:.6f}")

    # Callbacks: early stopping, LR reduction on plateau, best-model checkpoint
    # (written to MODEL_PATH) and a CSV log that is appended to across runs.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=35,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.25,
            patience=3,
            min_lr=1e-6,
            verbose=1
        ),
        tf.keras.callbacks.ModelCheckpoint(
            MODEL_PATH,
            monitor='val_loss',
            save_best_only=True,
            mode='min'
        ),
        tf.keras.callbacks.CSVLogger('training_log.csv', separator=',', append=True),
        #LRLogger()
    ]

    # Number of epochs: asked from the user when resuming, otherwise fixed at 50.
    if choice == '2':
        additional_epochs = int(input("Enter number of additional epochs for training: "))
        total_epochs = additional_epochs
    else:
        total_epochs = 50

    # Train the model
    print(f"\nStarting training for {total_epochs} epochs...")
    history = ann_model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=total_epochs,
        callbacks=callbacks,
        verbose=1
    )

    # Plot the training history; any failure while plotting is reported, not raised.
    try:
        import matplotlib.pyplot as plt

        plt.figure(figsize=(12, 4))

        # Plot training loss
        plt.subplot(1, 2, 1)
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        # Plot learning rate if available
        # NOTE(review): only the 'lr' history key is checked; some Keras
        # versions log it as 'learning_rate' — verify for the installed version.
        if 'lr' in history.history:
            plt.subplot(1, 2, 2)
            plt.plot(history.history['lr'], label='Learning Rate')
            plt.title('Learning Rate')
            plt.xlabel('Epoch')
            plt.ylabel('Learning Rate')
            plt.yscale('log')
            plt.legend()

        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Could not plot training history: {str(e)}")

print("Model is ready for use!")

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
import json
import gc

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error, with diagnostics on failure.

    This definition replaces the ``rmsle`` helper defined in an earlier cell.
    A ``ValueError`` raised by ``mean_squared_log_error`` is reported together
    with the input shapes and then re-raised.
    """
    try:
        return np.sqrt(mean_squared_log_error(y_true, y_pred))
    except ValueError as e:
        print(f"Error in RMSLE calculation: {str(e)}")
        print(f"Shapes: y_true: {y_true.shape}, y_pred: {y_pred.shape}")
        raise

def calculate_score_in_batches(y_true, base_pred, residuals, learning_rate, batch_size=1000):
    """RMSLE of ``base_pred + learning_rate * residuals`` against ``y_true``.

    The squared log errors are accumulated batch by batch so that only
    ``batch_size`` rows of intermediate arrays exist at a time; the result is
    the RMSLE over all rows, not an average of per-batch scores.

    Parameters
    ----------
    y_true, base_pred, residuals : array-like
        Flattened to 1-D; all three must have the same, non-zero length.
    learning_rate : float
        Weight applied to ``residuals`` before adding them to ``base_pred``.
    batch_size : int
        Number of rows processed per step; must be positive.

    Returns
    -------
    float or None
        The RMSLE, or ``None`` (after printing a message) when the inputs are
        empty, of different lengths, contain non-finite values, or when a true
        value or combined prediction is negative — the log error is only
        defined for non-negative inputs.
    """
    y_true = np.asarray(y_true).reshape(-1)
    base_pred = np.asarray(base_pred).reshape(-1)
    residuals = np.asarray(residuals).reshape(-1)

    def _fail(reason):
        # Same reporting style for every failure mode; the caller skips on None.
        print(f"Error in batch processing: {reason}")
        print(f"Shapes: y_true: {y_true.shape}, base_pred: {base_pred.shape}, residuals: {residuals.shape}")
        return None

    n_samples = len(y_true)
    if n_samples == 0:
        # Previously this ended in a ZeroDivisionError outside the try block.
        return _fail("no samples to score")
    if len(base_pred) != n_samples or len(residuals) != n_samples:
        return _fail("inputs have inconsistent lengths")
    if batch_size < 1:
        return _fail("batch_size must be a positive integer")

    total_squared_log_error = 0.0
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        truth = y_true[start:stop]
        combined = base_pred[start:stop] + learning_rate * residuals[start:stop]

        if not (np.isfinite(truth).all() and np.isfinite(combined).all()):
            return _fail("inputs contain NaN or infinite values")
        if (truth < 0).any() or (combined < 0).any():
            return _fail(
                "Mean Squared Logarithmic Error cannot be used when targets contain negative values."
            )

        log_diff = np.log1p(combined) - np.log1p(truth)
        total_squared_log_error += float(np.dot(log_diff, log_diff))

    return np.sqrt(total_squared_log_error / n_samples)

# Blend the XGBoost predictions with the ANN output, scaled by a "learning rate",
# and score every candidate rate on the training and hold-out splits.
# NOTE(review): the best rate is chosen on the hold-out split, so the reported
# "best" hold-out score is optimistically biased.
try:
    # Define learning rates to test
    learning_rates = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
    results = {}

    # Get ANN predictions
    print("Getting ANN predictions...")
    ann_train_residuals = ann_model.predict(X_train, batch_size=256)
    ann_test_residuals = ann_model.predict(X_test, batch_size=256)

    # Test each learning rate
    best_test_score = float('inf')
    best_lr = None

    for lr in learning_rates:
        print(f"\nTesting learning rate: {lr}")

        print("Calculating training score...")
        train_score = calculate_score_in_batches(
            y_train,
            train_predictions,
            ann_train_residuals,
            lr,
            batch_size=2500
        )

        if train_score is None:
            print(f"Skipping learning rate {lr} due to training score calculation failure")
            continue

        print("Calculating test score...")
        test_score = calculate_score_in_batches(
            y_test,
            test_predictions,
            ann_test_residuals,
            lr,
            batch_size=2500
        )

        if test_score is None:
            print(f"Skipping learning rate {lr} due to test score calculation failure")
            continue

        # Relative change versus the plain XGBoost hold-out RMSLE (positive = better)
        improvement = ((test_rmsle - test_score) / test_rmsle) * 100

        print(f"Learning Rate: {lr}")
        print(f"Train RMSLE: {train_score:.6f}")
        print(f"Test RMSLE: {test_score:.6f}")
        print(f"Improvement: {improvement:.2f}%")

        # Store results
        results[lr] = {
            'train_rmsle': float(train_score),
            'test_rmsle': float(test_score),
            'improvement_percentage': float(improvement)
        }

        # Track best performing learning rate
        if test_score < best_test_score:
            best_test_score = test_score
            best_lr = lr

    # Print summary of results
    print("\nResults Summary:")
    print("-" * 60)
    print(f"{'Learning Rate':^12} {'Train RMSLE':^15} {'Test RMSLE':^15} {'Improvement':^15}")
    print("-" * 60)
    for lr in learning_rates:
        if lr in results:
            print(f"{lr:^12.2f} {results[lr]['train_rmsle']:^15.6f} {results[lr]['test_rmsle']:^15.6f} {results[lr]['improvement_percentage']:^15.2f}%")
    print("-" * 60)
    print(f"\nBest Learning Rate: {best_lr} (Test RMSLE: {best_test_score:.6f})")

    # Save the model and results
    ann_model.save('ann_residual_model.keras')
    model_info = {
        'learning_rates_tested': learning_rates,
        'best_learning_rate': best_lr,
        'results': results,
        'xgb_test_rmsle': float(test_rmsle)
    }
    with open('ann_model_info.json', 'w') as f:
        json.dump(model_info, f, indent=4)
    print("\nModel and results saved successfully!")

except Exception as e:
    print(f"Error occurred: {str(e)}")
    print("Current shapes:")
    # Look the names up defensively: when the failure happened before one of
    # them was assigned (e.g. inside ann_model.predict), reading it directly
    # would raise a NameError from within this handler and hide the real error.
    for label in ("y_train", "train_predictions", "ann_train_residuals"):
        shape = getattr(globals().get(label), "shape", "<not available>")
        print(f"{label} shape: {shape}")

finally:
    # Clean up memory
    print("\nCleaning up memory...")
    gc.collect()

In [None]:
# Reload the saved, unscaled test features and drop the row-index column that
# to_csv wrote as the first column.
df_test = pd.read_csv('/kaggle/working/df_test.csv')
df_test = df_test.drop(columns=['Unnamed: 0'])
df_test.columns

In [None]:
# The booster was trained on the matrix produced by
# prepare_train_validate(X, y, corr, top_n_cols=-1, scale=True): the columns
# returned by get_top_n_imp_cols(corr, n=-1), in that order, min-max scaled and
# then standardised. Feeding the raw `df_test` columns in file order would give
# the trees values on a different scale and in different positions, so the same
# transformation is rebuilt here from the training frame `X`.
predictor_cols = get_top_n_imp_cols(corr, n=-1)
train_features = X.loc[:, predictor_cols]

test_features = df_test.copy()
if 'Unnamed: 0' in predictor_cols and 'Unnamed: 0' not in test_features.columns:
    # NOTE(review): the training frame kept the CSV row-index column as a
    # feature. It was dropped from `df_test` above, so it is restored from the
    # row index (the value that column held in the CSV) to keep the matrices
    # column-compatible. A row id carries no signal — consider removing it from
    # the training features instead.
    test_features['Unnamed: 0'] = test_features.index
test_features = test_features.loc[:, predictor_cols]

# Fit the scalers on the training features exactly as during training, then
# apply them (without refitting) to the competition test rows.
min_max_scaler = MinMaxScaler().fit(train_features)
standard_scaler = StandardScaler().fit(min_max_scaler.transform(train_features))
test_matrix = standard_scaler.transform(min_max_scaler.transform(test_features))

final_test_dmatrix = xgb.DMatrix(data=test_matrix)
y_pred= final_model.predict(final_test_dmatrix)
y_pred.shape, df_test.shape

In [None]:
# Re-read the original test file only to recover the `id` column (it was dropped
# from the working frames), then pair each id with its prediction.
# This relies on `y_pred` being in the same row order as the original test file.
df_initial_final_test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')
final_sub_df = pd.DataFrame(
    {
        'id': df_initial_final_test['id'],
        'Premium Amount':y_pred
    }
)
final_sub_df.tail(10)

In [None]:
# Write the submission file without the DataFrame index.
final_sub_df.to_csv('submission_2.csv', index=False)

In [None]:
# Look at the sample submission to compare its layout with the file written above.
sub_df = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')
sub_df.head(5)

In [None]:
# NOTE: this cell repeats the trial-analysis cell that follows the Optuna run
# earlier in the notebook; it rebinds the same names and prints the same report.

# Convert trials to DataFrame
trials_df = xgb_reg_study.trials_dataframe()

# Print basic info about the study
print(f"Total number of trials: {len(trials_df)}")
print(f"Best RMSLE achieved: {xgb_reg_study.best_value:.6f}")
print("-" * 50)

# Get the best parameters directly from the study
best_params = xgb_reg_study.best_params

# Column names of the tuned hyper-parameters inside `trials_df`
params = [
    'params_num_boost_round',
    'params_learning_rate',
    'params_max_depth',
    'params_reg_alpha',
    'params_reg_lambda',
    'params_subsample',
    'params_colsample_bytree',
    'params_gamma'
]

# Best 10% of the trials by objective value
n_best_trials = max(int(len(trials_df) * 0.1), 1)  # at least 1 trial
top_trials = trials_df.nsmallest(n_best_trials, 'value')

print(f"\nAnalyzing top {n_best_trials} trials:")
print(f"RMSLE range in top trials: {top_trials['value'].min():.6f} to {top_trials['value'].max():.6f}")
print("-" * 50)

# Per hyper-parameter: best trial's value, plus range and median among the top trials
for param in params:
    if param in trials_df.columns:
        param_name = param.replace('params_', '')
        print(f"\n{param_name}:")
        print(f"Best value: {best_params[param_name]:.6f}")
        print(f"Range in top trials: {top_trials[param].min():.6f} to {top_trials[param].max():.6f}")
        print(f"Median in top trials: {top_trials[param].median():.6f}")

# Full configuration of the best trial
print("\n" + "="*50)
print("Best Trial Configuration:")
print(f"RMSLE: {xgb_reg_study.best_value:.6f}")
for param_name, value in best_params.items():
    print(f"{param_name}: {value:.6f}")