In [1]:
# ==============================================================================
# VERSION 5: High-Performance Single Regressor
# - Extreme Feature Engineering (Lag, Rolling Windows)
# - Tweedie Regressor Objective
# - K-Fold Ensemble Training for Robustness
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import gc
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
N_SPLITS = 5 # Using 5 folds for our robust training

# ==============================================================================
# PART 1: EXTREME FEATURE ENGINEERING
# ==============================================================================

def create_all_features(df):
    """
    Master function to create all features.
    This is run on the combined train+test dataframe to ensure consistency.
    """
    print("Starting feature engineering...")
    
    # 1. Basic Date & Time Features
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['sessionHour'] = pd.to_datetime(df['sessionStart'], unit='s').dt.hour
    df['sessionDayOfWeek'] = df['date'].dt.dayofweek
    df['sessionMonth'] = df['date'].dt.month
    
    # 2. Interaction & Ratio Features
    df['browser_os_interaction'] = df['browser'].astype(str) + '_' + df['os'].astype(str)
    df['hits_per_pageview'] = df['totalHits'] / (df['pageViews'].fillna(0) + 1)
    
    # 3. User-Level Aggregates (Global)
    # Ensure userId is treated as a string
    df['userId'] = df['userId'].astype(str)
    user_agg = df.groupby('userId').agg(
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_session_count=('sessionId', 'nunique'),
        user_unique_days=('date', 'nunique'),
        user_avg_session_per_day=('sessionId', lambda x: x.nunique() / x.dt.date.nunique()),
    )
    df = pd.merge(df, user_agg, on='userId', how='left')

    # 4. Time-Series Features (Lag & Rolling for each user)
    # Sort data to create meaningful lag/rolling features
    df_sorted = df.sort_values(['userId', 'date', 'sessionStart']).copy()
    
    # Lag Features: Value from the previous session for that user
    print("Creating lag features...")
    df_sorted['prev_session_hits'] = df_sorted.groupby('userId')['totalHits'].shift(1)
    df_sorted['time_since_last_session'] = df_sorted.groupby('userId')['sessionStart'].diff()

    # Rolling Window Features: Aggregates over the user's last N sessions
    print("Creating rolling window features...")
    df_sorted['user_rolling_avg_hits_3'] = df_sorted.groupby('userId')['totalHits'].transform(
        lambda s: s.rolling(3, min_periods=1).mean()
    )
    df_sorted['user_rolling_sum_pageviews_3'] = df_sorted.groupby('userId')['pageViews'].transform(
        lambda s: s.rolling(3, min_periods=1).sum()
    )
    
    # 5. Drop temporary/redundant columns
    cols_to_drop = ['date', 'sessionStart', 'sessionId']
    df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
    
    print("Feature engineering complete.")
    gc.collect() # Clean up memory
    return df_sorted

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

# Store test IDs and original index for submission
test_ids = df_test['userId']
test_indices = df_test.index

# Align columns and combine
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

train_len = len(df_train)
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0) # IMPORTANT: Keep as original value for Tweedie
combined_df = pd.concat([df_train, df_test], axis=0, sort=False)

# Run the master feature engineering function
combined_df_featured = create_all_features(combined_df)

# Separate back into train and test
X = combined_df_featured[:train_len].drop(columns=['purchaseValue'])
y = combined_df_featured[:train_len]['purchaseValue'] / 1e6 # Scale target for numerical stability
X_test = combined_df_featured[train_len:].drop(columns=['purchaseValue'])

# --- Define Column Groups for Preprocessing ---
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# Split categoricals into low and high cardinality
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('userId') # userId is for grouping, not a feature
low_card_cols = [col for col in categorical_cols if X[col].nunique() <= 10]
high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]

print(f"\nIdentified {len(numerical_cols)} numerical features.")
print(f"Identified {len(low_card_cols)} low-cardinality categorical features.")
print(f"Identified {len(high_card_cols)} high-cardinality categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING (OPTUNA)
# ==============================================================================

# Define the preprocessor pipeline
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]), low_card_cols),
    # For high cardinality, we can use a simpler approach or a dedicated encoder
    # Here, we will just use pandas .cat.codes which is fast and often effective
    # Note: We won't use TargetEncoder here to keep the objective function simpler
], remainder='passthrough') # Keep remaining columns (like high_card_cols)

def objective(trial):
    # This objective uses a single hold-out validation set for speed
    # A full CV inside Optuna is very time-consuming
    train_indices, val_indices = next(GroupKFold(n_splits=N_SPLITS).split(X, y, groups=X['userId']))
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    
    # Handle high-cardinality features with simple category codes
    for col in high_card_cols:
        X_train[col] = X_train[col].astype('category').cat.codes
        X_val[col] = X_val[col].astype('category').cat.codes

    X_train_processed = preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = preprocessor.transform(X_val.drop(columns=['userId']))

    params = {
        'objective': 'reg:tweedie',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train_processed, y_train)
    preds = model.predict(X_val_processed)
    
    # Use RMSE on non-zero purchases for a stable metric
    rmse = np.sqrt(mean_squared_error(y_val[y_val > 0], preds[y_val > 0]))
    return rmse

print("\n--- Tuning Tweedie Regressor with Optuna ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30) # Increase trials for better results (e.g., 50-100)
best_params = study.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING (K-FOLD ENSEMBLE) & SUBMISSION
# ==============================================================================

print("\n--- Training Final Ensemble Model with Best Parameters ---")
test_predictions = np.zeros(len(X_test))
oof_predictions = np.zeros(len(X)) # Out-of-fold predictions for validation
gkf = GroupKFold(n_splits=N_SPLITS)

# Re-define preprocessor for the final loop
final_preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]), low_card_cols),
], remainder='passthrough')


for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=X['userId'])):
    print(f"--- Training Fold {fold + 1}/{N_SPLITS} ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test_fold = X_test.copy()
    
    # Handle high-cardinality features for this fold
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col], X_test_fold[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
        X_test_fold[col] = X_test_fold[col].astype(all_cats.dtype).cat.codes

    # Preprocess data
    X_train_processed = final_preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = final_preprocessor.transform(X_val.drop(columns=['userId']))
    X_test_processed = final_preprocessor.transform(X_test_fold.drop(columns=['userId']))

    model = xgb.XGBRegressor(**best_params, random_state=42, n_jobs=-1)
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)], 
              early_stopping_rounds=50, 
              verbose=100)
              
    val_preds = model.predict(X_val_processed)
    oof_predictions[val_idx] = val_preds
    
    test_fold_preds = model.predict(X_test_processed)
    test_predictions += test_fold_preds / N_SPLITS
    
    gc.collect()

# Evaluate overall OOF R2 score
oof_r2 = r2_score(y, oof_predictions)
print(f"\nOverall Out-of-Fold R2 Score: {oof_r2:.5f}")

# --- Generate Submission ---
print("\n--- Generating Final Kaggle Submission ---")
test_predictions[test_predictions < 0] = 0
final_predictions_scaled = test_predictions * 1e6

submission_df = pd.DataFrame({'ID': test_indices, 'purchaseValue': final_predictions_scaled})
submission_df.to_csv('submission_high_performance.csv', index=False)
print("Submission file 'submission_high_performance.csv' created successfully.")
print(submission_df.head())

  from .autonotebook import tqdm as notebook_tqdm


Loading data...
Starting feature engineering...


AttributeError: Can only use .dt accessor with datetimelike values

In [2]:
# ==============================================================================
# VERSION 5.1: High-Performance Single Regressor (Bug Fix in Feature Engineering)
# - Fixed .dt accessor bug during user-level aggregation.
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import gc
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
N_SPLITS = 5 # Using 5 folds for our robust training

# ==============================================================================
# PART 1: EXTREME FEATURE ENGINEERING (Corrected)
# ==============================================================================

def create_all_features(df):
    """
    Master function to create all features.
    This is run on the combined train+test dataframe to ensure consistency.
    """
    print("Starting feature engineering...")
    
    # 1. Basic Date & Time Features
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['sessionHour'] = pd.to_datetime(df['sessionStart'], unit='s').dt.hour
    df['sessionDayOfWeek'] = df['date'].dt.dayofweek
    df['sessionMonth'] = df['date'].dt.month
    
    # 2. Interaction & Ratio Features
    df['browser_os_interaction'] = df['browser'].astype(str) + '_' + df['os'].astype(str)
    df['hits_per_pageview'] = df['totalHits'] / (df['pageViews'].fillna(0) + 1)
    
    # 3. User-Level Aggregates (Global)
    print("Creating user-level aggregates...")
    df['userId'] = df['userId'].astype(str)
    
    # !!!!! MAJOR CHANGE HERE: Corrected the aggregation logic !!!!!
    user_agg = df.groupby('userId').agg(
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_session_count=('sessionId', 'nunique'),
        user_unique_days=('date', 'nunique') # Aggregate unique days here
    )
    # Now, calculate the ratio feature from the aggregated columns
    user_agg['user_avg_session_per_day'] = user_agg['user_session_count'] / user_agg['user_unique_days']
    
    df = pd.merge(df, user_agg, on='userId', how='left')

    # 4. Time-Series Features (Lag & Rolling for each user)
    # Sort data to create meaningful lag/rolling features
    df_sorted = df.sort_values(['userId', 'date', 'sessionStart']).copy()
    
    # Lag Features: Value from the previous session for that user
    print("Creating lag features...")
    df_sorted['prev_session_hits'] = df_sorted.groupby('userId')['totalHits'].shift(1)
    df_sorted['time_since_last_session'] = df_sorted.groupby('userId')['sessionStart'].diff()

    # Rolling Window Features: Aggregates over the user's last N sessions
    print("Creating rolling window features...")
    df_sorted['user_rolling_avg_hits_3'] = df_sorted.groupby('userId')['totalHits'].transform(
        lambda s: s.rolling(3, min_periods=1).mean()
    )
    df_sorted['user_rolling_sum_pageviews_3'] = df_sorted.groupby('userId')['pageViews'].transform(
        lambda s: s.rolling(3, min_periods=1).sum()
    )
    
    # 5. Drop temporary/redundant columns
    cols_to_drop = ['date', 'sessionStart', 'sessionId']
    df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
    
    print("Feature engineering complete.")
    gc.collect() # Clean up memory
    return df_sorted

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

# Store test IDs and original index for submission
test_ids = df_test['userId']
test_indices = df_test.index

# Align columns and combine
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

train_len = len(df_train)
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0) # IMPORTANT: Keep as original value for Tweedie
combined_df = pd.concat([df_train, df_test], axis=0, sort=False)

# Run the master feature engineering function
combined_df_featured = create_all_features(combined_df)

# Separate back into train and test
X = combined_df_featured[:train_len].drop(columns=['purchaseValue'])
y = combined_df_featured[:train_len]['purchaseValue'] / 1e6 # Scale target for numerical stability
X_test = combined_df_featured[train_len:].drop(columns=['purchaseValue'])

# --- Define Column Groups for Preprocessing ---
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

# Split categoricals into low and high cardinality
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('userId') # userId is for grouping, not a feature
low_card_cols = [col for col in categorical_cols if X[col].nunique() <= 10]
high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]

print(f"\nIdentified {len(numerical_cols)} numerical features.")
print(f"Identified {len(low_card_cols)} low-cardinality categorical features.")
print(f"Identified {len(high_card_cols)} high-cardinality categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING (OPTUNA)
# ==============================================================================

# Define the preprocessor pipeline
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]), low_card_cols),
], remainder='passthrough') # Keep remaining columns (like high_card_cols)

def objective(trial):
    train_indices, val_indices = next(GroupKFold(n_splits=N_SPLITS).split(X, y, groups=X['userId']))
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    
    # Handle high-cardinality features with simple category codes
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes

    X_train_processed = preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = preprocessor.transform(X_val.drop(columns=['userId']))

    params = {
        'objective': 'reg:tweedie',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)],
              early_stopping_rounds=50, verbose=False)
    preds = model.predict(X_val_processed)
    
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

print("\n--- Tuning Tweedie Regressor with Optuna ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30) 
best_params = study.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING (K-FOLD ENSEMBLE) & SUBMISSION
# ==============================================================================

print("\n--- Training Final Ensemble Model with Best Parameters ---")
test_predictions = np.zeros(len(X_test))
oof_predictions = np.zeros(len(X)) 
gkf = GroupKFold(n_splits=N_SPLITS)

final_preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]), low_card_cols),
], remainder='passthrough')


for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=X['userId'])):
    print(f"--- Training Fold {fold + 1}/{N_SPLITS} ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test_fold = X_test.copy()
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col], X_test_fold[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
        X_test_fold[col] = X_test_fold[col].astype(all_cats.dtype).cat.codes

    X_train_processed = final_preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = final_preprocessor.transform(X_val.drop(columns=['userId']))
    X_test_processed = final_preprocessor.transform(X_test_fold.drop(columns=['userId']))

    # Use the best params found by Optuna
    model = xgb.XGBRegressor(**best_params, random_state=42+fold, n_jobs=-1) # Add fold to random_state for variety
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)], 
              early_stopping_rounds=50, 
              verbose=100)
              
    val_preds = model.predict(X_val_processed)
    oof_predictions[val_idx] = val_preds
    
    test_fold_preds = model.predict(X_test_processed)
    test_predictions += test_fold_preds / N_SPLITS
    
    gc.collect()

# Evaluate overall OOF R2 score
oof_r2 = r2_score(y, oof_predictions)
print(f"\nOverall Out-of-Fold R2 Score: {oof_r2:.5f}")

# --- Generate Submission ---
print("\n--- Generating Final Kaggle Submission ---")
test_predictions[test_predictions < 0] = 0
final_predictions_scaled = test_predictions * 1e6

submission_df = pd.DataFrame({'ID': test_indices, 'purchaseValue': final_predictions_scaled})
submission_df.to_csv('submission_high_performance.csv', index=False)
print("Submission file 'submission_high_performance.csv' created successfully.")
print(submission_df.head())

Loading data...
Starting feature engineering...
Creating user-level aggregates...
Creating lag features...
Creating rolling window features...
Feature engineering complete.

Identified 20 numerical features.
Identified 10 low-cardinality categorical features.
Identified 13 high-cardinality categorical features.


TypeError: __init__() got an unexpected keyword argument 'sparse'

In [3]:
# ==============================================================================
# VERSION 5.2: High-Performance Single Regressor (Fix for older Scikit-Learn)
# - Removed 'sparse' argument from OneHotEncoder for backward compatibility.
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import gc
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
N_SPLITS = 5 # Using 5 folds for our robust training

# ==============================================================================
# PART 1: EXTREME FEATURE ENGINEERING
# ==============================================================================

def create_all_features(df):
    """
    Master function to create all features.
    This is run on the combined train+test dataframe to ensure consistency.
    """
    print("Starting feature engineering...")
    
    # 1. Basic Date & Time Features
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['sessionHour'] = pd.to_datetime(df['sessionStart'], unit='s').dt.hour
    df['sessionDayOfWeek'] = df['date'].dt.dayofweek
    df['sessionMonth'] = df['date'].dt.month
    
    # 2. Interaction & Ratio Features
    df['browser_os_interaction'] = df['browser'].astype(str) + '_' + df['os'].astype(str)
    df['hits_per_pageview'] = df['totalHits'] / (df['pageViews'].fillna(0) + 1)
    
    # 3. User-Level Aggregates (Global)
    print("Creating user-level aggregates...")
    df['userId'] = df['userId'].astype(str)
    
    user_agg = df.groupby('userId').agg(
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_session_count=('sessionId', 'nunique'),
        user_unique_days=('date', 'nunique')
    )
    user_agg['user_avg_session_per_day'] = user_agg['user_session_count'] / user_agg['user_unique_days']
    
    df = pd.merge(df, user_agg, on='userId', how='left')

    # 4. Time-Series Features (Lag & Rolling for each user)
    df_sorted = df.sort_values(['userId', 'date', 'sessionStart']).copy()
    
    print("Creating lag features...")
    df_sorted['prev_session_hits'] = df_sorted.groupby('userId')['totalHits'].shift(1)
    df_sorted['time_since_last_session'] = df_sorted.groupby('userId')['sessionStart'].diff()

    print("Creating rolling window features...")
    df_sorted['user_rolling_avg_hits_3'] = df_sorted.groupby('userId')['totalHits'].transform(
        lambda s: s.rolling(3, min_periods=1).mean()
    )
    df_sorted['user_rolling_sum_pageviews_3'] = df_sorted.groupby('userId')['pageViews'].transform(
        lambda s: s.rolling(3, min_periods=1).sum()
    )
    
    # 5. Drop temporary/redundant columns
    cols_to_drop = ['date', 'sessionStart', 'sessionId']
    df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
    
    print("Feature engineering complete.")
    gc.collect()
    return df_sorted

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

test_ids = df_test['userId']
test_indices = df_test.index

one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

train_len = len(df_train)
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0)
combined_df = pd.concat([df_train, df_test], axis=0, sort=False)

combined_df_featured = create_all_features(combined_df)

X = combined_df_featured[:train_len].drop(columns=['purchaseValue'])
y = combined_df_featured[:train_len]['purchaseValue'] / 1e6
X_test = combined_df_featured[train_len:].drop(columns=['purchaseValue'])

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('userId')
low_card_cols = [col for col in categorical_cols if X[col].nunique() <= 10]
high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]

print(f"\nIdentified {len(numerical_cols)} numerical features.")
print(f"Identified {len(low_card_cols)} low-cardinality categorical features.")
print(f"Identified {len(high_card_cols)} high-cardinality categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING (OPTUNA)
# ==============================================================================

# !!!!! MAJOR CHANGE HERE: Removed 'sparse=False' for compatibility !!!!!
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')) 
    ]), low_card_cols),
], remainder='passthrough')

def objective(trial):
    train_indices, val_indices = next(GroupKFold(n_splits=N_SPLITS).split(X, y, groups=X['userId']))
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes

    X_train_processed = preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = preprocessor.transform(X_val.drop(columns=['userId']))

    params = {
        'objective': 'reg:tweedie',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBRegressor(**params)
    # Using early stopping requires a modern XGBoost version. 
    # If this line fails, remove the 'eval_set', 'early_stopping_rounds', and 'verbose' arguments.
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)],
              early_stopping_rounds=50, verbose=False)
    preds = model.predict(X_val_processed)
    
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

print("\n--- Tuning Tweedie Regressor with Optuna ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
best_params = study.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING (K-FOLD ENSEMBLE) & SUBMISSION
# ==============================================================================

print("\n--- Training Final Ensemble Model with Best Parameters ---")
test_predictions = np.zeros(len(X_test))
oof_predictions = np.zeros(len(X))
gkf = GroupKFold(n_splits=N_SPLITS)

# !!!!! MAJOR CHANGE HERE: Removed 'sparse=False' for compatibility !!!!!
final_preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), low_card_cols),
], remainder='passthrough')


for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=X['userId'])):
    print(f"--- Training Fold {fold + 1}/{N_SPLITS} ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test_fold = X_test.copy()
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col], X_test_fold[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
        X_test_fold[col] = X_test_fold[col].astype(all_cats.dtype).cat.codes

    X_train_processed = final_preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = final_preprocessor.transform(X_val.drop(columns=['userId']))
    X_test_processed = final_preprocessor.transform(X_test_fold.drop(columns=['userId']))

    model = xgb.XGBRegressor(**best_params, random_state=42+fold, n_jobs=-1)
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)], 
              early_stopping_rounds=50, 
              verbose=100)
              
    val_preds = model.predict(X_val_processed)
    oof_predictions[val_idx] = val_preds
    
    test_fold_preds = model.predict(X_test_processed)
    test_predictions += test_fold_preds / N_SPLITS
    
    gc.collect()

oof_r2 = r2_score(y, oof_predictions)
print(f"\nOverall Out-of-Fold R2 Score: {oof_r2:.5f}")

print("\n--- Generating Final Kaggle Submission ---")
test_predictions[test_predictions < 0] = 0
final_predictions_scaled = test_predictions * 1e6

submission_df = pd.DataFrame({'ID': test_indices, 'purchaseValue': final_predictions_scaled})
submission_df.to_csv('submission_high_performance.csv', index=False)
print("Submission file 'submission_high_performance.csv' created successfully.")
print(submission_df.head())

Loading data...
Starting feature engineering...
Creating user-level aggregates...
Creating lag features...
Creating rolling window features...
Feature engineering complete.


[I 2025-07-15 15:15:46,838] A new study created in memory with name: no-name-1eca91f5-f37e-4523-9002-82616f73bbdd



Identified 20 numerical features.
Identified 10 low-cardinality categorical features.
Identified 13 high-cardinality categorical features.

--- Tuning Tweedie Regressor with Optuna ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be se

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['bool', 'str']

In [4]:
# ==============================================================================
# VERSION 5.3: High-Performance Single Regressor (Fix for Mixed Data Types)
# - Explicitly converts all categorical columns to strings before encoding.
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import gc
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
N_SPLITS = 5 # Using 5 folds for our robust training

# ==============================================================================
# PART 1: EXTREME FEATURE ENGINEERING
# ==============================================================================

def create_all_features(df):
    """
    Master function to create all features.
    This is run on the combined train+test dataframe to ensure consistency.
    """
    print("Starting feature engineering...")
    
    # 1. Basic Date & Time Features
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['sessionHour'] = pd.to_datetime(df['sessionStart'], unit='s').dt.hour
    df['sessionDayOfWeek'] = df['date'].dt.dayofweek
    df['sessionMonth'] = df['date'].dt.month
    
    # 2. Interaction & Ratio Features
    df['browser_os_interaction'] = df['browser'].astype(str) + '_' + df['os'].astype(str)
    df['hits_per_pageview'] = df['totalHits'] / (df['pageViews'].fillna(0) + 1)
    
    # 3. User-Level Aggregates (Global)
    print("Creating user-level aggregates...")
    df['userId'] = df['userId'].astype(str)
    
    user_agg = df.groupby('userId').agg(
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_session_count=('sessionId', 'nunique'),
        user_unique_days=('date', 'nunique')
    )
    user_agg['user_avg_session_per_day'] = user_agg['user_session_count'] / user_agg['user_unique_days']
    
    df = pd.merge(df, user_agg, on='userId', how='left')

    # 4. Time-Series Features (Lag & Rolling for each user)
    df_sorted = df.sort_values(['userId', 'date', 'sessionStart']).copy()
    
    print("Creating lag features...")
    df_sorted['prev_session_hits'] = df_sorted.groupby('userId')['totalHits'].shift(1)
    df_sorted['time_since_last_session'] = df_sorted.groupby('userId')['sessionStart'].diff()

    print("Creating rolling window features...")
    df_sorted['user_rolling_avg_hits_3'] = df_sorted.groupby('userId')['totalHits'].transform(
        lambda s: s.rolling(3, min_periods=1).mean()
    )
    df_sorted['user_rolling_sum_pageviews_3'] = df_sorted.groupby('userId')['pageViews'].transform(
        lambda s: s.rolling(3, min_periods=1).sum()
    )
    
    # 5. Drop temporary/redundant columns
    cols_to_drop = ['date', 'sessionStart', 'sessionId']
    df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
    
    print("Feature engineering complete.")
    gc.collect()
    return df_sorted

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

test_ids = df_test['userId']
test_indices = df_test.index

one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

train_len = len(df_train)
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0)
combined_df = pd.concat([df_train, df_test], axis=0, sort=False)

combined_df_featured = create_all_features(combined_df)

X = combined_df_featured[:train_len].drop(columns=['purchaseValue'])
y = combined_df_featured[:train_len]['purchaseValue'] / 1e6
X_test = combined_df_featured[train_len:].drop(columns=['purchaseValue'])

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('userId')

# !!!!! MAJOR CHANGE HERE: Enforce string type on all categorical columns !!!!!
print("\nEnforcing string type on categorical columns to prevent mixed-type errors...")
for col in categorical_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

low_card_cols = [col for col in categorical_cols if X[col].nunique() <= 10]
high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]

print(f"Identified {len(numerical_cols)} numerical features.")
print(f"Identified {len(low_card_cols)} low-cardinality categorical features.")
print(f"Identified {len(high_card_cols)} high-cardinality categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING (OPTUNA)
# ==============================================================================

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')) 
    ]), low_card_cols),
], remainder='passthrough')

def objective(trial):
    train_indices, val_indices = next(GroupKFold(n_splits=N_SPLITS).split(X, y, groups=X['userId']))
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes

    X_train_processed = preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = preprocessor.transform(X_val.drop(columns=['userId']))

    params = {
        'objective': 'reg:tweedie',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBRegressor(**params)
    # Using early stopping requires a modern XGBoost version. 
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)],
              early_stopping_rounds=50, verbose=False)
    preds = model.predict(X_val_processed)
    
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

print("\n--- Tuning Tweedie Regressor with Optuna ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
best_params = study.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING (K-FOLD ENSEMBLE) & SUBMISSION
# ==============================================================================

print("\n--- Training Final Ensemble Model with Best Parameters ---")
test_predictions = np.zeros(len(X_test))
oof_predictions = np.zeros(len(X))
gkf = GroupKFold(n_splits=N_SPLITS)

final_preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), low_card_cols),
], remainder='passthrough')


for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=X['userId'])):
    print(f"--- Training Fold {fold + 1}/{N_SPLITS} ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test_fold = X_test.copy()
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col], X_test_fold[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
        X_test_fold[col] = X_test_fold[col].astype(all_cats.dtype).cat.codes

    X_train_processed = final_preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = final_preprocessor.transform(X_val.drop(columns=['userId']))
    X_test_processed = final_preprocessor.transform(X_test_fold.drop(columns=['userId']))

    model = xgb.XGBRegressor(**best_params, random_state=42+fold, n_jobs=-1)
    model.fit(X_train_processed, y_train, 
              eval_set=[(X_val_processed, y_val)], 
              early_stopping_rounds=50, 
              verbose=100)
              
    val_preds = model.predict(X_val_processed)
    oof_predictions[val_idx] = val_preds
    
    test_fold_preds = model.predict(X_test_processed)
    test_predictions += test_fold_preds / N_SPLITS
    
    gc.collect()

oof_r2 = r2_score(y, oof_predictions)
print(f"\nOverall Out-of-Fold R2 Score: {oof_r2:.5f}")

print("\n--- Generating Final Kaggle Submission ---")
test_predictions[test_predictions < 0] = 0
final_predictions_scaled = test_predictions * 1e6

submission_df = pd.DataFrame({'ID': test_indices, 'purchaseValue': final_predictions_scaled})
submission_df.to_csv('submission_high_performance.csv', index=False)
print("Submission file 'submission_high_performance.csv' created successfully.")
print(submission_df.head())

Loading data...
Starting feature engineering...
Creating user-level aggregates...
Creating lag features...
Creating rolling window features...
Feature engineering complete.

Enforcing string type on categorical columns to prevent mixed-type errors...


[I 2025-07-15 15:17:22,515] A new study created in memory with name: no-name-5ca63c2f-fb91-4af6-a0cf-7449582e0877


Identified 20 numerical features.
Identified 10 low-cardinality categorical features.
Identified 13 high-cardinality categorical features.

--- Tuning Tweedie Regressor with Optuna ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be se

TypeError: fit() got an unexpected keyword argument 'early_stopping_rounds'

In [5]:
# ==============================================================================
# VERSION 5.4: High-Performance Single Regressor (Fix for All Library Versions)
# - Removed early stopping from all .fit() calls for backward compatibility.
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import gc
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
N_SPLITS = 5 # Using 5 folds for our robust training

# ==============================================================================
# PART 1: EXTREME FEATURE ENGINEERING
# ==============================================================================

def create_all_features(df):
    """
    Master function to create all features.
    This is run on the combined train+test dataframe to ensure consistency.
    """
    print("Starting feature engineering...")
    
    # 1. Basic Date & Time Features
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['sessionHour'] = pd.to_datetime(df['sessionStart'], unit='s').dt.hour
    df['sessionDayOfWeek'] = df['date'].dt.dayofweek
    df['sessionMonth'] = df['date'].dt.month
    
    # 2. Interaction & Ratio Features
    df['browser_os_interaction'] = df['browser'].astype(str) + '_' + df['os'].astype(str)
    df['hits_per_pageview'] = df['totalHits'] / (df['pageViews'].fillna(0) + 1)
    
    # 3. User-Level Aggregates (Global)
    print("Creating user-level aggregates...")
    df['userId'] = df['userId'].astype(str)
    
    user_agg = df.groupby('userId').agg(
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_session_count=('sessionId', 'nunique'),
        user_unique_days=('date', 'nunique')
    )
    user_agg['user_avg_session_per_day'] = user_agg['user_session_count'] / user_agg['user_unique_days']
    
    df = pd.merge(df, user_agg, on='userId', how='left')

    # 4. Time-Series Features (Lag & Rolling for each user)
    df_sorted = df.sort_values(['userId', 'date', 'sessionStart']).copy()
    
    print("Creating lag features...")
    df_sorted['prev_session_hits'] = df_sorted.groupby('userId')['totalHits'].shift(1)
    df_sorted['time_since_last_session'] = df_sorted.groupby('userId')['sessionStart'].diff()

    print("Creating rolling window features...")
    df_sorted['user_rolling_avg_hits_3'] = df_sorted.groupby('userId')['totalHits'].transform(
        lambda s: s.rolling(3, min_periods=1).mean()
    )
    df_sorted['user_rolling_sum_pageviews_3'] = df_sorted.groupby('userId')['pageViews'].transform(
        lambda s: s.rolling(3, min_periods=1).sum()
    )
    
    # 5. Drop temporary/redundant columns
    cols_to_drop = ['date', 'sessionStart', 'sessionId']
    df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
    
    print("Feature engineering complete.")
    gc.collect()
    return df_sorted

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

test_ids = df_test['userId']
test_indices = df_test.index

one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

train_len = len(df_train)
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0)
combined_df = pd.concat([df_train, df_test], axis=0, sort=False)

combined_df_featured = create_all_features(combined_df)

X = combined_df_featured[:train_len].drop(columns=['purchaseValue'])
y = combined_df_featured[:train_len]['purchaseValue'] / 1e6
X_test = combined_df_featured[train_len:].drop(columns=['purchaseValue'])

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('userId')

print("\nEnforcing string type on categorical columns to prevent mixed-type errors...")
for col in categorical_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

low_card_cols = [col for col in categorical_cols if X[col].nunique() <= 10]
high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]

print(f"Identified {len(numerical_cols)} numerical features.")
print(f"Identified {len(low_card_cols)} low-cardinality categorical features.")
print(f"Identified {len(high_card_cols)} high-cardinality categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING (OPTUNA)
# ==============================================================================

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')) 
    ]), low_card_cols),
], remainder='passthrough')

def objective(trial):
    train_indices, val_indices = next(GroupKFold(n_splits=N_SPLITS).split(X, y, groups=X['userId']))
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes

    X_train_processed = preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = preprocessor.transform(X_val.drop(columns=['userId']))

    params = {
        'objective': 'reg:tweedie',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBRegressor(**params)
    # !!!!! MAJOR CHANGE HERE: Removed early stopping for compatibility !!!!!
    model.fit(X_train_processed, y_train)
    preds = model.predict(X_val_processed)
    
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

print("\n--- Tuning Tweedie Regressor with Optuna ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
best_params = study.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING (K-FOLD ENSEMBLE) & SUBMISSION
# ==============================================================================

print("\n--- Training Final Ensemble Model with Best Parameters ---")
test_predictions = np.zeros(len(X_test))
oof_predictions = np.zeros(len(X))
gkf = GroupKFold(n_splits=N_SPLITS)

final_preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), low_card_cols),
], remainder='passthrough')


for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=X['userId'])):
    print(f"--- Training Fold {fold + 1}/{N_SPLITS} ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test_fold = X_test.copy()
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col], X_test_fold[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
        X_test_fold[col] = X_test_fold[col].astype(all_cats.dtype).cat.codes

    X_train_processed = final_preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = final_preprocessor.transform(X_val.drop(columns=['userId']))
    X_test_processed = final_preprocessor.transform(X_test_fold.drop(columns=['userId']))

    model = xgb.XGBRegressor(**best_params, random_state=42+fold, n_jobs=-1)
    # !!!!! MAJOR CHANGE HERE: Removed early stopping for compatibility !!!!!
    model.fit(X_train_processed, y_train)
              
    val_preds = model.predict(X_val_processed)
    oof_predictions[val_idx] = val_preds
    
    test_fold_preds = model.predict(X_test_processed)
    test_predictions += test_fold_preds / N_SPLITS
    
    gc.collect()

oof_r2 = r2_score(y, oof_predictions)
print(f"\nOverall Out-of-Fold R2 Score: {oof_r2:.5f}")

print("\n--- Generating Final Kaggle Submission ---")
test_predictions[test_predictions < 0] = 0
final_predictions_scaled = test_predictions * 1e6

submission_df = pd.DataFrame({'ID': test_indices, 'purchaseValue': final_predictions_scaled})
submission_df.to_csv('submission_high_performance.csv', index=False)
print("Submission file 'submission_high_performance.csv' created successfully.")
print(submission_df.head())

Loading data...
Starting feature engineering...
Creating user-level aggregates...
Creating lag features...
Creating rolling window features...
Feature engineering complete.

Enforcing string type on categorical columns to prevent mixed-type errors...


[I 2025-07-15 15:19:03,262] A new study created in memory with name: no-name-a4c5a3f5-d4d2-4de1-9e8b-c6d6dcaa82ad


Identified 20 numerical features.
Identified 10 low-cardinality categorical features.
Identified 13 high-cardinality categorical features.

--- Tuning Tweedie Regressor with Optuna ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be se

XGBoostError: [15:19:04] /Users/runner/work/xgboost/xgboost/src/data/data.cc:514: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x000000015e20c428 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x000000015e36a450 xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 2776
  [bt] (2) 3   libxgboost.dylib                    0x000000015e3697ec xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 464
  [bt] (3) 4   libxgboost.dylib                    0x000000015e223a60 XGDMatrixSetInfoFromInterface + 228
  [bt] (4) 5   libffi.dylib                        0x00000001ac3ec050 ffi_call_SYSV + 80
  [bt] (5) 6   libffi.dylib                        0x00000001ac3f4af0 ffi_call_int + 1220
  [bt] (6) 7   _ctypes.cpython-39-darwin.so        0x000000010561b414 PyInit__ctypes + 25272
  [bt] (7) 8   _ctypes.cpython-39-darwin.so        0x0000000105613fcc _ctypes.cpython-39-darwin.so + 16332
  [bt] (8) 9   Python3                             0x0000000102ae6c34 _PyObject_MakeTpCall + 360



In [6]:
# ==============================================================================
# VERSION 5.5: High-Performance Single Regressor (Fix for NaN in Target)
# - Correctly handles missing values in the target variable 'y'.
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import gc
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
N_SPLITS = 5 # Using 5 folds for our robust training

# ==============================================================================
# PART 1: EXTREME FEATURE ENGINEERING
# ==============================================================================

def create_all_features(df):
    """
    Master function to create all features.
    This is run on the combined train+test dataframe to ensure consistency.
    """
    print("Starting feature engineering...")
    
    # 1. Basic Date & Time Features
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['sessionHour'] = pd.to_datetime(df['sessionStart'], unit='s').dt.hour
    df['sessionDayOfWeek'] = df['date'].dt.dayofweek
    df['sessionMonth'] = df['date'].dt.month
    
    # 2. Interaction & Ratio Features
    df['browser_os_interaction'] = df['browser'].astype(str) + '_' + df['os'].astype(str)
    df['hits_per_pageview'] = df['totalHits'] / (df['pageViews'].fillna(0) + 1)
    
    # 3. User-Level Aggregates (Global)
    print("Creating user-level aggregates...")
    df['userId'] = df['userId'].astype(str)
    
    user_agg = df.groupby('userId').agg(
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_session_count=('sessionId', 'nunique'),
        user_unique_days=('date', 'nunique')
    )
    user_agg['user_avg_session_per_day'] = user_agg['user_session_count'] / user_agg['user_unique_days']
    
    df = pd.merge(df, user_agg, on='userId', how='left')

    # 4. Time-Series Features (Lag & Rolling for each user)
    df_sorted = df.sort_values(['userId', 'date', 'sessionStart']).copy()
    
    print("Creating lag features...")
    df_sorted['prev_session_hits'] = df_sorted.groupby('userId')['totalHits'].shift(1)
    df_sorted['time_since_last_session'] = df_sorted.groupby('userId')['sessionStart'].diff()

    print("Creating rolling window features...")
    df_sorted['user_rolling_avg_hits_3'] = df_sorted.groupby('userId')['totalHits'].transform(
        lambda s: s.rolling(3, min_periods=1).mean()
    )
    df_sorted['user_rolling_sum_pageviews_3'] = df_sorted.groupby('userId')['pageViews'].transform(
        lambda s: s.rolling(3, min_periods=1).sum()
    )
    
    # 5. Drop temporary/redundant columns
    cols_to_drop = ['date', 'sessionStart', 'sessionId']
    df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
    
    print("Feature engineering complete.")
    gc.collect()
    return df_sorted

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

test_ids = df_test['userId']
test_indices = df_test.index

one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

train_len = len(df_train)
# Note: We do NOT fillna for purchaseValue here yet.
combined_df = pd.concat([df_train, df_test], axis=0, sort=False)

combined_df_featured = create_all_features(combined_df)

# !!!!! MAJOR CHANGE HERE: Handle NaNs in target AFTER feature creation !!!!!
# Fill NaN in purchaseValue (from test set) with 0, then create X and y
combined_df_featured['purchaseValue'] = combined_df_featured['purchaseValue'].fillna(0)

X = combined_df_featured[:train_len].drop(columns=['purchaseValue'])
y = combined_df_featured[:train_len]['purchaseValue'] / 1e6
X_test = combined_df_featured[train_len:].drop(columns=['purchaseValue'])

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.remove('userId')

print("\nEnforcing string type on categorical columns to prevent mixed-type errors...")
for col in categorical_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

low_card_cols = [col for col in categorical_cols if X[col].nunique() <= 10]
high_card_cols = [col for col in categorical_cols if X[col].nunique() > 10]

print(f"Identified {len(numerical_cols)} numerical features.")
print(f"Identified {len(low_card_cols)} low-cardinality categorical features.")
print(f"Identified {len(high_card_cols)} high-cardinality categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING (OPTUNA)
# ==============================================================================

preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')) 
    ]), low_card_cols),
], remainder='passthrough')

def objective(trial):
    train_indices, val_indices = next(GroupKFold(n_splits=N_SPLITS).split(X, y, groups=X['userId']))
    X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes

    X_train_processed = preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = preprocessor.transform(X_val.drop(columns=['userId']))

    params = {
        'objective': 'reg:tweedie',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.9),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train_processed, y_train)
    preds = model.predict(X_val_processed)
    
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

print("\n--- Tuning Tweedie Regressor with Optuna ---")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
best_params = study.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING (K-FOLD ENSEMBLE) & SUBMISSION
# ==============================================================================

print("\n--- Training Final Ensemble Model with Best Parameters ---")
test_predictions = np.zeros(len(X_test))
oof_predictions = np.zeros(len(X))
gkf = GroupKFold(n_splits=N_SPLITS)

final_preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    ('low_card_cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]), low_card_cols),
], remainder='passthrough')


for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=X['userId'])):
    print(f"--- Training Fold {fold + 1}/{N_SPLITS} ---")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test_fold = X_test.copy()
    
    for col in high_card_cols:
        all_cats = pd.concat([X_train[col], X_val[col], X_test_fold[col]]).astype('category')
        X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
        X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
        X_test_fold[col] = X_test_fold[col].astype(all_cats.dtype).cat.codes

    X_train_processed = final_preprocessor.fit_transform(X_train.drop(columns=['userId']))
    X_val_processed = final_preprocessor.transform(X_val.drop(columns=['userId']))
    X_test_processed = final_preprocessor.transform(X_test_fold.drop(columns=['userId']))

    model = xgb.XGBRegressor(**best_params, random_state=42+fold, n_jobs=-1)
    model.fit(X_train_processed, y_train)
              
    val_preds = model.predict(X_val_processed)
    oof_predictions[val_idx] = val_preds
    
    test_fold_preds = model.predict(X_test_processed)
    test_predictions += test_fold_preds / N_SPLITS
    
    gc.collect()

oof_r2 = r2_score(y, oof_predictions)
print(f"\nOverall Out-of-Fold R2 Score: {oof_r2:.5f}")

print("\n--- Generating Final Kaggle Submission ---")
test_predictions[test_predictions < 0] = 0
final_predictions_scaled = test_predictions * 1e6

submission_df = pd.DataFrame({'ID': test_indices, 'purchaseValue': final_predictions_scaled})
submission_df.to_csv('submission_high_performance.csv', index=False)
print("Submission file 'submission_high_performance.csv' created successfully.")
print(submission_df.head())

Loading data...
Starting feature engineering...
Creating user-level aggregates...
Creating lag features...
Creating rolling window features...
Feature engineering complete.

Enforcing string type on categorical columns to prevent mixed-type errors...


[I 2025-07-15 15:20:30,894] A new study created in memory with name: no-name-01e5343e-9f87-47d1-88a1-6f68894dde6b


Identified 20 numerical features.
Identified 10 low-cardinality categorical features.
Identified 13 high-cardinality categorical features.

--- Tuning Tweedie Regressor with Optuna ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[col] = X_val[col].astype(all_cats.dtype).cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype(all_cats.dtype).cat.codes
A value is trying to be se

ValueError: Input contains NaN.