In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

# ==============================================================================
# PART 1: PREPROCESSING (This part remains mostly the same)
# ==============================================================================

class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    # ... (This class is unchanged)
    def fit(self, X, y=None): return self
    def transform(self, X):
        X_copy = X.copy()
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        X_copy['ad_page_binned'] = X_copy['trafficSource.adwordsClickInfo.page'].apply(lambda p: 1 if p == 1.0 else (2 if pd.notna(p) else 0))
        cols_to_drop = ['date', 'sessionStart', 'userId', 'sessionId', 'trafficSource.adwordsClickInfo.page']
        X_copy = X_copy.drop(columns=cols_to_drop, errors='ignore')
        return X_copy

# For this simpler model, Target Encoding is fine, but we learn from the log_purchaseValue
class TargetEncoder(BaseEstimator, TransformerMixin):
    # ... (This class is unchanged)
    def __init__(self, columns=None):
        self.columns = columns
        self.mappings_ = {}
        self.global_mean_ = 0
    def fit(self, X, y):
        X_fit = X.copy()
        y_fit = y.copy()
        self.global_mean_ = np.mean(y_fit)
        for col in self.columns:
            X_fit[col] = X_fit[col].fillna('missing')
            mapping = y_fit.groupby(X_fit[col]).mean().to_dict()
            self.mappings_[col] = mapping
        return self
    def transform(self, X):
        X_transform = X.copy()
        for col in self.columns:
            X_transform[col] = X_transform[col].fillna('missing')
            X_transform[col] = X_transform[col].map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: LOADING DATA AND TRAINING THE SINGLE-REGRESSOR MODEL
# ==============================================================================

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

# --- Load and Prepare Raw Training Data ---
print("Loading and preparing training data...")
df = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
one_value_cols = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
df = df.drop(columns=one_value_cols)

# Create the single target variable: log_purchaseValue
df['purchaseValue'] = df['purchaseValue'].fillna(0) / 1e6 # Rescale
df['log_purchaseValue'] = np.log1p(df['purchaseValue'])

# Separate features (X) and our single target (y_log)
X = df.drop(columns=['purchaseValue', 'log_purchaseValue'])
y_log = df['log_purchaseValue']

# Split data
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.25, random_state=42)

# --- Define the full preprocessing pipeline ---
print("Defining the preprocessing pipeline...")
temp_engineered_df = FeatureEngineeringTransformer().fit_transform(X_train)
numerical_cols = ['sessionNumber', 'pageViews', 'totalHits', 'sessionYear', 'sessionMonth', 'sessionDayOfWeek', 'sessionHour']
categorical_cols = [col for col in temp_engineered_df.columns if col not in numerical_cols]

# The target encoder will now learn from the log_purchaseValue on the whole training set
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)
    ],
    remainder='drop'
)

# --- Build and Train the SINGLE Regressor Pipeline ---
print("\n--- Building and Training Single Regressor Model ---")
model_pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineeringTransformer()),
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', random_state=42, n_estimators=250, max_depth=7, learning_rate=0.05))
])

# Fit the entire pipeline on the training data and log-transformed target
model_pipeline.fit(X_train, y_train_log)

# ==============================================================================
# PART 3: EVALUATION AND KAGGLE SUBMISSION
# ==============================================================================

# --- Evaluate on the local test set ---
print("\n--- Evaluating Model on Local Test Set---")
y_pred_log = model_pipeline.predict(X_test)

# Inverse transform predictions and cap at zero as you suggested
y_pred = np.expm1(y_pred_log)
y_pred[y_pred < 0] = 0

# Get original test values for comparison
y_test_orig = np.expm1(y_test_log)

r2 = r2_score(y_test_orig, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))
print(f"Local Test Set R² Score: {r2:.4f}")
print(f"Local Test Set RMSE: ${rmse:.2f}")

# --- Generate Kaggle Submission File ---
print("\n--- Generating Kaggle Submission File ---")
try:
    kaggle_test_df = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})
    
    # Use the TRAINED pipeline to predict on the new, unseen test data
    kaggle_pred_log = model_pipeline.predict(kaggle_test_df)
    
    # Inverse transform and cap at zero
    kaggle_final_predictions = np.expm1(kaggle_pred_log)
    kaggle_final_predictions[kaggle_final_predictions < 0] = 0
    
    # Create the submission DataFrame
    submission_df = pd.DataFrame({
        'ID': kaggle_test_df.index,
        'purchaseValue': kaggle_final_predictions
    })
    
    submission_df.to_csv('submission.csv', index=False)
    print("Submission file 'submission.csv' created successfully.")
    
except FileNotFoundError:
    print(f"\nKaggle '{TEST_FILE_PATH}' not found. Skipping submission file generation.")
except Exception as e:
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing training data...
Defining the preprocessing pipeline...

--- Building and Training Single Regressor Model ---

--- Evaluating Model on Local Test Set---
Local Test Set R² Score: 0.1493
Local Test Set RMSE: $196.78

--- Generating Kaggle Submission File ---
Submission file 'submission.csv' created successfully.


In [4]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

# ==============================================================================
# PART 1: DEFINING THE PREPROCESSING PIPELINE
# These are the reusable building blocks of our system.
# ==============================================================================

class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """
    Performs initial feature engineering: creates date/time features,
    bins the AdWords page feature, and drops unneeded ID columns.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # Date and Timestamp Engineering
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        
        # Binning the AdWords Page feature
        X_copy['ad_page_binned'] = X_copy['trafficSource.adwordsClickInfo.page'].apply(
            lambda p: 1 if p == 1.0 else (2 if pd.notna(p) else 0)
        )
        
        # Drop original/processed columns
        cols_to_drop = [
            'date', 'sessionStart', 'userId', 'sessionId', 
            'trafficSource.adwordsClickInfo.page'
        ]
        X_copy = X_copy.drop(columns=cols_to_drop, errors='ignore')
        
        return X_copy

class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    A leak-proof Target Encoder. It learns the mean of the target for each category
    from the training data ONLY and applies it to all data.
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.mappings_ = {}
        self.global_mean_ = 0

    def fit(self, X, y):
        X_fit = X.copy()
        y_fit = y.copy()
        
        self.global_mean_ = np.mean(y_fit)
        
        for col in self.columns:
            # Fill NaNs in the feature column before grouping
            X_fit[col] = X_fit[col].fillna('missing')
            mapping = y_fit.groupby(X_fit[col]).mean().to_dict()
            self.mappings_[col] = mapping
        return self

    def transform(self, X):
        X_transform = X.copy()
        for col in self.columns:
            X_transform[col] = X_transform[col].fillna('missing')
            # Apply the learned mapping. For new categories in test data, use the global mean.
            X_transform[col] = X_transform[col].map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: LOADING DATA AND TRAINING THE MODELS
# ==============================================================================

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

# --- Load and Prepare Raw Training Data ---
print("Loading and preparing training data...")
df = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'}) # Load ID as string

# Initial cleanup: Drop columns with only one unique value
one_value_cols = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
df = df.drop(columns=one_value_cols)

# Create target variables
df['purchaseValue'] = df['purchaseValue'].fillna(0) / 1e6 # Handle NaNs and rescale
df['made_purchase'] = (df['purchaseValue'] > 0).astype(int)
df['log_purchaseValue'] = np.log1p(df['purchaseValue'])

# Separate features and targets
X = df.drop(columns=['purchaseValue', 'made_purchase', 'log_purchaseValue'])
y = df[['purchaseValue', 'made_purchase', 'log_purchaseValue']]

# Split data before any fitting to prevent leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y['made_purchase']
)

# --- Define the full preprocessing pipeline ---
print("Defining the preprocessing pipeline...")
# First, apply feature engineering to get the final column names
temp_engineered_df = FeatureEngineeringTransformer().fit_transform(X_train)
numerical_cols = ['sessionNumber', 'pageViews', 'totalHits', 'sessionYear', 'sessionMonth', 'sessionDayOfWeek', 'sessionHour']
categorical_cols = [col for col in temp_engineered_df.columns if col not in numerical_cols]

# This pipeline encapsulates all preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)
    ],
    remainder='drop'
)

# --- Build and Train the Two-Part Model ---

# PART 2A: CLASSIFIER (Will they buy?)
print("\n--- Building and Training Classifier ---")
clf_pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineeringTransformer()),
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42))
])

# Define a parameter grid for tuning
param_grid_clf = {
    'classifier__n_estimators': [150, 250],
    'classifier__max_depth': [5, 7],
    'classifier__learning_rate': [0.05],
}
grid_search_clf = GridSearchCV(clf_pipeline, param_grid_clf, scoring='roc_auc', cv=3, verbose=1, n_jobs=-1)
grid_search_clf.fit(X_train, y_train['made_purchase'])
best_clf_pipeline = grid_search_clf.best_estimator_
print(f"Best Classifier Params: {grid_search_clf.best_params_}")

# PART 2B: REGRESSOR (How much will they spend?)
print("\n--- Building and Training Regressor ---")
reg_pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineeringTransformer()),
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', random_state=42))
])

# Fit the regressor pipeline ONLY on the buyer data
X_train_buyers = X_train[y_train['made_purchase'] == 1]
y_train_buyers_log = y_train.loc[y_train['made_purchase'] == 1, 'log_purchaseValue']
reg_pipeline.fit(X_train_buyers, y_train_buyers_log)

# ==============================================================================
# PART 3: EVALUATION AND KAGGLE SUBMISSION (Corrected Submission Part)
# ==============================================================================

# ... (The evaluation part on the local test set is correct and can remain as is) ...
# print(f"Local Test Set R² Score: {r2:.4f}")
# print(f"Local Test Set RMSE: ${rmse:.2f}")

# --- Generate Kaggle Submission File ---
print("\n--- Generating Kaggle Submission File (with correct scaling) ---")
try:
    kaggle_test_df = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})
    
    # Use the TRAINED pipeline to predict on the new, unseen test data
    kaggle_pred_log = model_pipeline.predict(kaggle_test_df)
    
    # Inverse transform to get predicted dollar amounts
    kaggle_final_predictions_dollars = np.expm1(kaggle_pred_log)
    kaggle_final_predictions_dollars[kaggle_final_predictions_dollars < 0] = 0
    
    # ====================================================================
    # THE FIX: Scale the final dollar predictions back up by 1,000,000
    # ====================================================================
    scaling_factor = 1e6
    kaggle_final_predictions_scaled = kaggle_final_predictions_dollars * scaling_factor
    
    # Create the submission DataFrame with the scaled values
    submission_df = pd.DataFrame({
        'ID': kaggle_test_df.index,
        'purchaseValue': kaggle_final_predictions_scaled
    })
    
    submission_df.to_csv('submission.csv', index=False)
    print("Submission file 'submission.csv' created successfully with scaled-up values.")
    
except FileNotFoundError:
    print(f"\nKaggle '{TEST_FILE_PATH}' not found. Skipping submission file generation.")
except Exception as e:
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing training data...
Defining the preprocessing pipeline...

--- Building and Training Classifier ---
Fitting 3 folds for each of 4 candidates, totalling 12 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Classifier Params: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 7, 'classifier__n_estimators': 250}

--- Building and Training Regressor ---

--- Generating Kaggle Submission File (with correct scaling) ---
Submission file 'submission.csv' created successfully with scaled-up values.


In [6]:
#Model improvmenet 2
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

# ==============================================================================
# PART 1: ENHANCED PREPROCESSING PIPELINE
# ==============================================================================

class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """
    Enhanced version: Creates date/time features, bins AdWords,
    AND adds a new interaction feature.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # Date and Timestamp Engineering
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        
        # *** NEW FEATURE ***: Interaction between month and day of week
        X_copy['month_day_interaction'] = X_copy['sessionMonth'].astype(str) + '_' + X_copy['sessionDayOfWeek'].astype(str)
        
        # Binning the AdWords Page feature
        X_copy['ad_page_binned'] = X_copy['trafficSource.adwordsClickInfo.page'].apply(
            lambda p: 1 if p == 1.0 else (2 if pd.notna(p) else 0)
        )
        
        # Drop original/processed columns
        cols_to_drop = [
            'date', 'sessionStart', 'userId', 'sessionId', 
            'trafficSource.adwordsClickInfo.page'
        ]
        X_copy = X_copy.drop(columns=cols_to_drop, errors='ignore')
        
        return X_copy

class TargetEncoder(BaseEstimator, TransformerMixin):
    # This class is unchanged, it's already robust and leak-proof
    def __init__(self, columns=None):
        self.columns = columns
        self.mappings_ = {}
        self.global_mean_ = 0
    def fit(self, X, y):
        X_fit = X.copy()
        y_fit = y.copy()
        self.global_mean_ = np.mean(y_fit)
        for col in self.columns:
            X_fit[col] = X_fit[col].fillna('missing')
            mapping = y_fit.groupby(X_fit[col]).mean().to_dict()
            self.mappings_[col] = mapping
        return self
    def transform(self, X):
        X_transform = X.copy()
        for col in self.columns:
            X_transform[col] = X_transform[col].fillna('missing')
            X_transform[col] = X_transform[col].map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: LOADING DATA AND TRAINING THE IMPROVED HURDLE MODEL
# ==============================================================================

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

# --- Load and Prepare Raw Training Data ---
print("Loading and preparing training data...")
df = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
one_value_cols = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
df = df.drop(columns=one_value_cols)

# Create target variables
df['purchaseValue'] = df['purchaseValue'].fillna(0) / 1e6 # Rescale
df['made_purchase'] = (df['purchaseValue'] > 0).astype(int)
df['log_purchaseValue'] = np.log1p(df['purchaseValue'])

# Separate features and targets
X = df.drop(columns=['purchaseValue', 'made_purchase', 'log_purchaseValue'])
y = df[['purchaseValue', 'made_purchase', 'log_purchaseValue']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y['made_purchase']
)

# --- Define Column Groups based on the new Feature Transformer ---
print("Defining column groups for preprocessing...")
temp_engineered_df = FeatureEngineeringTransformer().fit_transform(X_train)
numerical_cols = ['sessionNumber', 'pageViews', 'totalHits', 'sessionYear', 'sessionMonth', 'sessionDayOfWeek', 'sessionHour']
categorical_cols = [col for col in temp_engineered_df.columns if col not in numerical_cols]

# --- Build and Train the Classifier Component (Part 1) ---
print("\n--- Building and Training Classifier ---")
clf_preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)
    ],
    remainder='drop'
)
clf_pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineeringTransformer()),
    ('preprocessor', clf_preprocessor),
    ('classifier', xgb.XGBClassifier(
        objective='binary:logistic', eval_metric='logloss', use_label_encoder=False,
        random_state=42, n_estimators=500, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8
    ))
])

# Fit the classifier pipeline on the binary target
clf_pipeline.fit(X_train, y_train['made_purchase'])

# --- Build and Train the Regressor Component (Part 2) ---
print("\n--- Building and Training Regressor ---")
reg_preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)
    ],
    remainder='drop'
)
reg_pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineeringTransformer()),
    ('preprocessor', reg_preprocessor),
    ('regressor', xgb.XGBRegressor(
        objective='reg:squarederror', eval_metric='rmse', random_state=42,
        n_estimators=500, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8
    ))
])

# Fit the regressor pipeline ONLY on the buyer data
X_train_buyers = X_train[y_train['made_purchase'] == 1]
y_train_buyers_log = y_train.loc[y_train['made_purchase'] == 1, 'log_purchaseValue']
reg_pipeline.fit(X_train_buyers, y_train_buyers_log)

# ==============================================================================
# PART 3: EVALUATION AND KAGGLE SUBMISSION
# ==============================================================================

# --- Evaluate on the local test set ---
print("\n--- Evaluating Improved Model on Local Test Set---")
prob_purchase = clf_pipeline.predict_proba(X_test)[:, 1]
log_value_pred = reg_pipeline.predict(X_test)
value_pred = np.expm1(log_value_pred)
final_predictions = prob_purchase * value_pred
final_predictions[final_predictions < 0] = 0

actual_values = y_test['purchaseValue']
r2 = r2_score(actual_values, final_predictions)
rmse = np.sqrt(mean_squared_error(actual_values, final_predictions))
print(f"Local Test Set R² Score: {r2:.4f}")
print(f"Local Test Set RMSE: ${rmse:.2f}")

# --- Generate Kaggle Submission File ---
print("\n--- Generating Kaggle Submission File ---")
try:
    kaggle_test_df = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})
    
    # Use the TRAINED pipelines to predict on the new, unseen test data
    kaggle_prob_purchase = clf_pipeline.predict_proba(kaggle_test_df)[:, 1]
    kaggle_log_value_pred = reg_pipeline.predict(kaggle_test_df)
    
    # Combine, inverse transform, and scale up for submission
    kaggle_value_pred = np.expm1(kaggle_log_value_pred)
    kaggle_final_predictions_dollars = kaggle_prob_purchase * kaggle_value_pred
    kaggle_final_predictions_dollars[kaggle_final_predictions_dollars < 0] = 0
    
    scaling_factor = 1e6
    kaggle_final_predictions_scaled = kaggle_final_predictions_dollars * scaling_factor
    
    # Create the submission DataFrame
    submission_df = pd.DataFrame({
        'ID': kaggle_test_df.index,
        'purchaseValue': kaggle_final_predictions_scaled
    })
    
    submission_df.to_csv('submission_v2.csv', index=False) # Saving as v2
    print("Submission file 'submission_v2.csv' created successfully.")
    
except FileNotFoundError:
    print(f"\nKaggle '{TEST_FILE_PATH}' not found. Skipping submission file generation.")
except Exception as e:
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing training data...
Defining column groups for preprocessing...

--- Building and Training Classifier ---


Parameters: { "use_label_encoder" } are not used.




--- Building and Training Regressor ---

--- Evaluating Improved Model on Local Test Set---
Local Test Set R² Score: 0.4787
Local Test Set RMSE: $126.94

--- Generating Kaggle Submission File ---
Submission file 'submission_v2.csv' created successfully.


In [8]:
#Version 3
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score
import optuna
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# ==============================================================================
# PART 1: ADVANCED FEATURE ENGINEERING
# ==============================================================================

class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Creates a rich set of features for the model.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # Date/Time Engineering
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        X_copy['is_weekend'] = (X_copy['sessionDayOfWeek'] >= 5).astype(int)
        
        # Interaction Features
        X_copy['month_day_interaction'] = X_copy['sessionMonth'].astype(str) + '_' + X_copy['sessionDayOfWeek'].astype(str)
        X_copy['browser_os_interaction'] = X_copy['browser'].astype(str) + '_' + X_copy['os'].astype(str)
        
        # Ratio Features (handle division by zero)
        X_copy['hits_per_pageview'] = X_copy['totalHits'] / (X_copy['pageViews'] + 1e-6)
        
        # Binning AdWords Page
        X_copy['ad_page_binned'] = X_copy['trafficSource.adwordsClickInfo.page'].apply(
            lambda p: 1 if p == 1.0 else (2 if pd.notna(p) else 0)
        )
        
        cols_to_drop = ['date', 'sessionStart', 'userId', 'sessionId', 'trafficSource.adwordsClickInfo.page']
        X_copy = X_copy.drop(columns=cols_to_drop, errors='ignore')
        
        return X_copy

# TargetEncoder class remains the same
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns; self.mappings_ = {}; self.global_mean_ = 0
    def fit(self, X, y):
        X_fit, y_fit = X.copy(), y.copy()
        self.global_mean_ = np.mean(y_fit)
        for col in self.columns:
            X_fit[col] = X_fit[col].fillna('missing')
            self.mappings_[col] = y_fit.groupby(X_fit[col]).mean().to_dict()
        return self
    def transform(self, X):
        X_transform = X.copy()
        for col in self.columns:
            X_transform[col] = X_transform[col].fillna('missing')
            X_transform[col] = X_transform[col].map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

print("Loading and preparing data...")
df = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
one_value_cols = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
df = df.drop(columns=one_value_cols)

df['purchaseValue'] = df['purchaseValue'].fillna(0) / 1e6
df['made_purchase'] = (df['purchaseValue'] > 0).astype(int)
df['log_purchaseValue'] = np.log1p(df['purchaseValue'])

X = df.drop(columns=['purchaseValue', 'made_purchase', 'log_purchaseValue'])
y = df[['made_purchase', 'log_purchaseValue']]

# --- Define Column Groups for Preprocessing ---
temp_engineered_df = AdvancedFeatureEngineering().fit_transform(X)
numerical_cols = ['sessionNumber', 'pageViews', 'totalHits', 'sessionYear', 'sessionMonth', 'sessionDayOfWeek', 'sessionHour', 'hits_per_pageview']
categorical_cols = [col for col in temp_engineered_df.columns if col not in numerical_cols]

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING WITH OPTUNA
# ==============================================================================

# --- Tuning the Classifier ---
print("\n--- Tuning Classifier with Optuna ---")
X_engineered = AdvancedFeatureEngineering().fit_transform(X)
y_clf_target = y['made_purchase']

def objective_clf(trial):
    params = {
        'objective': 'binary:logistic', 'eval_metric': 'logloss', 'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }
    
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, val_idx in cv.split(X_engineered):
        X_train, X_val = X_engineered.iloc[train_idx], X_engineered.iloc[val_idx]
        y_train, y_val = y_clf_target.iloc[train_idx], y_clf_target.iloc[val_idx]

        preprocessor = ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')
        
        preprocessor.fit(X_train, y_train)
        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        model = xgb.XGBClassifier(**params)
        model.fit(X_train_processed, y_train)
        preds = model.predict_proba(X_val_processed)[:, 1]
        cv_scores.append(roc_auc_score(y_val, preds))
        
    return np.mean(cv_scores)

study_clf = optuna.create_study(direction='maximize')
study_clf.optimize(objective_clf, n_trials=30) # Run 30 trials
best_clf_params = study_clf.best_params

# --- Tuning the Regressor ---
print("\n--- Tuning Regressor with Optuna ---")
X_buyers_engineered = X_engineered[y['made_purchase'] == 1]
y_reg_target = y.loc[y['made_purchase'] == 1, 'log_purchaseValue']

def objective_reg(trial):
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
    }

    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in cv.split(X_buyers_engineered):
        X_train, X_val = X_buyers_engineered.iloc[train_idx], X_buyers_engineered.iloc[val_idx]
        y_train, y_val = y_reg_target.iloc[train_idx], y_reg_target.iloc[val_idx]

        preprocessor = ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')

        preprocessor.fit(X_train, y_train)
        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        model = xgb.XGBRegressor(**params)
        model.fit(X_train_processed, y_train)
        preds = model.predict(X_val_processed)
        cv_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    return np.mean(cv_scores)

study_reg = optuna.create_study(direction='minimize')
study_reg.optimize(objective_reg, n_trials=30) # Run 30 trials
best_reg_params = study_reg.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING AND SUBMISSION
# ==============================================================================

print("\n--- Training Final Models with Best Parameters ---")
# Build and fit the final classifier pipeline on ALL training data
final_clf_pipeline = Pipeline([
    ('engineering', AdvancedFeatureEngineering()),
    ('preprocessing', ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')),
    ('classifier', xgb.XGBClassifier(**best_clf_params, random_state=42))
])
final_clf_pipeline.fit(X, y['made_purchase'])

# Build and fit the final regressor pipeline on ALL buyer data
final_reg_pipeline = Pipeline([
    ('engineering', AdvancedFeatureEngineering()),
    ('preprocessing', ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')),
    ('regressor', xgb.XGBRegressor(**best_reg_params, random_state=42))
])
final_reg_pipeline.fit(X[y['made_purchase'] == 1], y.loc[y['made_purchase'] == 1, 'log_purchaseValue'])

print("\n--- Generating Final Kaggle Submission ---")
try:
    kaggle_test_df = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})
    
    kaggle_prob_purchase = final_clf_pipeline.predict_proba(kaggle_test_df)[:, 1]
    kaggle_log_value_pred = final_reg_pipeline.predict(kaggle_test_df)
    
    kaggle_value_pred = np.expm1(kaggle_log_value_pred)
    kaggle_final_predictions_dollars = kaggle_prob_purchase * kaggle_value_pred
    kaggle_final_predictions_dollars[kaggle_final_predictions_dollars < 0] = 0
    
    kaggle_final_predictions_scaled = kaggle_final_predictions_dollars * 1e6
    
    submission_df = pd.DataFrame({'ID': kaggle_test_df.index, 'purchaseValue': kaggle_final_predictions_scaled})
    submission_df.to_csv('submission_final.csv', index=False)
    print("Submission file 'submission_final.csv' created successfully.")
    
except FileNotFoundError:
    print(f"\nKaggle '{TEST_FILE_PATH}' not found.")
except Exception as e:
    print(f"\nAn error occurred during submission generation: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Loading and preparing data...

--- Tuning Classifier with Optuna ---


[I 2025-07-15 00:26:40,458] A new study created in memory with name: no-name-6c41a0b1-3d58-40b2-b75d-093d21c11c53
[I 2025-07-15 00:26:53,166] Trial 0 finished with value: 0.9895921285936123 and parameters: {'n_estimators': 525, 'max_depth': 6, 'learning_rate': 0.02148297041874146, 'subsample': 0.7032080682122083, 'colsample_bytree': 0.9596181215638754, 'gamma': 0.0005657090223697375}. Best is trial 0 with value: 0.9895921285936123.
[I 2025-07-15 00:27:10,279] Trial 1 finished with value: 0.989770511295699 and parameters: {'n_estimators': 822, 'max_depth': 7, 'learning_rate': 0.013764038342184735, 'subsample': 0.7207228809965966, 'colsample_bytree': 0.8000753610077241, 'gamma': 7.190943040730275e-05}. Best is trial 1 with value: 0.989770511295699.
[I 2025-07-15 00:27:28,070] Trial 2 finished with value: 0.98997017883108 and parameters: {'n_estimators': 797, 'max_depth': 8, 'learning_rate': 0.028451888420636683, 'subsample': 0.9304222654419644, 'colsample_bytree': 0.9383577314058353, 'ga


--- Tuning Regressor with Optuna ---


[I 2025-07-15 00:34:30,143] Trial 0 finished with value: 0.9575587392614255 and parameters: {'n_estimators': 971, 'max_depth': 4, 'learning_rate': 0.054708732513392304, 'subsample': 0.8704366213692751, 'colsample_bytree': 0.9425269589406653}. Best is trial 0 with value: 0.9575587392614255.
[I 2025-07-15 00:34:34,981] Trial 1 finished with value: 0.9764607085497653 and parameters: {'n_estimators': 865, 'max_depth': 4, 'learning_rate': 0.02645182855018343, 'subsample': 0.9242054867729493, 'colsample_bytree': 0.9298378923730929}. Best is trial 0 with value: 0.9575587392614255.
[I 2025-07-15 00:34:39,945] Trial 2 finished with value: 0.9586431807515525 and parameters: {'n_estimators': 850, 'max_depth': 4, 'learning_rate': 0.059911212521302375, 'subsample': 0.8322016660765699, 'colsample_bytree': 0.8934360540316002}. Best is trial 0 with value: 0.9575587392614255.
[I 2025-07-15 00:34:46,227] Trial 3 finished with value: 0.9474746229601186 and parameters: {'n_estimators': 799, 'max_depth': 6


--- Training Final Models with Best Parameters ---

--- Generating Final Kaggle Submission ---
Submission file 'submission_final.csv' created successfully.
