In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error

# ==============================================================================
# A: Custom Transformers (No changes here)
# ==============================================================================
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless first-stage feature engineering for the raw session table.

    ``transform`` works on a copy of its input and:

    * parses ``date`` (format ``%Y%m%d``) and derives ``sessionYear``,
      ``sessionMonth`` and ``sessionDayOfWeek`` from it;
    * derives ``sessionHour`` from ``sessionStart`` (read as seconds since epoch);
    * bins ``trafficSource.adwordsClickInfo.page`` into ``ad_page_binned``
      (0 = missing, 1 = value equals 1, 2 = any other non-missing value);
    * rewrites ``gclIdPresent`` as a 0/1 integer flag (1 only where it equals 1);
    * drops ``date``, ``sessionStart``, ``userId``, ``sessionId`` and the raw
      ad-page column (columns that are absent are ignored by the drop).
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return an engineered copy of ``X``; the input frame is left untouched."""
        X_copy = X.copy()

        # Calendar features from the session date.
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek

        # Hour of day from the session start timestamp.
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour

        # Vectorised binning (replaces a per-row Python lambda). np.select takes
        # the first matching condition, so "equals 1" wins over "not missing".
        ad_page = X_copy['trafficSource.adwordsClickInfo.page']
        X_copy['ad_page_binned'] = np.select(
            [(ad_page == 1.0).to_numpy(), ad_page.notna().to_numpy()],
            [1, 2],
            default=0,
        ).astype('int64')

        X_copy['gclIdPresent'] = (X_copy['gclIdPresent'] == 1).astype(int)

        cols_to_drop = ['date', 'sessionStart', 'userId', 'sessionId', 'trafficSource.adwordsClickInfo.page']
        return X_copy.drop(columns=cols_to_drop, errors='ignore')

class TargetEncoder(BaseEstimator, TransformerMixin):
    """Replace each category with the mean target observed for it during ``fit``.

    Missing values are treated as their own category (``'missing'``).
    Categories that were not seen during ``fit`` are encoded with the global
    mean of the target.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode. When ``None``, every column of the frame passed to
        ``fit`` is encoded.

    Attributes
    ----------
    columns_ : list of str
        Columns actually encoded (resolved in ``fit``).
    mappings_ : dict
        ``{column: {category: mean target}}`` learned in ``fit``.
    global_mean_ : float
        Mean of the target over all rows seen in ``fit``.
    """

    def __init__(self, columns=None):
        # Only the hyper-parameter is stored here. Learned state (trailing
        # underscore) is created in fit(), following the scikit-learn estimator
        # convention, so an unfitted encoder does not look fitted.
        self.columns = columns

    def fit(self, X, y):
        """Learn the per-category target means from ``X`` and ``y``.

        ``y`` may be a pandas Series (grouped by index alignment with ``X``)
        or any 1-D array-like with one value per row of ``X``.
        """
        # Array-like targets get X's index so the groupby below lines up row by row.
        target = y if isinstance(y, pd.Series) else pd.Series(np.asarray(y), index=X.index)

        columns = list(X.columns) if self.columns is None else list(self.columns)

        # Build into a fresh dict so a re-fit never inherits stale mappings.
        mappings = {}
        for col in columns:
            keys = X[col].fillna('missing')
            mappings[col] = target.groupby(keys).mean().to_dict()

        self.columns_ = columns
        self.mappings_ = mappings
        self.global_mean_ = np.mean(target)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by target means."""
        X_transform = X.copy()
        for col in self.columns_:
            keys = X_transform[col].fillna('missing')
            # Categories absent from the mapping come back as NaN from map();
            # they fall back to the global target mean.
            X_transform[col] = keys.map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# B: Load Data and Prepare Targets
# ==============================================================================
TRAIN_CSV_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

# Columns with exactly one distinct non-null value are dropped.
# NOTE(review): nunique() ignores NaN, so a column holding one value plus NaNs
# is dropped as well -- confirm that is intended.
distinct_counts = df.nunique()
one_value_cols = distinct_counts[distinct_counts == 1].index.tolist()

# Rescale the target by 1e6, then derive the purchase flag and the log target.
df = (
    df.drop(columns=one_value_cols)
    .assign(purchaseValue=lambda frame: frame['purchaseValue'] / 1e6)
    .assign(
        made_purchase=lambda frame: (frame['purchaseValue'] > 0).astype(int),
        log_purchaseValue=lambda frame: np.log1p(frame['purchaseValue']),
    )
)

target_cols = ['purchaseValue', 'made_purchase', 'log_purchaseValue']
X = df.drop(columns=target_cols)
y = df[target_cols]

# Hold out 25% of the rows, keeping the buyer / non-buyer ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y['made_purchase'],
)

# ==============================================================================
# C: THE FIX - Apply Initial Engineering FIRST, then Define ColumnTransformer
# ==============================================================================

# 1. Run the first-stage engineering so the final column set is known.
feature_engineer = FeatureEngineeringTransformer()
X_train_engineered = feature_engineer.fit_transform(X_train)
X_test_engineered = feature_engineer.transform(X_test)

# 2. Column lists are derived from the *engineered* frame: the names below are
#    treated as numeric, every other remaining column as categorical.
numerical_cols = [
    'sessionNumber',
    'pageViews',
    'totalHits',
    'sessionYear',
    'sessionMonth',
    'sessionDayOfWeek',
    'sessionHour',
]
categorical_cols = [name for name in X_train_engineered.columns if name not in numerical_cols]

# 3. Second-stage preprocessor: median-impute + standardise the numeric columns,
#    target-encode the categorical ones, drop anything else.
# NOTE(review): `preprocessor` is not referenced by the later cells shown in
# this notebook -- confirm whether it is still needed.
numeric_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)
category_encoder = TargetEncoder(columns=categorical_cols)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', category_encoder, categorical_cols),
    ],
    remainder='drop',
)




In [3]:

# ==============================================================================
# MODEL PIPELINING AND TRAINING
# ==============================================================================

# ==============================================================================
# Step 1: Apply Initial Feature Engineering (to create the final column set)
# ==============================================================================
print("Applying initial feature engineering...")

# NOTE(review): this repeats the engineering and column-list definitions from
# the previous cell; the results are the same because the transformer is stateless.
feature_engineer = FeatureEngineeringTransformer().fit(X_train)
X_train_engineered = feature_engineer.transform(X_train)
X_test_engineered = feature_engineer.transform(X_test)

# Column roles are decided on the *engineered* frame: the listed names are
# numeric, everything else that survived engineering is categorical.
raw_numeric_cols = ['sessionNumber', 'pageViews', 'totalHits']
derived_numeric_cols = ['sessionYear', 'sessionMonth', 'sessionDayOfWeek', 'sessionHour']
numerical_cols = raw_numeric_cols + derived_numeric_cols

categorical_cols = []
for column_name in X_train_engineered.columns:
    if column_name not in numerical_cols:
        categorical_cols.append(column_name)

print("Initial feature engineering complete.")

# ==============================================================================
# Step 2: Define and Train the Classifier Component
# ==============================================================================
print("\n--- Building and Training Classifier ---")

# Preprocessor for the classifier: median-impute + standardise the numeric
# columns, target-encode the categorical columns against the purchase flag.
clf_preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)
    ],
    remainder='drop'
)

# fit_transform fits and transforms the training frame in one pass instead of
# fitting and then transforming the same data a second time.
X_train_clf_processed = clf_preprocessor.fit_transform(X_train_engineered, y_train['made_purchase'])
X_test_clf_processed = clf_preprocessor.transform(X_test_engineered)

# Train the classifier model.
# `use_label_encoder` was removed from the call: the installed XGBoost reports
# it as an unused parameter, so it only produced a warning.
print("Training Classifier Model...")
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_clf_processed, y_train['made_purchase'])

# ==============================================================================
# Step 3: Define and Train the Regressor Component
# ==============================================================================
print("\n--- Building and Training Regressor ---")

# The regressor gets its own preprocessor instance, separate from the classifier's.
reg_numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
reg_preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=reg_numeric_steps), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
    ],
    remainder='drop',
)

# Only training rows flagged as purchases are used to fit the preprocessor and
# the regressor; the target is the log-transformed purchase value.
buyer_mask = y_train['made_purchase'] == 1
X_train_buyers_engineered = X_train_engineered[buyer_mask]
y_train_buyers_log = y_train.loc[buyer_mask, 'log_purchaseValue']
reg_preprocessor.fit(X_train_buyers_engineered, y_train_buyers_log)

# Buyer rows feed the regressor; the *full* test set is transformed because
# predictions are made for every test row.
X_train_buyers_reg_processed = reg_preprocessor.transform(X_train_buyers_engineered)
X_test_reg_processed = reg_preprocessor.transform(X_test_engineered)

print("Training Regressor Model...")
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
)
xgb_reg.fit(X_train_buyers_reg_processed, y_train_buyers_log)

# ==============================================================================
# Step 4: Evaluate the Combined Model
# ==============================================================================
print("\n--- Evaluating Final Combined Model ---")

# Each model is scored on the test matrix produced by its own preprocessor.
prob_purchase = xgb_clf.predict_proba(X_test_clf_processed)[:, 1]
log_value_pred = xgb_reg.predict(X_test_reg_processed)

# Undo the log1p transform, weight the predicted value by the purchase
# probability, and floor the result at zero.
value_pred = np.expm1(log_value_pred)
final_predictions = np.maximum(prob_purchase * value_pred, 0)

actual_values = y_test['purchaseValue']
r2 = r2_score(actual_values, final_predictions)
mse = mean_squared_error(actual_values, final_predictions)
rmse = np.sqrt(mse)

print(f"\nFinal R² Score: {r2:.4f}")
print(f"Final RMSE: ${rmse:.2f}")

Applying initial feature engineering...
Initial feature engineering complete.

--- Building and Training Classifier ---
Training Classifier Model...


Parameters: { "use_label_encoder" } are not used.




--- Building and Training Regressor ---
Training Regressor Model...

--- Evaluating Final Combined Model ---

Final R² Score: 0.5553
Final RMSE: $117.25


In [4]:

print("\n--- Generating Kaggle Submission ---")

KAGGLE_TEST_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

# Only the file read is guarded: a missing test file is an expected, skippable
# situation. Any failure in the prediction pipeline itself is left to raise so
# the full traceback is visible instead of being reduced to a one-line message.
try:
    kaggle_test_df = pd.read_csv(KAGGLE_TEST_PATH)
except FileNotFoundError:
    print(f"\nKaggle test file '{KAGGLE_TEST_PATH}' not found. Skipping submission file generation.")
else:
    # Same two-stage preprocessing as for the hold-out set, using the already
    # fitted transformers (nothing is re-fitted here).
    kaggle_test_engineered = feature_engineer.transform(kaggle_test_df)
    kaggle_test_clf_processed = clf_preprocessor.transform(kaggle_test_engineered)
    kaggle_test_reg_processed = reg_preprocessor.transform(kaggle_test_engineered)

    kaggle_prob_purchase = xgb_clf.predict_proba(kaggle_test_clf_processed)[:, 1]
    kaggle_log_value_pred = xgb_reg.predict(kaggle_test_reg_processed)

    # Undo the log1p transform, weight by purchase probability, floor at zero.
    kaggle_pred_dollar_value = np.expm1(kaggle_log_value_pred)
    kaggle_final_predictions = kaggle_prob_purchase * kaggle_pred_dollar_value
    kaggle_final_predictions[kaggle_final_predictions < 0] = 0

    # NOTE(review): the row index of the test frame is used as the ID --
    # confirm this matches the identifier the competition expects.
    submission_df = pd.DataFrame({
        'ID': kaggle_test_df.index,
        'purchaseValue': kaggle_final_predictions
    })

    # Save to CSV
    submission_df.to_csv('submission.csv', index=False)

    print("Submission file 'submission.csv' created successfully.")


--- Generating Kaggle Submission ---
Submission file 'submission.csv' created successfully.
