In [6]:

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Announce the run, then read the three competition files from the working directory.
print("Starting Black Friday Purchase Prediction Pipeline")

train_path, test_path, sample_path = "train.csv", "test.csv", "sample_submission.csv"

# Read in a fixed order: training rows, test rows, then the submission template.
train, test, sample = (
    pd.read_csv(csv_file) for csv_file in (train_path, test_path, sample_path)
)

# Quick sanity check on what was loaded.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample.shape}")

Starting Black Friday Purchase Prediction Pipeline...
Train shape: (550068, 12)
Test shape: (233599, 12)
Sample submission shape: (233599, 3)


In [8]:

# Stack train and test into one frame (`combined`) so cleaning and feature
# engineering are applied identically to both, then derive a few extra features.

# Tag each frame so the rows can be separated again later; test rows have no target.
train['is_train'] = 1
test['is_train'] = 0
test['Purchase'] = np.nan

combined = pd.concat([train, test], ignore_index=True, sort=False)
print(f"Combined dataset shape: {combined.shape}")

# Normalise column names: trimmed and lower-case (e.g. 'Purchase' -> 'purchase').
combined.columns = [c.strip().lower() for c in combined.columns]

print(f"Columns: {list(combined.columns)}")

print("Handling missing values and data types")

if 'stay_in_current_city_years' in combined.columns:
    # The raw column mixes digits with the label '4+'; map it to 4 and treat the
    # stringified missing marker 'nan' as a real missing value before converting.
    stay_years = (
        combined['stay_in_current_city_years']
        .astype(str)
        .replace('4+', '4')
        .replace('nan', np.nan)
    )
    stay_years = pd.to_numeric(stay_years, errors='coerce')

    median_stay = stay_years.median()
    # Assign the filled Series back instead of using a chained
    # `fillna(..., inplace=True)`, which is not guaranteed to modify `combined`.
    combined['stay_in_current_city_years'] = stay_years.fillna(median_stay)
    print(f"Converted stay_in_current_city_years to numeric, filled with median: {median_stay}")

# Defined at top level (not inside the `if` above) so the loop below works even
# when 'stay_in_current_city_years' is absent.
categorical_cols = ['gender', 'age', 'city_category', 'marital_status']
for col in categorical_cols:
    if col not in combined.columns:
        continue
    # Count BEFORE filling so the message reports how many values were replaced.
    n_missing = int(combined[col].isnull().sum())
    if n_missing == 0:
        continue
    modes = combined[col].mode()
    mode_val = modes.iloc[0] if len(modes) > 0 else 'Unknown'
    combined[col] = combined[col].fillna(mode_val)
    print(f"Filled {n_missing} missing values in {col}")

numeric_cols = ['product_category_1', 'product_category_2', 'product_category_3']
for col in numeric_cols:
    if col not in combined.columns:
        continue
    # Ensure column is numeric; unparseable entries become NaN.
    combined[col] = pd.to_numeric(combined[col], errors='coerce')
    if combined[col].isnull().sum() > 0:
        median_val = combined[col].median()
        combined[col] = combined[col].fillna(median_val)
        print(f"Filled missing values in {col} with median: {median_val}")

print("\nCreating new features")

# Ordinal encoding of the age bracket. Checked as "not numeric" rather than
# `dtype == 'object'` so that pandas string dtypes are handled as well.
if not pd.api.types.is_numeric_dtype(combined['age']):
    age_mapping = {
        '0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3,
        '46-50': 4, '51-55': 5, '55+': 6
    }
    age_numeric = combined['age'].map(age_mapping)
    # Brackets not present in the mapping become NaN; fall back to the median bracket.
    combined['age_numeric'] = age_numeric.fillna(age_numeric.median())

if 'product_category_1' in combined.columns and 'product_category_2' in combined.columns:
    # String key "<cat1>_<cat2>". The fillna(0) only matters if product_category_2
    # still holds NaN at this point (it is median-filled by the loop above).
    combined['product_cat_interaction'] = (
        combined['product_category_1'].astype(str) + '_' +
        combined['product_category_2'].fillna(0).astype(str)
    )

if 'city_category' in combined.columns and 'gender' in combined.columns:
    # String key "<city>_<gender>".
    combined['city_gender'] = combined['city_category'].astype(str) + '_' + combined['gender'].astype(str)


Combined dataset shape: (783667, 14)
Columns: ['user_id', 'product_id', 'gender', 'age', 'occupation', 'city_category', 'stay_in_current_city_years', 'marital_status', 'product_category_1', 'product_category_2', 'product_category_3', 'purchase', 'is_train', 'comb']
Handling missing values and data types
Converted stay_in_current_city_years to numeric, filled with median: 2.0
Filled missing values in product_category_2 with median: 9.0
Filled missing values in product_category_3 with median: 14.0

Creating new features


In [9]:
# Boolean selector for labelled rows, and an independent copy of those rows
# used as the source for the target-based aggregates below.
train_mask = combined['is_train'].eq(1)
train_data = combined.loc[train_mask].copy()

In [10]:

# Per-user purchase statistics computed from the labelled rows (`train_data`)
# and left-joined onto every row of `combined` by user_id.
# NOTE(review): these features are built from the target of the same training
# rows they are attached to, so each row's own purchase contributes to its
# features. Scores from the later cross-validation will likely be optimistic
# unless the statistics are recomputed inside each fold — confirm before
# trusting the CV numbers.
if 'user_id' in combined.columns:
    user_agg_cols = ['user_avg_purchase', 'user_total_purchase', 'user_purchase_count', 'user_purchase_std']
    user_aggs = train_data.groupby('user_id')['purchase'].agg(['mean', 'sum', 'count', 'std']).reset_index()
    user_aggs.columns = ['user_id'] + user_agg_cols
    # The sample std is undefined (NaN) for a user with a single purchase; use 0.
    # Assigned back rather than a chained inplace fillna, which may act on a copy.
    user_aggs['user_purchase_std'] = user_aggs['user_purchase_std'].fillna(0)
    # Drop columns left over from a previous run of this cell so that re-running
    # it does not create '_x'/'_y' suffixed duplicates in `combined`.
    combined = combined.drop(columns=user_agg_cols, errors='ignore').merge(user_aggs, on='user_id', how='left')

In [11]:
# Per-product purchase statistics computed from the labelled rows (`train_data`)
# and left-joined onto every row of `combined` by product_id. Products that never
# occur in `train_data` get NaN in these columns after the left join.
# NOTE(review): like the per-user statistics, these are derived from the target
# of the rows they are attached to — a possible source of optimistic validation.
if 'product_id' in combined.columns:
    product_agg_cols = ['product_avg_purchase', 'product_purchase_count', 'product_purchase_std']
    product_aggs = train_data.groupby('product_id')['purchase'].agg(['mean', 'count', 'std']).reset_index()
    product_aggs.columns = ['product_id'] + product_agg_cols
    # The sample std is undefined (NaN) for a product bought only once; use 0.
    # Assigned back rather than a chained inplace fillna, which may act on a copy.
    product_aggs['product_purchase_std'] = product_aggs['product_purchase_std'].fillna(0)
    # Drop columns left over from a previous run of this cell so that re-running
    # it does not create '_x'/'_y' suffixed duplicates in `combined`.
    combined = combined.drop(columns=product_agg_cols, errors='ignore').merge(product_aggs, on='product_id', how='left')

In [12]:
# Mean purchase and row count per product_category_1, computed from the labelled
# rows (`train_data`) and left-joined onto every row of `combined`.
if 'product_category_1' in combined.columns:
    cat_agg_cols = ['cat1_avg_purchase', 'cat1_purchase_count']
    cat_aggs = train_data.groupby('product_category_1')['purchase'].agg(['mean', 'count']).reset_index()
    cat_aggs.columns = ['product_category_1'] + cat_agg_cols
    # Drop columns left over from a previous run of this cell so that re-running
    # it does not create '_x'/'_y' suffixed duplicates in `combined`.
    combined = combined.drop(columns=cat_agg_cols, errors='ignore').merge(cat_aggs, on='product_category_1', how='left')

In [13]:
# Integer-encode the categorical columns. Each encoder is fitted on the values of
# the whole `combined` frame (train and test rows together) and kept in
# `label_encoders` under the source column name.
print("Encoding categorical variables")
le_cols = ['gender', 'age', 'city_category', 'marital_status']
# The engineered interaction columns are encoded too, when they were created.
le_cols += [
    extra for extra in ('product_cat_interaction', 'city_gender')
    if extra in combined.columns
]

label_encoders = {}
for col in le_cols:
    if col not in combined.columns:
        continue
    le = LabelEncoder()
    # Cast to str first so every value is encoded as text.
    as_text = combined[col].astype(str)
    combined[col + '_encoded'] = le.fit_transform(as_text)
    label_encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")

Encoding categorical variables
Encoded gender: 2 unique values
Encoded age: 7 unique values
Encoded city_category: 3 unique values
Encoded marital_status: 2 unique values
Encoded product_cat_interaction: 103 unique values
Encoded city_gender: 6 unique values


In [14]:
# Split the engineered frame back into labelled and unlabelled rows.
train_fe = combined.loc[combined['is_train'].eq(1)].copy()
test_fe = combined.loc[combined['is_train'].eq(0)].copy()

print(f"Processed train shape: {train_fe.shape}")
print(f"Processed test shape: {test_fe.shape}")

# Candidate feature groups; only names that actually exist in train_fe are kept.
basic_features = ['gender_encoded', 'marital_status_encoded', 'stay_in_current_city_years']
# Prefer the ordinal age feature, falling back to the label-encoded one.
basic_features.append('age_numeric' if 'age_numeric' in train_fe.columns else 'age_encoded')

city_product_features = ['city_category_encoded', 'product_category_1', 'product_category_2', 'product_category_3']
interaction_features = ['product_cat_interaction_encoded', 'city_gender_encoded']
agg_features = [
    'user_avg_purchase', 'user_total_purchase', 'user_purchase_count', 'user_purchase_std',
    'product_avg_purchase', 'product_purchase_count', 'product_purchase_std',
    'cat1_avg_purchase', 'cat1_purchase_count'
]

# Flatten the groups in order, dropping anything missing from the frame.
candidate_groups = (basic_features, city_product_features, interaction_features, agg_features)
features = [
    name
    for group in candidate_groups
    for name in group
    if name in train_fe.columns
]
print(f"\nSelected {len(features)} features for modeling:")
print(features)

Processed train shape: (550068, 32)
Processed test shape: (233599, 32)

Selected 19 features for modeling:
['gender_encoded', 'marital_status_encoded', 'stay_in_current_city_years', 'age_numeric', 'city_category_encoded', 'product_category_1', 'product_category_2', 'product_category_3', 'product_cat_interaction_encoded', 'city_gender_encoded', 'user_avg_purchase', 'user_total_purchase', 'user_purchase_count', 'user_purchase_std', 'product_avg_purchase', 'product_purchase_count', 'product_purchase_std', 'cat1_avg_purchase', 'cat1_purchase_count']


In [16]:
# Build the model inputs: feature matrix X and target y from the labelled rows,
# and test_X from the unlabelled rows, then impute any remaining missing values.
X = train_fe[features].copy()
y = train_fe['purchase'].copy()
test_X = test_fe[features].copy()

print(f"\nFeature matrix shapes:")
print(f"X: {X.shape}")
print(f"test_X: {test_X.shape}")


print("\nFinal imputation check")

print("Checking data types in feature matrix")
print(X.dtypes.value_counts())

missing_counts = X.isnull().sum()
if missing_counts.sum() > 0:
    print(f"Missing values found: {missing_counts[missing_counts > 0]}")

numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(exclude=[np.number]).columns

# Safety net: encode any non-numeric column that slipped into the feature list.
if len(categorical_features) > 0:
    print(f"Found categorical features that need encoding: {list(categorical_features)}")

    for col in categorical_features:
        train_text = X[col].astype(str)
        test_text = test_X[col].astype(str)
        le = LabelEncoder()
        # Fit on train AND test values: an encoder fitted on train alone raises
        # ValueError when transforming a category that only occurs in test.
        le.fit(pd.concat([train_text, test_text], ignore_index=True))
        X[col] = le.transform(train_text)
        test_X[col] = le.transform(test_text)


# Median imputation; the medians are learned from the training matrix only and
# then applied unchanged to the test matrix.
imp = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
test_X_imputed = pd.DataFrame(imp.transform(test_X), columns=test_X.columns, index=test_X.index)

print("Missing values after imputation:")
print(f"Train: {X_imputed.isnull().sum().sum()}")
print(f"Test: {test_X_imputed.isnull().sum().sum()}")


Feature matrix shapes:
X: (550068, 19)
test_X: (233599, 19)

Final imputation check
Checking data types in feature matrix
int64      10
float64     9
Name: count, dtype: int64
Missing values after imputation:
Train: 0
Test: 0


In [17]:


# Train one gradient-boosting model per fold on log1p(purchase), collect
# out-of-fold predictions for scoring, average the per-fold test predictions,
# and write the submission file.
# NOTE(review): the user/product/category aggregate features were computed from
# the target over all labelled rows before this split, so validation rows carry
# information about their own target — the scores below are probably optimistic.
print("\nTraining model with GroupKFold cross-validation")

# Single source of truth for the fold count: it drives the splitter, the
# progress message and the test-prediction averaging weight.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
oof_predictions = np.zeros(len(X_imputed))
test_predictions = np.zeros(len(test_X_imputed))

cv_scores = []

# Folds are grouped by user_id, so a user's rows never appear on both sides of a split.
for fold, (train_idx, val_idx) in enumerate(gkf.split(X_imputed, y, train_fe['user_id'])):
    print(f"Training fold {fold + 1}/{n_splits}...")

    X_train_fold = X_imputed.iloc[train_idx]
    X_val_fold = X_imputed.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    model = HistGradientBoostingRegressor(
        max_iter=500,
        learning_rate=0.08,
        max_depth=8,
        random_state=42
    )

    # The model is fitted on the log1p-transformed target, so every prediction
    # inside this loop is on the log scale.
    model.fit(X_train_fold, np.log1p(y_train_fold))

    val_preds = model.predict(X_val_fold)
    oof_predictions[val_idx] = val_preds

    # Accumulate the mean of the fold models' test predictions (log scale).
    test_preds = model.predict(test_X_imputed)
    test_predictions += test_preds / n_splits

    fold_rmse = np.sqrt(mean_squared_error(np.log1p(y_val_fold), val_preds))
    cv_scores.append(fold_rmse)
    print(f"Fold {fold + 1} RMSE (log scale): {fold_rmse:.4f}")


mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)
print(f"\nCross-validation results:")
print(f"Mean RMSE (log scale): {mean_cv_score:.4f} (+/- {std_cv_score:.4f})")

# Invert the log transform to report the error in the target's own units.
oof_rmse_original = np.sqrt(mean_squared_error(y, np.expm1(oof_predictions)))
print(f"Out-of-fold RMSE (original scale): {oof_rmse_original:.2f}")

# Back to the original scale; clamp at zero so no negative amounts are submitted.
final_predictions = np.expm1(test_predictions)
final_predictions = np.maximum(final_predictions, 0)

print(f"\nPrediction statistics:")
print(f"Mean: ${final_predictions.mean():.2f}")
print(f"Median: ${np.median(final_predictions):.2f}")
print(f"Std: ${final_predictions.std():.2f}")
print(f"Min: ${final_predictions.min():.2f}")
print(f"Max: ${final_predictions.max():.2f}")

# Predictions are assigned by position.
# NOTE(review): this assumes the rows of the sample file are in the same order
# as the rows of the test file — confirm.
submission = sample.copy()
submission['Purchase'] = final_predictions

submission.to_csv("submission.csv", index=False)
print(f"\nSubmission saved as 'submission.csv'")
print(f"Submission shape: {submission.shape}")
print("\nFirst few predictions:")
print(submission.head(10))

print("\n" + "="*50)
print("PIPELINE COMPLETED SUCCESSFULLY!")
print(f"Final CV RMSE: {oof_rmse_original:.2f}")
print("Ready for submission!")
print("="*50)


Training model with GroupKFold cross-validation
Training fold 1/5...
Fold 1 RMSE (log scale): 0.3295
Training fold 2/5...
Fold 2 RMSE (log scale): 0.3239
Training fold 3/5...
Fold 3 RMSE (log scale): 0.3222
Training fold 4/5...
Fold 4 RMSE (log scale): 0.3204
Training fold 5/5...
Fold 5 RMSE (log scale): 0.3184

Cross-validation results:
Mean RMSE (log scale): 0.3229 (+/- 0.0038)
Out-of-fold RMSE (original scale): 2544.11

Prediction statistics:
Mean: $8974.14
Median: $7614.27
Std: $4216.62
Min: $317.89
Max: $23269.28

Submission saved as 'submission.csv'
Submission shape: (233599, 3)

First few predictions:
       Purchase  User_ID Product_ID
0  16647.254568  1000004  P00128942
1  11421.807103  1000009  P00113442
2   6361.976128  1000010  P00288442
3   2478.154160  1000010  P00145342
4   2419.749506  1000011  P00053842
5  11172.405951  1000013  P00350442
6  11495.078875  1000013  P00155442
7  10628.493658  1000013   P0094542
8  12699.601351  1000015  P00161842
9   5376.624223  100002