# INCLUDES IMPROVED PREPROCESSING + FEATURE ENGINEERING + MODEL SELECTION + MODEL FINE TUNING

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('train_data.csv')

In [None]:
df.duplicated().sum()

np.int64(236)

In [None]:
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
missing_thresh = 0.70
missing_ratio = df.isnull().mean()
drop_missing = missing_ratio[missing_ratio > missing_thresh].index.tolist()

# Drop columns with only one unique value
drop_constant = [col for col in df.columns if df[col].nunique() == 1]
# Drop columns having string "not available in demo"
drop_demo_cols = [col for col in df.columns if df[col].astype(str).str.contains("not available in demo", case=False).all()]

drop_cols = list(set(drop_missing + drop_constant + drop_demo_cols))

cols_to_keep=['trafficSource.isTrueDirect', 'new_visits', 'totals.bounces', 'trafficSource.adwordsClickInfo.isVideoAd']
for col in cols_to_keep:
    drop_cols.remove(col)

df = df.drop(columns=drop_cols)

print("Columns Dropped: ", drop_cols)

Columns Dropped:  ['device.mobileDeviceMarketingName', 'totals.visits', 'device.mobileDeviceBranding', 'device.operatingSystemVersion', 'locationZone', 'device.browserSize', 'device.screenResolution', 'trafficSource.adwordsClickInfo.adNetworkType', 'device.mobileInputSelector', 'device.flashVersion', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.slot', 'device.screenColors', 'device.language', 'browserMajor', 'device.mobileDeviceModel', 'device.browserVersion', 'screenSize', 'trafficSource.adwordsClickInfo.page', 'socialEngagementType', 'geoNetwork.networkLocation']


In [None]:
df = df.dropna(subset=['date', 'sessionStart'])

In [None]:
# Replacing placeholder values with NaNs and NaN with 'missing'
df['geoNetwork.region'] = df['geoNetwork.region'].replace("not available in demo dataset", np.nan)
df["geoNetwork.region"] = df["geoNetwork.region"].replace({"(not set)":np.nan})

df["trafficSource.campaign"] = df["trafficSource.campaign"].replace({"(not set)":np.nan})
df["trafficSource.keyword"] = df["trafficSource.keyword"].replace({np.nan:"missing"})

In [None]:
df['totals.bounces'] = df['totals.bounces'].fillna(0)
df['new_visits'] = df['new_visits'].fillna(0)

df['pageViews'] = df['pageViews'].fillna(1)
df['totalHits'] = df['totalHits'].fillna(1)
df['sessionNumber'] = df['sessionNumber'].fillna(1)

df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].fillna(False)
df['trafficSource.adwordsClickInfo.isVideoAd'] = df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)

  df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].fillna(False)
  df['trafficSource.adwordsClickInfo.isVideoAd'] = df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)


In [None]:
df['is_campaign_set'] = df['trafficSource.campaign'].notna().astype(int)
df['has_referral'] = df['trafficSource.referralPath'].notna().astype(int)

In [None]:
df['region_city'] = df['geoNetwork.region'].astype(str) + "_" + df['geoNetwork.city'].astype(str)
df['continent_subcontinent'] = df['geoNetwork.continent'].astype(str) + "_" + df['geoNetwork.subContinent'].astype(str)

In [None]:
# Engagement metric: ratio of pageViews to totalHits (+1 to avoid division by 0)
df['page_hit_ratio'] = df['pageViews'] / (df['totalHits'] + 1)
df['page_hit_ratio'] = df['page_hit_ratio'].fillna(0)

# Number of unique sessions per user
user_session_counts = df.groupby('userId')['sessionId'].nunique()
df['user_session_count'] = df['userId'].map(user_session_counts)

# Average purchase value per user
user_avg_purchase = df.groupby('userId')['purchaseValue'].mean()
df['avg_purchase_by_user'] = df['userId'].map(user_avg_purchase)

In [None]:
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df['sessionStart'] = pd.to_datetime(df['sessionStart'], unit='s')

df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['hour'] = df['sessionStart'].dt.hour

# Now drop after extraction
df = df.drop(columns=['date', 'sessionStart'])

In [None]:
for col in ['sessionNumber', 'pageViews', 'totalHits']:
    Q1 = df[col].quantile(0.10)
    Q3 = df[col].quantile(0.90)

    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [None]:
browser_counts = df['browser'].value_counts()
valid_browsers = browser_counts[browser_counts >= 200].index

df['browser'] = df['browser'].apply(lambda x: x if x in valid_browsers else 'Other')

In [None]:
df.shape

(115787, 39)

In [None]:
X = df.drop(columns='purchaseValue')
y = df['purchaseValue']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

(92629, 38)

In [None]:
y_train.shape

(92629,)

In [None]:
X_test.shape

(23158, 38)

In [None]:
y_test.shape

(23158,)

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92629 entries, 1640 to 15799
Data columns (total 38 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   trafficSource.isTrueDirect                92629 non-null  bool   
 1   browser                                   92629 non-null  object 
 2   trafficSource.keyword                     92629 non-null  object 
 3   geoCluster                                92629 non-null  object 
 4   userId                                    92629 non-null  int64  
 5   trafficSource.campaign                    4621 non-null   object 
 6   geoNetwork.networkDomain                  92629 non-null  object 
 7   gclIdPresent                              92629 non-null  int64  
 8   sessionNumber                             92629 non-null  float64
 9   geoNetwork.region                         42370 non-null  object 
 10  trafficSource                       

In [None]:
num_cols = X_train.select_dtypes(include=['float64','int64','bool','int32']).columns.tolist()
cat_cols = X_train.select_dtypes(include='object').columns.tolist()

print("Numerical Columns: ", num_cols)
print("\nCategorical Columns: ", cat_cols)

Numerical Columns:  ['trafficSource.isTrueDirect', 'userId', 'gclIdPresent', 'sessionNumber', 'sessionId', 'trafficSource.adwordsClickInfo.isVideoAd', 'pageViews', 'totals.bounces', 'totalHits', 'device.isMobile', 'new_visits', 'is_campaign_set', 'has_referral', 'page_hit_ratio', 'user_session_count', 'avg_purchase_by_user', 'day_of_week', 'month', 'hour']

Categorical Columns:  ['browser', 'trafficSource.keyword', 'geoCluster', 'trafficSource.campaign', 'geoNetwork.networkDomain', 'geoNetwork.region', 'trafficSource', 'os', 'geoNetwork.subContinent', 'trafficSource.medium', 'locationCountry', 'geoNetwork.city', 'geoNetwork.metro', 'trafficSource.referralPath', 'deviceType', 'userChannel', 'geoNetwork.continent', 'region_city', 'continent_subcontinent']


In [None]:
# Choosing numerical columns that need scaling
num_cols = ['pageViews', 'totalHits', 'sessionNumber','user_session_count']

In [None]:
# Separating categorical columns with greater than 10 unique values
low_card_cat_cols = list()
high_card_cat_cols = list()

for col in cat_cols:
    if (X_train[col].nunique() < 10):
        low_card_cat_cols.append(col)
    else:
        high_card_cat_cols.append(col)

In [None]:
print("Numerical columns to scale: ", num_cols)
print()
print("Low cradinality columns to One-Hot encode: ", low_card_cat_cols)
print()
print("High cradinality columns to Target encode: ", high_card_cat_cols)

Numerical columns to scale:  ['pageViews', 'totalHits', 'sessionNumber', 'user_session_count']

Low cradinality columns to One-Hot encode:  ['geoCluster', 'geoNetwork.networkDomain', 'trafficSource.medium', 'deviceType', 'userChannel', 'geoNetwork.continent']

High cradinality columns to Target encode:  ['browser', 'trafficSource.keyword', 'trafficSource.campaign', 'geoNetwork.region', 'trafficSource', 'os', 'geoNetwork.subContinent', 'locationCountry', 'geoNetwork.city', 'geoNetwork.metro', 'trafficSource.referralPath', 'region_city', 'continent_subcontinent']


In [None]:
!pip install category_encoders



In [None]:
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

# Imputing missing values of numerical columns with Median and Scaling
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Imputing missing values with Mode and One-Hot Encoding for low cardinality columns
low_card_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('label_encoder', OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Inputing missing values with Mode and Target Encoding for high cardinality columns
high_card_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ce.TargetEncoder())
])

# Combining pipelines in Column Transformer and passing the remainder columns as it is
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('low_card_cat', low_card_cat_pipeline, low_card_cat_cols),
    ('high_card_cat', high_card_cat_pipeline, high_card_cat_cols)
]
, remainder='passthrough')

In [None]:
# Fit and transform the training set
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
# Transform the validation set
X_test_preprocessed = preprocessor.transform(X_test)

print("Preprocessing Done")
print()
print("Preprocessed X_train shape: ", X_train_preprocessed.shape)
print("Preprocessed X_test shape: ", X_test_preprocessed.shape)

Preprocessing Done

Preprocessed X_train shape:  (92629, 64)
Preprocessed X_test shape:  (23158, 64)


In [None]:
# from lightgbm import LGBMRegressor
# from xgboost import XGBRegressor
# from catboost import CatBoostRegressor
# from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
# from sklearn.metrics import r2_score

# models = {
#     "LGBM": LGBMRegressor(random_state=42),
#     "XGB": XGBRegressor(random_state=42),
#     "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
#     "HistGB": HistGradientBoostingRegressor(random_state=42),
#     "RF": RandomForestRegressor(random_state=42)
# }

# results = []

# for name, model in models.items():
#     try:
#         # Fit the model
#         model.fit(X_train_preprocessed, y_train)
#         print(f'{name}: Fitted Successfully on Training Data!')

#         # Make predictions
#         y_pred = model.predict(X_test_preprocessed)
#         print(f'{name}: Predictions Made Successfully on Test Data!')

#         # Calculate R2 score
#         score = r2_score(y_test, y_pred)
#         print(f'{name}: R2 Score Calculated Successfully! → {score:.4f}\n')

#         # Store results
#         results.append({
#             'Model': name,
#             'R2 Score': score
#         })
#     except Exception as e:
#         print(f"Error with {name}: {str(e)}")
#         results.append({
#             'Model': name,
#             'R2 Score': None,
#             'Error': str(e)
#         })

# # Create a results dataframe
# results_df = pd.DataFrame(results).sort_values(by='R2 Score', ascending=False)
# print("\nFinal Results:")
# print(results_df)

In [None]:
# import optuna
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     model_name = trial.suggest_categorical("model", ["LGBM", "XGB", "CatBoost"])

#     if model_name == "LGBM":
#         params = {
#             "n_estimators": trial.suggest_int("lgbm_n_estimators", 50, 500),
#             "max_depth": trial.suggest_int("lgbm_max_depth", 3, 12),
#             "learning_rate": trial.suggest_float("lgbm_lr", 1e-3, 0.3, log=True),
#             "subsample": trial.suggest_float("lgbm_subsample", 0.6, 1.0),
#         }
#         model = LGBMRegressor(**params, random_state=42)

#     elif model_name == "XGB":
#         params = {
#             "n_estimators": trial.suggest_int("xgb_n_estimators", 50, 500),
#             "max_depth": trial.suggest_int("xgb_max_depth", 3, 12),
#             "learning_rate": trial.suggest_float("xgb_lr", 1e-3, 0.3, log=True),
#             "subsample": trial.suggest_float("xgb_subsample", 0.6, 1.0),
#             "colsample_bytree": trial.suggest_float("xgb_colsample", 0.6, 1.0),
#         }
#         model = XGBRegressor(**params, random_state=42, tree_method="hist")

#     elif model_name == "CatBoost":
#         params = {
#             "iterations": trial.suggest_int("catboost_iter", 50, 500),
#             "depth": trial.suggest_int("catboost_depth", 3, 10),
#             "learning_rate": trial.suggest_float("catboost_lr", 1e-3, 0.3, log=True),
#         }
#         model = CatBoostRegressor(**params, verbose=0, random_state=42)

#     # Evaluate using cross-validation
#     score = cross_val_score(model, X_train_preprocessed, y_train, cv=3, scoring="r2").mean()
#     return score

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# print("Best trial:")
# trial = study.best_trial
# print(f"Model: {trial.params['model']}")
# print(f"R² Score: {trial.value:.4f}")
# print("Best Params:", trial.params)

In [None]:
test = pd.read_csv('test_data.csv')

In [None]:
cols_to_drop = ['trafficSource.adwordsClickInfo.adNetworkType', 'geoNetwork.networkLocation', 'browserMajor',
                'device.mobileDeviceBranding', 'device.browserSize', 'socialEngagementType', 'trafficSource.adwordsClickInfo.page',
                'locationZone', 'totals.visits', 'device.mobileDeviceModel', 'trafficSource.adwordsClickInfo.slot',
                'device.mobileDeviceMarketingName', 'device.language', 'device.flashVersion', 'device.screenResolution', 'screenSize',
                'device.operatingSystemVersion', 'device.browserVersion', 'trafficSource.adContent', 'device.screenColors',
                'device.mobileInputSelector']

test = test.drop(columns=cols_to_drop)

In [None]:
# Replacing placeholder values with NaNs and NaN with 'missing'
test['geoNetwork.region'] = test['geoNetwork.region'].replace("not available in demo dataset", np.nan)
test["geoNetwork.region"] = test["geoNetwork.region"].replace({"(not set)":np.nan})

test["trafficSource.campaign"] = test["trafficSource.campaign"].replace({"(not set)":np.nan})
test["trafficSource.keyword"] = test["trafficSource.keyword"].replace({np.nan:"missing"})

In [None]:
test['totals.bounces'] = test['totals.bounces'].fillna(0)
test['new_visits'] = test['new_visits'].fillna(0)

test['pageViews'] = test['pageViews'].fillna(1)
test['totalHits'] = test['totalHits'].fillna(1)
test['sessionNumber'] = test['sessionNumber'].fillna(1)

test['trafficSource.isTrueDirect'] = test['trafficSource.isTrueDirect'].fillna(False)
test['trafficSource.adwordsClickInfo.isVideoAd'] = test['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)

  test['trafficSource.isTrueDirect'] = test['trafficSource.isTrueDirect'].fillna(False)
  test['trafficSource.adwordsClickInfo.isVideoAd'] = test['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)


In [None]:
test['is_campaign_set'] = test['trafficSource.campaign'].notna().astype(int)
test['has_referral'] = test['trafficSource.referralPath'].notna().astype(int)

In [None]:
test['region_city'] = test['geoNetwork.region'].astype(str) + "_" + test['geoNetwork.city'].astype(str)
test['continent_subcontinent'] = test['geoNetwork.continent'].astype(str) + "_" + test['geoNetwork.subContinent'].astype(str)

In [None]:
# Engagement metric: ratio of pageViews to totalHits (+1 to avoid division by 0)
test['page_hit_ratio'] = test['pageViews'] / (test['totalHits'] + 1)
test['page_hit_ratio'] = test['page_hit_ratio'].fillna(0)

# Number of unique sessions per user
test['user_session_count'] = test['userId'].map(user_session_counts)
# Fill unknown users with overall mean (just in case)
global_mean_sessions = user_session_counts.mean()
test['user_session_count'] = test['user_session_count'].fillna(global_mean_sessions)

# Average purchase value per user
test['avg_purchase_by_user'] = test['userId'].map(user_avg_purchase)
# Fill missing users (not seen in train) with overall average
overall_avg = user_avg_purchase.mean()
test['avg_purchase_by_user'] = test['avg_purchase_by_user'].fillna(overall_avg)

In [None]:
test['date'] = pd.to_datetime(test['date'], format='%Y%m%d')
test['sessionStart'] = pd.to_datetime(test['sessionStart'], unit='s')

test['day_of_week'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['hour'] = test['sessionStart'].dt.hour

# Now drop after extraction
test = test.drop(columns=['date', 'sessionStart'])

In [None]:
for col in ['sessionNumber', 'pageViews', 'totalHits']:
    Q1 = df[col].quantile(0.10)
    Q3 = df[col].quantile(0.90)

    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    test[col] = test[col].clip(lower=lower_bound, upper=upper_bound)

In [None]:
test['browser'] = test['browser'].apply(lambda x: x if x in valid_browsers else 'Other')

In [None]:
# Fit and transform the COMPLETE training set
X_preprocessed = preprocessor.fit_transform(X, y)
# Transform the testing set
test_preprocessed = preprocessor.transform(test)

print("Preprocessing Done")
print()
print("Preprocessed X shape: ", X_preprocessed.shape)
print("Preprocessed test shape: ", test_preprocessed.shape)

Preprocessing Done

Preprocessed X shape:  (115787, 64)
Preprocessed test shape:  (29006, 64)


In [None]:
# # best_model = XGBRegressor(
# #     n_estimators = 343,
# #     max_depth = 7,
# #     lr = 0.05890776371030625,
# #     subsample = 0.7320325193461752,
# #     colsample = 0.6525959173893,
# #     random_state = 42
# # )

# best_model = CatBoostRegressor(verbose=0, random_state=42)

# best_model.fit(X_preprocessed, y)

# real_y_pred = best_model.predict(test_preprocessed)
# print(real_y_pred)

In [None]:
# real_y_pred.shape

In [None]:
# # Prepare submission
# submission_df = pd.DataFrame({
#     'id': test.index,
#     'purchaseValue': real_y_pred
# })

# submission_df.head()

In [None]:
# submission_df.to_csv('submission2.csv', index=False)

# print("✅ Submission file created successfully!")
# print(f"📁 Saved as: submission.csv")
# print(f"📊 Sample predictions:\n{submission_df.head()}")

In [None]:
from sklearn.preprocessing import StandardScaler

scale_cols = high_card_cat_cols

scaler = StandardScaler()

X_train_neural = X_train_preprocessed.copy()
X_test_neural = X_test_preprocessed.copy()

X_train_neural[:, :len(scale_cols)] = scaler.fit_transform(X_train_preprocessed[:, :len(scale_cols)])
X_test_neural[:, :len(scale_cols)] = scaler.transform(X_test_preprocessed[:, :len(scale_cols)])

In [None]:
X_train_neural.shape

(92629, 64)

In [None]:
X_test_neural.shape

(23158, 64)

In [None]:
X_train_neural = pd.DataFrame(X_train_neural)
X_test_neural = pd.DataFrame(X_test_neural)

In [None]:
X_train_neural = X_train_neural.astype('float32')
X_test_neural = X_test_neural.astype('float32')

In [None]:
X_train_neural.sample()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
25714,-0.590387,-0.569224,-0.480334,-0.169932,2.006759,-0.500307,-0.500459,-0.502313,-0.498603,1.412064,...,1.0,0.0,1.0,0.0,1.0,0.5,0.0,6.0,1.0,3.0


In [None]:
X_train_neural.shape

(92629, 64)

In [None]:
X_test_neural.shape

(23158, 64)

In [None]:
X_train_neural.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       92629 non-null  float32
 1   1       92629 non-null  float32
 2   2       92629 non-null  float32
 3   3       92629 non-null  float32
 4   4       92629 non-null  float32
 5   5       92629 non-null  float32
 6   6       92629 non-null  float32
 7   7       92629 non-null  float32
 8   8       92629 non-null  float32
 9   9       92629 non-null  float32
 10  10      92629 non-null  float32
 11  11      92629 non-null  float32
 12  12      92629 non-null  float32
 13  13      92629 non-null  float32
 14  14      92629 non-null  float32
 15  15      92629 non-null  float32
 16  16      92629 non-null  float32
 17  17      92629 non-null  float32
 18  18      92629 non-null  float32
 19  19      92629 non-null  float32
 20  20      92629 non-null  float32
 21  21      92629 non-null  float32
 22

In [None]:
# import tensorflow as tf
# from tensorflow.keras import backend as K

# def r2_metric(y_true, y_pred):
#     SS_res = K.sum(K.square(y_true - y_pred))
#     SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
#     return 1 - SS_res / (SS_tot + K.epsilon())

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# # Model Architecture
# model = Sequential([
#     Dense(256, activation='relu', input_shape=(X_train_neural.shape[1],)),
#     BatchNormalization(),
#     Dropout(0.4),
#     Dense(128, activation='relu', kernel_regularizer='l2'),
#     BatchNormalization(),
#     Dropout(0.3),
#     Dense(64, activation='relu'),
#     Dense(1)  # Linear output for regression
# ])

# # Compile with R² tracking
# model.compile(
#     optimizer=Adam(learning_rate=0.001),
#     loss='mse',
#     metrics=[r2_metric]  # Now tracking R² during training
# )

# # Callbacks to maximize R²
# callbacks = [
#     EarlyStopping(monitor='val_r2_metric', patience=15, mode='max', restore_best_weights=True),
#     ReduceLROnPlateau(monitor='val_r2_metric', factor=0.5, patience=5, mode='max')
# ]

In [None]:
# history = model.fit(
#     X_train_neural, y_train,
#     validation_split=0.2,
#     epochs=200,
#     batch_size=512,
#     callbacks=callbacks,
#     verbose=1
# )

In [None]:
# import tensorflow as tf
# import multiprocessing

# # Check GPU availability
# gpus = tf.config.list_physical_devices('GPU')
# print(f"Available GPUs: {gpus}")

# # Check CPU cores (for parallel trials)
# num_cores = multiprocessing.cpu_count()
# print(f"Available CPU cores: {num_cores}")

In [None]:
# import optuna
# from tensorflow.keras.regularizers import l1_l2
# from tensorflow.keras.optimizers import Nadam

# def objective(trial):
#     # Hyperparameter search space
#     params = {
#         'num_layers': trial.suggest_int('num_layers', 1, 4),
#         'units': trial.suggest_categorical('units', [64, 128, 256, 512]),
#         'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1),
#         'learning_rate': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
#         'l1_reg': trial.suggest_float('l1_reg', 1e-6, 1e-2, log=True),
#         'l2_reg': trial.suggest_float('l2_reg', 1e-6, 1e-2, log=True),
#         'batch_size': trial.suggest_categorical('batch_size', [128, 256, 512]),
#     }

#     # Build model
#     model = Sequential()
#     model.add(Dense(params['units'], activation='relu',
#                    kernel_regularizer=l1_l2(params['l1_reg'], params['l2_reg']),
#                    input_shape=(X_train_neural.shape[1],)))
#     model.add(BatchNormalization())
#     model.add(Dropout(params['dropout_rate']))

#     # Add variable hidden layers
#     for _ in range(params['num_layers'] - 1):
#         model.add(Dense(params['units']//2, activation='relu',
#                       kernel_regularizer=l1_l2(params['l1_reg'], params['l2_reg'])))
#         model.add(BatchNormalization())
#         model.add(Dropout(params['dropout_rate']))

#     model.add(Dense(1))

#     # Compile
#     model.compile(
#         optimizer=Nadam(learning_rate=params['learning_rate']),
#         loss='mse',
#         metrics=[r2_metric]
#     )

#     # Train with early stopping
#     history = model.fit(
#         X_train_neural, y_train,
#         validation_split=0.2,
#         epochs=200,
#         batch_size=params['batch_size'],
#         callbacks=[
#             EarlyStopping(monitor='val_r2_metric', patience=15, mode='max',
#                          restore_best_weights=True)
#         ],
#         verbose=0
#     )

#     # Return best validation R²
#     return max(history.history['val_r2_metric'])

In [None]:
# import os
# os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices=false'

In [None]:
# !pip install optuna

In [None]:
# import optuna
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, Input
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.regularizers import l1_l2
# from tensorflow.keras.callbacks import EarlyStopping
# from sklearn.metrics import r2_score

In [None]:
# import tensorflow as tf
# print("TensorFlow version:", tf.__version__)
# print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

In [None]:
# # 🧠 For reproducibility
# tf.random.set_seed(42)
# np.random.seed(42)

# def objective(trial):
#     # 🎯 Hyperparameters
#     params = {
#         'n_layers': trial.suggest_int('n_layers', 1, 3),
#         'hidden_units': trial.suggest_categorical('hidden_units', [64, 128, 256, 512]),
#         'dropout': trial.suggest_float('dropout', 0.1, 0.5),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
#         'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
#         'optimizer': trial.suggest_categorical('optimizer', ['adam', 'rmsprop']),
#         'patience': trial.suggest_int('patience', 10, 30),
#     }

#     # 🔧 Select optimizer
#     if params['optimizer'] == 'adam':
#         optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
#     else:
#         optimizer = tf.keras.optimizers.RMSprop(learning_rate=params['learning_rate'])

#     # 🔨 Build Model
#     model = Sequential()
#     model.add(Input(shape=(X_train_neural.shape[1],)))
#     for _ in range(params['n_layers']):
#         model.add(Dense(params['hidden_units'], activation='relu'))
#         model.add(Dropout(params['dropout']))

#     model.add(Dense(1))  # Regression output

#     model.compile(optimizer=optimizer, loss='mse')

#     # 🛑 Early stopping
#     early_stop = EarlyStopping(
#         monitor='val_loss',
#         patience=params['patience'],
#         restore_best_weights=True
#     )

#     # 🚂 Training
#     history = model.fit(
#         X_train_neural, y_train,
#         validation_data=(X_test_neural, y_test),
#         batch_size=params['batch_size'],
#         epochs=300,
#         callbacks=[early_stop],
#         verbose=0
#     )

#     # 📈 Evaluate
#     y_pred = model.predict(X_test_neural, verbose=0)
#     score = r2_score(y_test, y_pred)
#     return score

In [None]:
# # 🔍 Run Optuna Study
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

# # 📊 Output best results
# print("\n✅ Best R² Score:", study.best_value)
# print("🔧 Best Hyperparameters:")
# for k, v in study.best_params.items():
#     print(f"   {k}: {v}")

In [None]:
# Basic Stats
total = len(df)
zero_count = (df['purchaseValue'] == 0).sum()
non_zero_count = total - zero_count

print("🔢 Total Samples:", total)
print("🚫 Zero Values:", zero_count, f"({zero_count / total:.2%})")
print("✅ Non-Zero Values:", non_zero_count, f"({non_zero_count / total:.2%})\n")

# Only non-zero values
non_zero_vals = df[df['purchaseValue'] != 0]['purchaseValue']

# Define bins (modify as needed)
bins = [0, 100, 500, 1000, 5000, 10000, 50000, float('inf')]
labels = ['0–100', '100–500', '500–1K', '1K–5K', '5K–10K', '10K–50K', '>50K']

# Bin the data
bin_counts = pd.cut(non_zero_vals, bins=bins, labels=labels, include_lowest=True).value_counts().sort_index()

print("📊 Distribution of Non-Zero Purchase Values:")
for label, count in bin_counts.items():
    percent = count / non_zero_count * 100
    print(f"   {label}: {count} ({percent:.2f}%)")

🔢 Total Samples: 115787
🚫 Zero Values: 91990 (79.45%)
✅ Non-Zero Values: 23797 (20.55%)

📊 Distribution of Non-Zero Purchase Values:
   0–100: 0 (0.00%)
   100–500: 0 (0.00%)
   500–1K: 0 (0.00%)
   1K–5K: 0 (0.00%)
   5K–10K: 1 (0.00%)
   10K–50K: 2 (0.01%)
   >50K: 23794 (99.99%)


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, r2_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostRegressor
import pandas as pd
import numpy as np

# STEP 1: Binary target for classification
df['purchase_bin'] = (df['purchaseValue'] > 0).astype(int)

# STEP 2: Create classification and regression datasets
df_class = df.copy()  # For classification (0 vs non-zero)
df_reg = df[df['purchaseValue'] > 0].copy()  # Only non-zero values

# STEP 3: Define target and features
target_class = 'purchase_bin'
target_reg = 'purchaseValue'

X_class = df_class.drop(columns=[target_class, target_reg])
y_class = df_class[target_class]

X_reg = df_reg.drop(columns=[target_class, target_reg])
y_reg = df_reg[target_reg]

# STEP 4: Train/Test Split
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42, stratify=y_class)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# STEP 5: Fit Preprocessor (use same pipelines you already wrote)
preprocessor.fit(X_train_class, y_train_class)
X_train_class_processed = preprocessor.transform(X_train_class)
X_test_class_processed = preprocessor.transform(X_test_class)

X_train_reg_processed = preprocessor.transform(X_train_reg)
X_test_reg_processed = preprocessor.transform(X_test_reg)

In [None]:
# STEP 6: Classification Model
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
clf.fit(X_train_class_processed, y_train_class)
y_pred_class = clf.predict(X_test_class_processed)
print("\n📊 Classification Report:")
print(classification_report(y_test_class, y_pred_class))


📊 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18398
           1       0.97      1.00      0.98      4760

    accuracy                           0.99     23158
   macro avg       0.99      0.99      0.99     23158
weighted avg       0.99      0.99      0.99     23158



In [None]:
# STEP 7: Regression Model on non-zero purchases
reg = CatBoostRegressor(verbose=0, random_seed=42)
reg.fit(X_train_reg_processed, y_train_reg)
y_pred_reg = reg.predict(X_test_reg_processed)
print("\n📈 R² Score on Non-Zero Purchase Prediction:", r2_score(y_test_reg, y_pred_reg))


📈 R² Score on Non-Zero Purchase Prediction: 0.70304896615114


In [None]:
# 🔮 Predict PurchaseValue on New Unseen Test Data
def predict_purchase(df_test):
    df_test_input = df_test.copy()

    # 🔧 Apply same preprocessing pipeline
    X_test_proc = preprocessor.transform(df_test_input)

    # 🎯 First: Predict 0 or Non-Zero using the classifier
    y_class_pred = clf.predict(X_test_proc)  # 0 or 1

    # 🧠 Then: Predict purchase value (even for all, will mask later)
    y_reg_pred = reg.predict(X_test_proc)

    # 🤖 Final Prediction: Use 0 if classifier says zero, else regressor's prediction
    y_final_pred = np.where(y_class_pred == 0, 0, y_reg_pred)

    return y_final_pred

In [None]:
# ✅ Run prediction
final_predictions = predict_purchase(test)

final_predictions.shape

(29006,)

In [None]:
# Prepare submission
submission_df = pd.DataFrame({
    'id': test.index,
    'purchaseValue': final_predictions
})

submission_df.head()

Unnamed: 0,id,purchaseValue
0,0,41977370.0
1,1,12702080.0
2,2,0.0
3,3,0.0
4,4,15394920.0


In [None]:
submission_df.to_csv('submission.csv', index=False)

print("✅ Submission file created successfully!")
print(f"📁 Saved as: submission.csv")
print(f"📊 Sample predictions:\n{submission_df.head()}")

✅ Submission file created successfully!
📁 Saved as: submission.csv
📊 Sample predictions:
   id  purchaseValue
0   0   4.197737e+07
1   1   1.270208e+07
2   2   0.000000e+00
3   3   0.000000e+00
4   4   1.539492e+07
