# In [None]:
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import (
    Ridge, Lasso, ElasticNet, HuberRegressor, SGDRegressor
)
import pandas as pd
import lightgbm as lgb
# Suppress every warning for the rest of the session and let pandas print
# up to 100 columns when displaying a frame.
warnings.simplefilter("ignore")
pd.options.display.max_columns = 100

# Read the training table first, then the test table, from the Kaggle
# input directory.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/engage-2-value-from-clicks-to-conversions/train_data.csv",
        "/kaggle/input/engage-2-value-from-clicks-to-conversions/test_data.csv",
    )
)

def clean_data(df, is_train=True):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Cleaning steps:
      * when ``is_train`` is true, exact duplicate rows are dropped
        (rows of a test frame are never removed);
      * missing values in ``object`` columns become the string ``"Unknown"``;
      * missing values in ``int64``/``float64`` columns become that
        column's median.

    Args:
        df: Frame to clean.
        is_train: Whether ``df`` is training data, i.e. whether duplicate
            rows may be dropped.

    Returns:
        A new DataFrame with the steps above applied.
    """
    df = df.copy()

    # Only remove duplicates in train.
    if is_train:
        df = df.drop_duplicates()

    # Handle object columns (categorical/text).
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].fillna("Unknown")

    # Handle numeric columns. The fill value must be a scalar: passing a
    # Series (which is what ``mode()`` returns) makes ``fillna`` align on the
    # index, so only rows whose label matches the Series index get filled.
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    return df

# Clean both frames; duplicate rows are only dropped from the training data.
df = clean_data(df, is_train=True)
test_df = clean_data(test_df, is_train=False)

# A column with exactly one distinct non-null value in the training data
# is treated as constant and removed from both frames.
constant_cols = df.columns[(df.nunique() == 1).to_numpy()].tolist()
print("Constant columns:", constant_cols)
df = df.drop(columns=constant_cols)
test_df = test_df.drop(columns=constant_cols)

# Separate the feature columns from the regression target.
X = df.drop(columns=['purchaseValue'])
y = df['purchaseValue']

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
# --- Step 4: Full Pipeline with XGBoost ---
# `mean_absolute_error` is used in the evaluation step below, but the file's
# top-level sklearn.metrics import only brings in mean_squared_error and
# r2_score, so import it here.
from sklearn.metrics import mean_absolute_error

# NOTE(review): `preprocessor` and `xgb` are not defined in the visible part
# of this file; presumably an earlier cell builds the preprocessor and runs
# `import xgboost as xgb` -- confirm before running this script standalone.
xgb_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=2000,
        learning_rate=0.0125,
        max_depth=16,
        subsample=0.875,
        colsample_bytree=0.875,
        random_state=42,
        n_jobs=-1,
        verbosity=1
    ))
])

# --- Step 5: Fit the Model ---
print("⏳ Training XGBoost...")
xgb_pipeline.fit(X_train, y_train)

# --- Step 6: Evaluate on the validation split ---
y_pred = xgb_pipeline.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("\n XGBoost Performance ")
print(f"MAE = {mae:.2f}")
print(f"R²  = {r2:.4f}")

# --- Step 7: Prepare test_df ---
# Fill gaps in the categorical test columns with "Unknown" and cast them
# to plain strings.
test_df[cat_cols] = test_df[cat_cols].fillna("Unknown").astype(str)

# --- Step 8: Predict using trained pipeline ---
print("🧪 Predicting on test set...")
# Negative predictions are floored at zero.
test_preds = xgb_pipeline.predict(test_df).clip(min=0)

# --- Step 9: Create Submission File ---
# NOTE(review): the row index is used as the identifier; switch to the real
# ID column if the competition provides one.
submission = pd.DataFrame({'id': test_df.index, 'purchaseValue': test_preds})

# --- Step 10: Save as CSV ---
submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as 'submission.csv'")