In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Load data
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")

# Use default index as ID
test_ids = range(len(test))

# Separate target
target = train["purchaseValue"]
train.drop(columns=["purchaseValue"], inplace=True)

# Combine for preprocessing
df = pd.concat([train, test], axis=0)

# Fill missing values
df.fillna(-999, inplace=True)

# Encode categorical columns (convert all to string first)
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature engineering
df['log_totalHits'] = np.log1p(df['totalHits'])
df['log_pageViews'] = np.log1p(df['pageViews'])
df['pageViews_totalHits'] = df['pageViews'] * df['totalHits']

# Select useful features
important_features = [
    'totalHits', 'pageViews', 'sessionNumber',
    'log_totalHits', 'log_pageViews', 'pageViews_totalHits'
]

# Split back to train and test
train_X = df.iloc[:len(train)][important_features]
test_X = df.iloc[len(train):][important_features]

# Train model
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6)
model.fit(train_X, target)

# Predict
predictions = model.predict(test_X)

# Submission file
submission = pd.DataFrame({
    "id": test_ids,
    "purchaseValue": predictions
})
submission.to_csv("submission.csv", index=False)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Prepare data again from the combined DataFrame
train_data = df.iloc[:len(train)]
train_X_full = train_data[important_features]
train_y_full = target

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_X_full, train_y_full, test_size=0.2, random_state=42)

# Train model
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6)
model.fit(X_train, y_train)

# Predict on validation set
val_preds = model.predict(X_val)

# Calculate R² score
r2 = r2_score(y_val, val_preds)
print("R² Score on Validation Set:", r2)


R² Score on Validation Set: -0.006260856739349618
