In [1]:
import pandas as pd
import re

In [2]:
bidding_data = pd.read_csv(r"C:\Users\shuddhabrota\OneDrive\Desktop\Adobe Devcraft Dataset\dataset\bid.07.txt", sep="\t", low_memory=False)

In [3]:
bid_col =['BidID', 'Timestamp', 'VisitorID', 'User-Agent', 'IP', 'Region', 'City', 'Adexchange', 'Domain', 'URL', 'AnonymousURLID', 'AdslotID', 'Adslotwidth', 'Adslotheight', 'Adslotvisibility', 'Adslotformat', 'Adslotfloorprice', 'CreativeID', 'Biddingprice', 'AdvertiserID', 'User_profile_tags']

bidding_data.columns = bid_col

In [4]:
df = bidding_data

In [5]:
df.columns

Index(['BidID', 'Timestamp', 'VisitorID', 'User-Agent', 'IP', 'Region', 'City',
       'Adexchange', 'Domain', 'URL', 'AnonymousURLID', 'AdslotID',
       'Adslotwidth', 'Adslotheight', 'Adslotvisibility', 'Adslotformat',
       'Adslotfloorprice', 'CreativeID', 'Biddingprice', 'AdvertiserID',
       'User_profile_tags'],
      dtype='object')

In [6]:
df = df.drop(['BidID', 'VisitorID', 'IP', 'Domain', 'URL', 'AnonymousURLID', 'CreativeID', 'User_profile_tags'], axis = 1)

In [7]:
df['Area'] = df['Adslotheight'] * df['Adslotwidth']
df.drop(columns=['Adslotheight', 'Adslotwidth'], inplace=True)

In [22]:
df.select_dtypes(include=['object', 'category']).nunique()

OS            6
DeviceType    3
dtype: int64

In [9]:
import re
import pandas as pd

df['User-Agent'] = df['User-Agent'].fillna("Unknown")

def extract_os(user_agent):
    os_patterns = {
        'Windows': r'Windows',
        'MacOS': r'Macintosh',
        'iOS': r'iPhone|iPad',
        'Android': r'Android',
        'Linux': r'Linux',
        'Other': r'.*'
    }
    for os, pattern in os_patterns.items():
        if re.search(pattern, user_agent, re.IGNORECASE):
            return os
    return 'Other'

def extract_device(user_agent):
    if re.search(r'Mobile|Android|iPhone|iPad', user_agent, re.IGNORECASE):
        return 'Mobile'
    elif re.search(r'Tablet', user_agent, re.IGNORECASE):
        return 'Tablet'
    else:
        return 'Desktop'

df['OS'] = df['User-Agent'].apply(extract_os)
df['DeviceType'] = df['User-Agent'].apply(extract_device)
df['Browser'] = df['User-Agent'].apply(lambda x: x.split(' ')[0] if ' ' in x else 'Unknown')
df.drop(columns=['User-Agent'], inplace=True)


In [10]:
df.select_dtypes(include=['object', 'category']).nunique()

AdslotID      55112
OS                6
DeviceType        3
Browser         683
dtype: int64

In [11]:
df.columns

Index(['Timestamp', 'Region', 'City', 'Adexchange', 'AdslotID',
       'Adslotvisibility', 'Adslotformat', 'Adslotfloorprice', 'Biddingprice',
       'AdvertiserID', 'Area', 'OS', 'DeviceType', 'Browser'],
      dtype='object')

In [12]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y%m%d%H%M%S%f', errors='coerce')
df['Hour'] = df['Timestamp'].dt.hour
df.drop(columns=['Timestamp'], inplace=True)

In [13]:
df.drop(columns=['Browser', 'AdslotID'], inplace=True)

In [14]:
num_col = df.select_dtypes(include=['int64', 'float64']).columns
cat_col = df.select_dtypes(include=['object', 'category']).columns

In [15]:
num_col

Index(['Region', 'City', 'Adexchange', 'Adslotvisibility', 'Adslotformat',
       'Adslotfloorprice', 'Biddingprice', 'AdvertiserID', 'Area'],
      dtype='object')

In [16]:
high_cardinality_cats = [col for col in cat_col if df[col].nunique() > 10]
low_cardinality_cats = [col for col in cat_col if df[col].nunique() <= 10]

In [17]:
low_cardinality_cats

['OS', 'DeviceType']

In [18]:
high_cardinality_cats

[]

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [24]:
categorical_cols = ['OS', 'DeviceType']
numerical_cols = ['Region', 'City', 'Adexchange', 'Adslotvisibility', 
                  'Adslotformat', 'Adslotfloorprice', 'AdvertiserID', 'Area']


In [25]:
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),  # Handle missing values
    ("encoder", OneHotEncoder(handle_unknown="ignore"))  # One-Hot Encoding
])

# Define preprocessing for numerical features
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0))  # Fill missing values with 0
])

# Combine transformers
preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, categorical_cols),
    ("num", num_pipeline, numerical_cols)
])

In [30]:
target = 'Biddingprice'

# Train-test split
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(n_estimators=100, learning_rate=0.05, reg_lambda=1, reg_alpha=0.5, random_state=42))
])

In [35]:
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 0.0253


In [39]:
import joblib
import os
os.makedirs("bin", exist_ok=True)
model_path = "bin/xgb_regressor_model.bin"
joblib.dump(model, model_path)

['bin/xgb_regressor_model.bin']