# TRIED CLASSIFICATION + REGRESSION APPROACH TO GET BETTER PREDICTIONS

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../../data/raw/train_data.csv')

In [3]:
df.duplicated().sum()

236

In [4]:
df = df.drop_duplicates()

In [5]:
df.duplicated().sum()

0

In [6]:
missing_thresh = 0.70
missing_ratio = df.isnull().mean()
drop_missing = missing_ratio[missing_ratio > missing_thresh].index.tolist()

# Drop columns with only one unique value
drop_constant = [col for col in df.columns if df[col].nunique() == 1]
# Drop columns having string "not available in demo"
drop_demo_cols = [col for col in df.columns if df[col].astype(str).str.contains("not available in demo", case=False).all()]

drop_cols = list(set(drop_missing + drop_constant + drop_demo_cols))

cols_to_keep=['trafficSource.isTrueDirect', 'new_visits', 'totals.bounces', 'trafficSource.adwordsClickInfo.isVideoAd']
for col in cols_to_keep:
    drop_cols.remove(col)

df = df.drop(columns=drop_cols)

print("Columns Dropped: ", drop_cols)

Columns Dropped:  ['trafficSource.adContent', 'device.screenResolution', 'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.page', 'device.mobileDeviceBranding', 'trafficSource.adwordsClickInfo.slot', 'screenSize', 'device.mobileInputSelector', 'device.operatingSystemVersion', 'device.language', 'device.flashVersion', 'totals.visits', 'device.mobileDeviceModel', 'device.browserVersion', 'device.browserSize', 'geoNetwork.networkLocation', 'locationZone', 'device.mobileDeviceMarketingName', 'browserMajor', 'device.screenColors', 'socialEngagementType']


In [7]:
df = df.dropna(subset=['date', 'sessionStart'])

In [8]:
# Replacing placeholder values with NaNs and NaN with 'missing'
df['geoNetwork.region'] = df['geoNetwork.region'].replace("not available in demo dataset", np.nan)
df["geoNetwork.region"] = df["geoNetwork.region"].replace({"(not set)":np.nan})

df["trafficSource.campaign"] = df["trafficSource.campaign"].replace({"(not set)":np.nan})
df["trafficSource.keyword"] = df["trafficSource.keyword"].replace({np.nan:"missing"})

In [9]:
df['totals.bounces'] = df['totals.bounces'].fillna(0)
df['new_visits'] = df['new_visits'].fillna(0)

df['pageViews'] = df['pageViews'].fillna(1)
df['totalHits'] = df['totalHits'].fillna(1)
df['sessionNumber'] = df['sessionNumber'].fillna(1)

df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].fillna(False)
df['trafficSource.adwordsClickInfo.isVideoAd'] = df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)

In [10]:
df['is_campaign_set'] = df['trafficSource.campaign'].notna().astype(int)
df['has_referral'] = df['trafficSource.referralPath'].notna().astype(int)

In [11]:
df['region_city'] = df['geoNetwork.region'].astype(str) + "_" + df['geoNetwork.city'].astype(str)
df['continent_subcontinent'] = df['geoNetwork.continent'].astype(str) + "_" + df['geoNetwork.subContinent'].astype(str)

In [12]:
# Engagement metric: ratio of pageViews to totalHits (+1 to avoid division by 0)
df['page_hit_ratio'] = df['pageViews'] / (df['totalHits'] + 1)
df['page_hit_ratio'] = df['page_hit_ratio'].fillna(0)

# Number of unique sessions per user
user_session_counts = df.groupby('userId')['sessionId'].nunique()
df['user_session_count'] = df['userId'].map(user_session_counts)

# Average purchase value per user
user_avg_purchase = df.groupby('userId')['purchaseValue'].mean()
df['avg_purchase_by_user'] = df['userId'].map(user_avg_purchase)

In [13]:
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df['sessionStart'] = pd.to_datetime(df['sessionStart'], unit='s')

df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['hour'] = df['sessionStart'].dt.hour

# Now drop after extraction
df = df.drop(columns=['date', 'sessionStart'])

In [14]:
for col in ['sessionNumber', 'pageViews', 'totalHits']:
    Q1 = df[col].quantile(0.10)
    Q3 = df[col].quantile(0.90)
    
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [15]:
browser_counts = df['browser'].value_counts()
valid_browsers = browser_counts[browser_counts >= 200].index

df['browser'] = df['browser'].apply(lambda x: x if x in valid_browsers else 'Other')

In [16]:
# Create the bins and labels
max_value = df['purchaseValue'].max()
bins = range(0, int(max_value) + 50, 50)
labels = [f'{i}-{i+49}' for i in bins[:-1]]

# Cut into bins
df['value_bin'] = pd.cut(df['purchaseValue'], bins=bins, labels=labels, right=False)

# Count distribution
distribution = df['value_bin'].value_counts().sort_index()

# Display the distribution
print(distribution)

MemoryError: 