In [1]:
# ============================================
# 📊 SaaS Event Data — Advanced Preprocessing
# ============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder

# === Utility ===
def memory_usage_mb(df):
    """Return the total memory footprint of ``df`` in mebibytes.

    ``deep=True`` asks pandas to measure the Python objects held in
    object-dtype columns rather than only the size of their pointers.
    """
    bytes_per_mib = 1024 * 1024
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / bytes_per_mib

In [2]:
# Read the raw event export. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
input_path = "events_data.csv"
df = pd.read_csv(filepath_or_buffer=input_path, low_memory=False)

# Report the size of the raw frame before any cleaning is applied.
print(
    f"Loaded {len(df):,} rows and {df.shape[1]} columns",
    f"Initial memory usage: {memory_usage_mb(df):.2f} MB",
    sep="\n",
)

Loaded 134,911 rows and 27 columns
Initial memory usage: 145.28 MB


In [3]:
# === Drop irrelevant identifier columns ===
drop_cols = ["id", "trackingId", "sessionId", "userId", "ip", "customData", "expiresAt"]

# errors="ignore" skips any label that is not present in the frame, so the list
# can be passed as-is.
df = df.drop(columns=drop_cols, errors="ignore")

In [4]:
# === Clean strings & fill missing ===

# Trim surrounding whitespace on object-dtype (text) columns; other columns pass
# through unchanged.
df = df.apply(lambda series: series.str.strip() if series.dtype == "object" else series)

# Placeholder label used for each categorical column when the value is missing.
cat_fill = {
    "country": "unknown",
    "region": "unknown",
    "city": "unknown",
    "timezone": "unknown",
    "effectiveType": "unknown",
    "errorMessage": "none",
    "errorSource": "none",
}
for name in [c for c in cat_fill if c in df.columns]:
    df[name] = df[name].fillna(cat_fill[name])

# Click coordinates / scroll depth: a missing value becomes 0.
for name in ("clickX", "clickY", "scrollPercent"):
    if name not in df.columns:
        continue
    df[name] = df[name].fillna(0)

# Timing and network metrics: negative readings are blanked out, then every gap
# is filled with the column median (computed after the negatives are removed).
perf_cols = ["durationMs", "domLoadTime", "fullLoadTime", "ttfb", "rtt", "downlink"]
for name in perf_cols:
    if name not in df.columns:
        continue
    is_negative = df[name] < 0
    df.loc[is_negative, name] = np.nan
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

# Error position fields: a missing value becomes 0.
for name in ("errorLine", "errorColumn"):
    if name not in df.columns:
        continue
    df[name] = df[name].fillna(0)

In [5]:
# === Outlier clipping ===
# Clamp every numeric column to its own 1st-99th percentile range.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for name in numeric_cols:
    series = df[name]
    bounds = series.quantile([0.01, 0.99])
    df[name] = series.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

# scrollPercent is additionally forced into the 0-100 range.
if "scrollPercent" in df.columns:
    df["scrollPercent"] = df["scrollPercent"].clip(lower=0, upper=100)

In [6]:
# ============================================
# 🧩 Feature Engineering
# ============================================

def split_resolution(val):
    """Parse a ``"<width>x<height>"`` string into an ``(int, int)`` pair.

    Letter case and spaces are ignored. Returns ``(np.nan, np.nan)`` when ``val``
    is not a string made of exactly two integer fields separated by ``x``.
    """
    try:
        w, h = val.lower().replace(" ", "").split("x")
        return int(w), int(h)
    except (AttributeError, TypeError, ValueError):
        return np.nan, np.nan


# Split screenResolution -> width & height
if "screenResolution" in df.columns:
    # Parse every row once and build both columns from the list of pairs; wrapping
    # each row's result in its own pd.Series is far slower on a frame this size.
    parsed = [split_resolution(str(raw)) for raw in df["screenResolution"]]
    wh = pd.DataFrame(parsed, index=df.index, columns=["screenWidth", "screenHeight"])
    df["screenWidth"], df["screenHeight"] = wh["screenWidth"], wh["screenHeight"]
    df = df.drop(columns=["screenResolution"])
    print("Parsed screenResolution -> screenWidth/screenHeight")
else:
    # Keep the output schema stable. Existing width/height columns (for example
    # from an earlier run of this cell) are left alone so a re-run does not wipe
    # the derived features.
    for dim in ("screenWidth", "screenHeight"):
        if dim not in df.columns:
            df[dim] = np.nan

# The small epsilon avoids division by zero when a height of 0 was parsed.
df["aspectRatio"] = df["screenWidth"] / (df["screenHeight"] + 1e-6)

# Quantile bin for the screen resolution.
# The raw screenResolution column no longer exists at this point, so binning it
# directly can never happen and would always yield an all-NaN column. The bin is
# therefore derived from the parsed pixel count (width * height).
# NOTE(review): pixel count is the assumed intent for "resolution" -- confirm.
pixel_count = df["screenWidth"] * df["screenHeight"]
if pixel_count.notna().any():
    try:
        # rank(method="first") breaks ties so heavily repeated resolutions still
        # spread across deciles; rows without a parsed resolution stay NaN.
        df["screenResolution_bin"] = pd.qcut(
            pixel_count.rank(method="first"), q=10, labels=False, duplicates="drop"
        )
        print("Binned screen pixel count into deciles -> screenResolution_bin")
    except ValueError as exc:
        # Report the failure instead of swallowing it, and still create the column.
        df["screenResolution_bin"] = np.nan
        print(f"Could not bin screen resolution ({exc}); screenResolution_bin set to NaN")
else:
    df["screenResolution_bin"] = np.nan

Parsed screenResolution -> screenWidth/screenHeight


In [7]:
# ============================================
# 🧮 Encoding Categorical Variables
# ============================================
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

# Partition the text columns by their number of distinct non-null values:
# at most 50 -> low cardinality, more than 50 -> high cardinality.
low_card, high_card = [], []
for name in cat_cols:
    bucket = low_card if df[name].nunique() <= 50 else high_card
    bucket.append(name)

# Low-cardinality columns are replaced by ordinal codes.
if low_card:
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    ordinal_codes = enc.fit_transform(df[low_card])
    df[low_card] = ordinal_codes

# High-cardinality columns (like url) are target-encoded against durationMs.
# NOTE(review): the encoder is fit on the whole frame; if durationMs is later used
# as a model target this leaks label information -- confirm whether the fit
# should be restricted to a training split.
if high_card:
    target_col = "durationMs"
    target_enc = TargetEncoder(cols=high_card)
    target_codes = target_enc.fit_transform(df[high_card], df[target_col])
    df[high_card] = target_codes

In [8]:
# ============================================
# 📈 Target Transformation (optional)
# ============================================
# Store log(1 + durationMs) next to the raw duration column.
df["durationMs_log"] = df["durationMs"].pipe(np.log1p)

In [9]:
# ============================================
# 🧹 Downcast numeric columns
# ============================================
# 64-bit floats are handled first, then 64-bit integers; each column is offered
# to pandas for conversion to a smaller dtype of the same kind.
for wide_dtype, narrow_kind in (("float64", "float"), ("int64", "integer")):
    for name in df.select_dtypes(include=[wide_dtype]).columns:
        df[name] = pd.to_numeric(df[name], downcast=narrow_kind)

print("\nâœ… Final schema:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line after the
# schema.
df.info(memory_usage="deep")
print(f"Final memory: {memory_usage_mb(df):.2f} MB")


âœ… Final schema:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134911 entries, 0 to 134910
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   eventType             134911 non-null  float32
 1   url                   134911 non-null  float64
 2   browser               134911 non-null  float64
 3   language              134911 non-null  float64
 4   country               134911 non-null  float32
 5   region                134911 non-null  float64
 6   city                  134911 non-null  float64
 7   timezone              134911 non-null  float64
 8   clickX                134911 non-null  float32
 9   clickY                134911 non-null  float32
 10  scrollPercent         134911 non-null  float32
 11  durationMs            134911 non-null  float64
 12  domLoadTime           134911 non-null  float32
 13  fullLoadTime          134911 non-null  float32
 14  ttfb                  134911 non-

In [10]:
# ============================================
# 💾 Save cleaned dataset
# ============================================
# Write the processed frame to CSV without the row index.
output_path = "cleaned_advanced_events.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"\nSaved optimized dataset â†’ {output_path}")


Saved optimized dataset â†’ cleaned_advanced_events.csv


In [11]:
# Plain-text preview of the first five processed rows (head() defaults to 5).
print(df.head())

   eventType            url       browser       language  country  \
0        6.0   78771.546408  57972.627262  270437.629695     41.0   
1        5.0  123093.997346  57972.627262  270437.629695     41.0   
2        3.0  123093.997346  57972.627262  270437.629695     41.0   
3        6.0  123093.997346  61480.587613  270437.629695     41.0   
4        5.0  123093.997346  61480.587613  270437.629695     41.0   

          region           city       timezone  clickX  clickY  ...    rtt  \
0  373820.298981  373820.298981  382949.547218     0.0     0.0  ...  150.0   
1  373820.298981  373820.298981  382949.547218     0.0     0.0  ...  150.0   
2  373820.298981  373820.298981  382949.547218     0.0     0.0  ...  150.0   
3  373820.298981  373820.298981  382949.547218     0.0     0.0  ...  150.0   
4  373820.298981  373820.298981  382949.547218     0.0     0.0  ...  150.0   

   errorMessage  errorSource  errorLine  errorColumn  screenWidth  \
0          21.0         23.0        0.0        