In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

def memory_usage_mb(df):
    """Return the total memory footprint of ``df`` in mebibytes.

    The figure is the sum of ``df.memory_usage(deep=True)``, i.e. it includes
    the index and the actual contents of object columns, divided by 1024**2.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / (1024 * 1024)

In [2]:
# Read the raw event export in a single pass (low_memory=False) and report
# its size before any cleaning is applied.
input_path = "events_data.csv"

df = pd.read_csv(input_path, low_memory=False)
row_count, column_count = df.shape
print(f"Loaded {row_count:,} rows and {column_count} columns")
print(f"Initial memory usage: {memory_usage_mb(df):.2f} MB")

Loaded 134,911 rows and 27 columns
Initial memory usage: 145.28 MB


In [3]:
# Columns removed before cleaning; any that are absent from the frame are
# skipped silently via errors='ignore'.
drop_cols = [
    'id',
    'trackingId',
    'sessionId',
    'userId',
    'ip',
    'customData',
    'expiresAt',
]

df = df.drop(columns=drop_cols, errors='ignore')

In [4]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,eventType,url,screenResolution,browser,language,country,region,city,timezone,clickX,...,domLoadTime,fullLoadTime,ttfb,downlink,effectiveType,rtt,errorMessage,errorSource,errorLine,errorColumn
0,page_view,/,385x854,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,en-US,,,,,,...,,,,,,,,,,
1,page_performance,/home,385x854,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,en-US,,,,,,...,6229.0,-1745939000000.0,291.0,,,,,,,
2,page_hidden,/home,385x854,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,en-US,,,,,,...,,,,,,,,,,
3,page_view,/home,1440x718,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,en-US,,,,,,...,,,,,,,,,,
4,page_performance,/home,1440x718,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,en-US,,,,,,...,4433.0,-1745967000000.0,193.0,,,,,,,


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows values that look invalid — negative
# clickX/clickY/fullLoadTime minimums and a scrollPercent maximum above 100 —
# confirm these are handled by the later cleaning cells.
df.describe()

Unnamed: 0,clickX,clickY,scrollPercent,durationMs,domLoadTime,fullLoadTime,ttfb,downlink,rtt,errorLine,errorColumn
count,44760.0,44760.0,22564.0,30446.0,11685.0,11685.0,11685.0,11994.0,11994.0,203.0,203.0
mean,573.732976,339.447319,55.26786,1889474.0,2952.124947,-40817980000.0,478.015576,5.942805,153.560113,209.743842,13756.684729
std,399.342118,190.63401,70.924162,11412540.0,9630.90811,263918400000.0,1263.026877,3.939981,185.909907,1241.830445,12785.01578
min,-4.0,-87.0,10.0,3.0,19.0,-1748005000000.0,0.0,0.0,0.0,0.0,0.0
25%,245.0,186.0,29.0,11205.25,992.0,0.0,88.0,1.0,0.0,1.0,226.0
50%,488.0,346.0,51.0,66559.0,1889.0,0.0,254.0,7.0,150.0,1.0,15125.0
75%,848.0,474.0,78.0,553137.2,3145.0,0.0,515.0,10.0,250.0,1.0,26162.0
max,2237.0,1901.0,6700.0,861706300.0,688576.0,25231.0,47658.0,10.0,3000.0,7982.0,92052.0


In [6]:
# Column dtypes and non-null counts before any imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134911 entries, 0 to 134910
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   eventType         134911 non-null  object 
 1   url               134911 non-null  object 
 2   screenResolution  134911 non-null  object 
 3   browser           134911 non-null  object 
 4   language          134911 non-null  object 
 5   country           68473 non-null   object 
 6   region            68473 non-null   object 
 7   city              68473 non-null   object 
 8   timezone          132844 non-null  object 
 9   clickX            44760 non-null   float64
 10  clickY            44760 non-null   float64
 11  scrollPercent     22564 non-null   float64
 12  durationMs        30446 non-null   float64
 13  domLoadTime       11685 non-null   float64
 14  fullLoadTime      11685 non-null   float64
 15  ttfb              11685 non-null   float64
 16  downlink          11

In [7]:
# Number of missing values per column.
df.isna().sum()

eventType                0
url                      0
screenResolution         0
browser                  0
language                 0
country              66438
region               66438
city                 66438
timezone              2067
clickX               90151
clickY               90151
scrollPercent       112347
durationMs          104465
domLoadTime         123226
fullLoadTime        123226
ttfb                123226
downlink            122917
effectiveType       122917
rtt                 122917
errorMessage        134708
errorSource         134727
errorLine           134708
errorColumn         134708
dtype: int64

In [8]:
# Basic cleaning: trim leading/trailing whitespace in every object column.
def _strip_if_text(col):
    """Return ``col`` with surrounding whitespace stripped when it is object-typed."""
    if col.dtype != "object":
        return col
    return col.str.strip()


df = df.apply(_strip_if_text)

In [9]:
# Impute missing values, using a strategy suited to each group of columns.

# Text / categorical columns get an explicit placeholder label.
cat_fill = {
    "country": "unknown",
    "region": "unknown",
    "city": "unknown",
    "timezone": "unknown",
    "effectiveType": "unknown",
    "errorMessage": "none",
    "errorSource": "none",
}

# Interaction columns and error line/column positions are filled with 0.
zero_fill_cols = ["clickX", "clickY", "scrollPercent", "errorLine", "errorColumn"]

# Every column that receives a fixed placeholder, mapped to that placeholder.
constant_fill = {**cat_fill, **dict.fromkeys(zero_fill_cols, 0)}
for col, val in constant_fill.items():
    if col not in df.columns:
        continue
    df[col] = df[col].fillna(val)

# Performance metrics: negative readings are treated as invalid (set to NaN),
# then every gap is filled with the median of the remaining values.
perf_cols = ["durationMs", "domLoadTime", "fullLoadTime", "ttfb", "rtt", "downlink"]
for col in perf_cols:
    if col not in df.columns:
        continue
    valid_values = df[col].mask(df[col] < 0)
    df[col] = valid_values.fillna(valid_values.median())

In [10]:
# Verify the imputation step: dtypes/non-null counts, then nulls per column.
# df.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134911 entries, 0 to 134910
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   eventType         134911 non-null  object 
 1   url               134911 non-null  object 
 2   screenResolution  134911 non-null  object 
 3   browser           134911 non-null  object 
 4   language          134911 non-null  object 
 5   country           134911 non-null  object 
 6   region            134911 non-null  object 
 7   city              134911 non-null  object 
 8   timezone          134911 non-null  object 
 9   clickX            134911 non-null  float64
 10  clickY            134911 non-null  float64
 11  scrollPercent     134911 non-null  float64
 12  durationMs        134911 non-null  float64
 13  domLoadTime       134911 non-null  float64
 14  fullLoadTime      134911 non-null  float64
 15  ttfb              134911 non-null  float64
 16  downlink          13

In [12]:
# Handle outliers: clip each numeric column to its 1st–99th percentile range.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    low, high = df[col].quantile([0.01, 0.99])
    # When both percentiles coincide (e.g. errorLine/errorColumn, which are 0
    # in more than 99% of rows after the missing-value fill), clipping would
    # overwrite every real observation and leave the column constant, so
    # such columns are left untouched.
    # NOTE(review): the percentiles are computed on already-imputed data;
    # consider deriving them from the observed values before imputation.
    if low == high:
        continue
    df[col] = df[col].clip(lower=low, upper=high)

# scrollPercent is additionally bounded to the [0, 100] range.
if "scrollPercent" in df.columns:
    df["scrollPercent"] = df["scrollPercent"].clip(0, 100)

In [13]:
# Encode categorical features: columns with more than 50 distinct values are
# replaced by each value's relative frequency; the rest get integer codes.
cat_cols = df.select_dtypes(include=["object"]).columns

for col in cat_cols:
    column = df[col]
    if column.nunique(dropna=True) <= 50:
        # low cardinality -> label encode via pandas category codes
        df[col] = column.astype("category").cat.codes
        continue
    # high cardinality -> frequency encode
    df[col] = column.map(column.value_counts(normalize=True))

In [15]:
# Inspect dtypes after encoding. df.info() prints its own report and returns
# None, so it is not wrapped in print() (that would add a stray "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134911 entries, 0 to 134910
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   eventType         134911 non-null  int8   
 1   url               134911 non-null  float64
 2   screenResolution  134911 non-null  float64
 3   browser           134911 non-null  float64
 4   language          134911 non-null  float64
 5   country           134911 non-null  int8   
 6   region            134911 non-null  float64
 7   city              134911 non-null  float64
 8   timezone          134911 non-null  float64
 9   clickX            134911 non-null  float64
 10  clickY            134911 non-null  float64
 11  scrollPercent     134911 non-null  float64
 12  durationMs        134911 non-null  float64
 13  domLoadTime       134911 non-null  float64
 14  fullLoadTime      134911 non-null  float64
 15  ttfb              134911 non-null  float64
 16  downlink          13

In [16]:
# Downcast numeric types to reduce memory
def _downcast_numeric(col):
    """Return ``col`` in the smallest numeric dtype pandas will downcast it to.

    Non-numeric columns are returned unchanged. This replaces the deprecated
    ``errors="ignore"`` option of ``pd.to_numeric`` with an explicit dtype
    check. Numeric columns go through the integer downcast first and the
    float downcast second, the same order as the previous two-pass version.
    """
    if not pd.api.types.is_numeric_dtype(col):
        return col
    col = pd.to_numeric(col, downcast="integer")
    return pd.to_numeric(col, downcast="float")


df = df.apply(_downcast_numeric)

In [17]:
# Final report: dtypes with deep memory usage, then summary statistics for
# the first ten columns. df.info() prints its own report and returns None,
# so it is called directly instead of being wrapped in print().
print("\n✅ Cleaning complete:")
df.info(memory_usage="deep")
print(df.describe().T.head(10))


✅ Cleaning complete:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134911 entries, 0 to 134910
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   eventType         134911 non-null  int8   
 1   url               134911 non-null  float32
 2   screenResolution  134911 non-null  float32
 3   browser           134911 non-null  float32
 4   language          134911 non-null  float32
 5   country           134911 non-null  int8   
 6   region            134911 non-null  float32
 7   city              134911 non-null  float32
 8   timezone          134911 non-null  float32
 9   clickX            134911 non-null  int16  
 10  clickY            134911 non-null  int16  
 11  scrollPercent     134911 non-null  int8   
 12  durationMs        134911 non-null  float64
 13  domLoadTime       134911 non-null  float32
 14  fullLoadTime      134911 non-null  int8   
 15  ttfb              134911 non-null  float32
 16

In [18]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): CSV stores plain text, so the downcast dtypes chosen above are
# not retained when this file is read back — confirm that is acceptable.
output_path = "cleaned_events.csv"
df.to_csv(output_path, index=False)
print(f"\nSaved cleaned dataset → {output_path}")


Saved cleaned dataset → cleaned_events.csv
