In [1]:
# This notebook presents the frequency-counter-based solution to the exercise.

In [2]:
# data reading
import pandas as pd

# Location of the raw hit log, relative to this notebook.
HITLOG_CSV = "../../data/logs/hitlog_2025-10-27.csv"

# Load the hit log and preview the first rows.
df = pd.read_csv(HITLOG_CSV)
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04


In [3]:
# Basic normalization / safety: strip whitespace, coerce timestamp
for col in ["page_name", "page_url", "user_id"]:
    if col in df.columns:
        # astype(str) turns a missing value into the literal text "nan", which
        # the dropna() below could never remove. Remember which cells really
        # hold a value and restore NaN everywhere else after stripping.
        has_value = df[col].notna()
        df[col] = df[col].astype(str).str.strip().where(has_value)
if "timestamp" in df.columns:
    # Unparseable timestamps become NaT so that dropna() below removes them.
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Drop rows without a user_id or timestamp (cannot order) and rows without a
# page_url (cannot be classified as an article view or a registration).
df = df.dropna(subset=["user_id", "page_url", "timestamp"]).copy()

In [4]:
# Put every user's hits in chronological order.
# Timestamps are made timezone-aware (UTC) first; the index is rebuilt so it
# runs 0..n-1 in the new order.
ts_utc = pd.to_datetime(df["timestamp"], utc=True)
df = (
    df.assign(timestamp=ts_utc)
    .sort_values(by=["user_id", "timestamp"], kind="mergesort")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57+00:00
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55+00:00
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26+00:00
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47+00:00
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04+00:00


In [5]:
# URL conventions used throughout the analysis
REG_URL = "/register"
ARTICLE_PREFIX = "/articles/"

# Only two kinds of hit matter: article views and the registration page.
is_article = df["page_url"].str.startswith(ARTICLE_PREFIX)
is_register = df["page_url"] == REG_URL
flt = is_article | is_register

# Work on an independent copy of the matching rows.
df = df[flt].copy()
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57+00:00
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55+00:00
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26+00:00
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47+00:00
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04+00:00


In [6]:
from collections import defaultdict, Counter

# Core traversal across ALL users:
# - Deduplicate articles within a journey using a set
# - On /register, increment counts for each unique article in the set
# - Then reset the set for the next journey
#
# Relies on df already holding each user's rows in chronological order and
# containing only article and /register URLs.

counts = Counter()  # article URL -> number of journeys that ended in /register

for user_id, grp in df.groupby("user_id", sort=False):
    seen_since_reg = set()  # unique article URLs seen since last /register

    # Iterate the URL column directly: iterrows() would build a full Series
    # for every row only to read this one field.
    for url in grp["page_url"]:
        if url == REG_URL:
            # Registration ends the current journey: count + reset
            counts.update(seen_since_reg)
            seen_since_reg.clear()
        elif url.startswith(ARTICLE_PREFIX):
            # Record article URLs only
            seen_since_reg.add(url)

# NOTE: leftover articles without a trailing /register are ignored by design

print(counts)

Counter({'/articles/winners-and-losers-from-the-tax-changes': 5, '/articles/championship-title-race-down-to-the-wire': 4, '/articles/protein-rich-lunches-you-can-make-fast': 3, '/articles/mortgage-rates-fall-for-third-month': 3, '/articles/scientists-map-deep-sea-coral-reefs': 3, '/articles/volcano-eruption-prompts-evacuations': 3, '/articles/what-next-for-interest-rates': 3, '/articles/how-to-back-up-your-photos': 3, '/articles/university-rankings-shake-up-explained': 2, '/articles/uk-house-prices-show-signs-of-stabilising': 2, '/articles/top-museums-to-visit-this-weekend': 2, '/articles/hidden-gems-for-autumn-city-breaks': 2, '/articles/electric-cars--charging-myths-debunked': 2, '/articles/investor-guide-to-dividend-stocks': 2, '/articles/gardeners--tips-for-late-blooms': 2, '/articles/the-future-of-home-working': 1, '/articles/inside-the-race-to-build-fusion': 1, '/articles/what-the-new-data-rules-mean-for-you': 1, '/articles/ai-start-ups-race-to-raise-funding': 1, '/articles/europ

In [7]:
# Build results dataframe: one row per article URL with its display name and
# total, ordered by total (descending) and then URL (ascending).

if counts:
    # Exactly one display name per URL. De-duplicating on the (page_name,
    # page_url) pair would keep several rows for a URL that was logged under
    # more than one name, and the merge below would then list that article
    # multiple times. The first name in df's current row order is kept.
    page_names = df[["page_name", "page_url"]].drop_duplicates(subset="page_url")

    result = (
        pd.DataFrame(
            [{"page_url": u, "total": int(t)} for u, t in counts.items()],
            columns=["page_url", "total"],
        )
        # validate= makes pandas raise if either side ever has a repeated URL.
        .merge(page_names, on="page_url", how="left", validate="one_to_one")
        .loc[:, ["page_name", "page_url", "total"]]
        .sort_values(["total", "page_url"], ascending=[False, True], kind="mergesort")
        .reset_index(drop=True)
    )
else:
    # No journey ended in a registration: keep the schema, return no rows.
    result = pd.DataFrame(columns=["page_name", "page_url", "total"])


result.head(36)

Unnamed: 0,page_name,page_url,total
0,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,5
1,Tech | Championship title race down to the wire,/articles/championship-title-race-down-to-the-...,4
2,Science | How to back up your photos,/articles/how-to-back-up-your-photos,3
3,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,3
4,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,3
5,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,3
6,Culture | Volcano eruption prompts evacuations,/articles/volcano-eruption-prompts-evacuations,3
7,Sport | What next for interest rates?,/articles/what-next-for-interest-rates,3
8,Sport | Electric cars: charging myths debunked,/articles/electric-cars--charging-myths-debunked,2
9,Sport | Gardeners’ tips for late blooms,/articles/gardeners--tips-for-late-blooms,2
