In [1]:
# This notebook presents a graph-based solution in which each article's contribution is frozen at 1 per journey.

In [2]:
# data reading
import pandas as pd

# Path of the raw hit log, relative to the notebook's working directory.
HITLOG_CSV = "../../data/logs/hitlog_2025-10-27.csv"

df = pd.read_csv(HITLOG_CSV)
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04


In [3]:
# Basic normalization / safety: strip whitespace, coerce timestamp.
#
# Missing values must stay missing: a plain `astype(str)` turns NaN into the
# literal string "nan" on object columns, which would make the dropna below a
# no-op for user_id and merge every anonymous hit into one fake user "nan".
for col in ["page_name", "page_url", "user_id"]:
    if col in df.columns:
        was_missing = df[col].isna()
        cleaned = df[col].astype(str).str.strip()
        # Restore NaN for originally-missing cells and for cells that were only whitespace.
        df[col] = cleaned.mask(was_missing | cleaned.eq(""))
if "timestamp" in df.columns:
    # Unparseable timestamps become NaT so they are dropped below instead of raising.
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Drop rows without a user_id or timestamp (cannot order).
# Rows without a page_url are dropped as well: they can be neither an article
# nor a registration event, so they carry no information for the graph.
df = df.dropna(subset=["user_id", "timestamp", "page_url"]).copy()

In [4]:
# Normalize ordering: make timestamps UTC-aware, then order each user's hits
# chronologically. mergesort is a stable sort, so rows that tie on
# (user_id, timestamp) keep their original relative order.
df = (
    df.assign(timestamp=pd.to_datetime(df["timestamp"], utc=True))
    .sort_values(by=["user_id", "timestamp"], kind="mergesort")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57+00:00
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55+00:00
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26+00:00
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47+00:00
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04+00:00


In [6]:
# Constants
REG_URL = "/register"  # exact URL of the registration page
ARTICLE_PREFIX = "/articles/"  # every article URL starts with this prefix

# Keep only article and registration events (graph stays focused)
is_article = df["page_url"].str.startswith(ARTICLE_PREFIX)
is_registration = df["page_url"].eq(REG_URL)
df = df.loc[is_article | is_registration].copy()

df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57+00:00
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55+00:00
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26+00:00
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47+00:00
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04+00:00


In [7]:
# Graph Node definition
class Node:
    """A single page in the navigation graph.

    Instances use the default identity-based hashing, so they can be stored
    in the ``neighbors`` set of other nodes.
    """

    def __init__(self, url: str, name: str) -> None:
        """Create an unconnected node with zero weight.

        Args:
            url: The page_url this node represents.
            name: The page_name shown for that URL (assumed stable for the URL).
        """
        self.url, self.name = url, name
        # Directed edges: nodes for pages visited immediately after this one.
        self.neighbors = set()
        # Accumulated influence count across all journeys.
        self.weight = 0

In [8]:
# Global node registry, shared by the graph-building and traversal cells.
# Maps page_url -> the single Node object that represents that page.
nodes: "dict[str, Node]" = {}

In [9]:
# Build the graph in two passes:
#   1. register one Node per distinct page_url (named after its first occurrence);
#   2. per user, link every visited page to the page visited right after it.

# Pass 1: first occurrence of each URL, in encounter order. URLs already in the
# registry are left untouched.
first_hits = df.drop_duplicates(subset="page_url", keep="first")
for page_url, page_name in zip(first_hits["page_url"], first_hits["page_name"]):
    if page_url not in nodes:
        nodes[page_url] = Node(page_url, page_name)

# Pass 2: directed edges between consecutive hits in each user's sequence.
# Rows keep the order they have in df within each group.
for _user_id, user_hits in df.groupby("user_id", sort=False):
    visited = [nodes[page_url] for page_url in user_hits["page_url"]]
    for source, target in zip(visited, visited[1:]):
        source.neighbors.add(target)

In [10]:
# Walk each user's timeline and apply the journey logic.
# - A journey is the run of events for one user up to and including a /register hit;
#   the next journey for that user starts right after it.
# - Within a journey an article is "frozen" once seen, so repeated hits on the same
#   article still count only once for that journey.
# - On hitting /register, every frozen article gets +1 on its global weight.
# - Articles seen after a user's last /register (or by users who never register)
#   are never committed and contribute nothing.

# Reset weights first so this cell is idempotent: weights live on the shared Node
# objects, and without the reset a re-run would add a second round of counts.
for node in nodes.values():
    node.weight = 0

for _user_id, grp in df.groupby("user_id", sort=False):
    # Article URLs seen in the CURRENT journey (a set, so duplicates collapse).
    seen_in_journey = set()

    # Rows keep the order they have in df within each group.
    for url in grp["page_url"]:
        # Freeze the article for this journey (+1 *pending*).
        if url.startswith(ARTICLE_PREFIX):
            seen_in_journey.add(url)

        # Registration reached: COMMIT +1 for all frozen articles, then start a new journey.
        if url == REG_URL:
            for art_url in seen_in_journey:
                nodes[art_url].weight += 1
            seen_in_journey.clear()

In [11]:
# Build the output DataFrame from node weights.
# Only article nodes with weight > 0 (influential) are included.

# Number of rows to display. Shared by the heading and the table so the two
# cannot drift apart (the heading used to say 10 while the table showed up to 40).
TOP_N = 10

records = [
    {"page_name": n.name, "page_url": n.url, "total": int(n.weight)}
    for n in nodes.values()
    if n.url.startswith(ARTICLE_PREFIX) and n.weight > 0
]

# Highest total first; ties are broken alphabetically by URL for a deterministic order.
# Passing `columns` keeps the frame well-formed even when `records` is empty.
result = (
    pd.DataFrame(records, columns=["page_name", "page_url", "total"])
    .sort_values(["total", "page_url"], ascending=[False, True], kind="mergesort")
    .reset_index(drop=True)
)

print(f"\nTop {TOP_N} influential articles (graph-based):")
result.head(TOP_N)


Top 10 influential articles (graph-based):


Unnamed: 0,page_name,page_url,total
0,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,5
1,Tech | Championship title race down to the wire,/articles/championship-title-race-down-to-the-...,4
2,Science | How to back up your photos,/articles/how-to-back-up-your-photos,3
3,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,3
4,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,3
5,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,3
6,Culture | Volcano eruption prompts evacuations,/articles/volcano-eruption-prompts-evacuations,3
7,Sport | What next for interest rates?,/articles/what-next-for-interest-rates,3
8,Sport | Electric cars: charging myths debunked,/articles/electric-cars--charging-myths-debunked,2
9,Sport | Gardeners’ tips for late blooms,/articles/gardeners--tips-for-late-blooms,2
