In [None]:
# Building on the directed graph with frozen weights, this solution adds variability to the weights based on each page's impact on user registration

In [1]:
# data reading
import pandas as pd

# Location of the raw hit-log export (relative path)
HITLOG_CSV = "../../data/logs/hitlog_2025-10-27.csv"

df = pd.read_csv(HITLOG_CSV)
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04


In [2]:
# Basic normalization / safety: strip whitespace, coerce timestamp.
#
# Missing values are masked back to NaN after the string cast. A plain
# `.astype(str)` turns NaN into the literal text "nan", which would make the
# dropna below silently keep hits that have no user_id at all.
for col in ["page_name", "page_url", "user_id"]:
    if col in df.columns:
        is_missing = df[col].isna()
        df[col] = df[col].astype(str).str.strip().mask(is_missing)
if "timestamp" in df.columns:
    # Unparseable timestamps become NaT so the dropna below can remove them
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Drop rows without a user_id or timestamp (cannot order), and rows without a
# page_url (they can be neither an article view nor a registration hit).
df = df.dropna(subset=["user_id", "page_url", "timestamp"]).copy()

In [3]:
# Put every hit on a UTC timeline, then order each user's hits chronologically.
# The stable mergesort keeps the original row order for hits with equal timestamps.
df = (
    df.assign(timestamp=pd.to_datetime(df["timestamp"], utc=True))
    .sort_values(by=["user_id", "timestamp"], kind="mergesort")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57+00:00
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55+00:00
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26+00:00
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47+00:00
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04+00:00


In [4]:
# Constants
REG_URL = "/register"
ARTICLE_PREFIX = "/articles/"

# Restrict the log to the two event types the graph is built from:
# article views and registration hits.
is_article = df["page_url"].str.startswith(ARTICLE_PREFIX)
is_registration = df["page_url"].eq(REG_URL)
df = df.loc[is_article | is_registration].copy()

df.head()

Unnamed: 0,page_name,page_url,user_id,timestamp
0,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:02:57+00:00
1,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,u001,2025-10-26 16:03:55+00:00
2,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,u001,2025-10-26 16:06:26+00:00
3,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,u001,2025-10-26 16:09:47+00:00
4,Lifestyle | Mortgage rates fall for third month,/articles/mortgage-rates-fall-for-third-month,u001,2025-10-26 16:11:04+00:00


In [5]:
# Scoring parameters used when registration credit is spread back over a journey.

# Per-step decay as we move backward from /register (closer pages matter more)
DECAY = 0.85
# Base contribution for each article occurrence
BASE_HIT = 1.0
# Extra credit for the first article in the journey (the attractor)
LANDING_BONUS = 0.5

# Tuning guide:
# - DECAY closer to 1.0: earlier pages in the journey receive more credit.
# - DECAY closer to 0.5: influence concentrates on pages right before /register.
# - Larger LANDING_BONUS: rewards pages that likely "attracted" the user to the journey.
# - Larger BASE_HIT: repeated exposures within a journey matter more (they stack).

In [6]:
class Node:
    """A single page in the navigation graph.

    Attributes:
        url: The page URL this node represents.
        name: The page name stored alongside the URL.
        neighbors: Set of Node objects reached directly after this page
            (directed edges, kept for reference/inspection).
        weight: Influence score accumulated across all journeys; starts at 0.0.
    """

    def __init__(self, url: str, name: str):
        self.url, self.name = url, name
        # Score starts empty and is incremented by the journey traversal
        self.weight = 0.0
        # Outgoing edges; a set so repeated transitions are stored once
        self.neighbors = set()

In [7]:
# Shared page registry: maps each unique page_url to its single Node.
# /register is included as well, since it is needed as an edge endpoint.
nodes = dict()

In [None]:
# Register one Node per distinct page; the first-seen page_name is the one kept
first_hits = df.drop_duplicates(subset="page_url")
for page_url, page_name in zip(first_hits["page_url"], first_hits["page_name"]):
    if page_url not in nodes:
        nodes[page_url] = Node(page_url, page_name)

# Add edges: within each user's ordered hits, link every page to the one visited next
for _, visits in df.groupby("user_id", sort=False):
    trail = [nodes[page_url] for page_url in visits["page_url"]]
    for src, dst in zip(trail, trail[1:]):
        src.neighbors.add(dst)

In [9]:
# Journey traversal with decay ranking.
#
# A "journey" is the ordered run of article views a user makes between two
# /register hits (or from their first hit up to the first /register). Each
# registration distributes credit back over that journey. Article views that
# are never followed by a /register earn no credit.

# Start from a clean slate: weights live on the shared `nodes` registry and are
# only ever incremented below, so without this reset re-running the cell would
# stack a second scoring pass on top of the first.
for node in nodes.values():
    node.weight = 0.0

for user_id, grp in df.groupby("user_id", sort=False):
    # Ordered list of article page_urls since the last /register
    journey_urls = []

    # Only page_url is needed, so iterate the column instead of building a row Series per hit
    for url in grp["page_url"]:
        if url.startswith(ARTICLE_PREFIX):
            # Accumulate this occurrence; duplicates are allowed and will stack weight later
            journey_urls.append(url)

        if url == REG_URL:
            # We reached a registration => distribute influence back along the journey
            if journey_urls:
                # 1) Landing bonus to the *first* article of the journey (the attractor)
                nodes[journey_urls[0]].weight += LANDING_BONUS

                # 2) Walk backwards from /register through the journey.
                #    pos_from_end = 0 for the closest page before /register, then 1, 2, ...
                #    contribution = BASE_HIT * (DECAY ** pos_from_end)
                #    A page that appears multiple times accumulates a contribution each time.
                for pos_from_end, url_back in enumerate(reversed(journey_urls)):
                    nodes[url_back].weight += BASE_HIT * (DECAY**pos_from_end)

            # Reset for the next journey
            journey_urls.clear()

In [10]:
# Build result table
#
# One row per article that earned any influence, ranked by total weight
# (ties broken by page_url so the ordering is deterministic).

# Number of rows to show; drives both the printed heading and the displayed
# slice so the two can no longer disagree (the heading said 10, the slice was 50).
TOP_N = 10

records = [
    {
        "page_name": node.name,
        "page_url": node.url,
        "total_weight": round(node.weight, 6),
    }
    for node in nodes.values()
    # /register and articles that never preceded a registration are left out
    if node.url.startswith(ARTICLE_PREFIX) and node.weight > 0
]

result = (
    # Explicit columns keep the frame well-formed even when no article scored
    pd.DataFrame(records, columns=["page_name", "page_url", "total_weight"])
    .sort_values(
        ["total_weight", "page_url"], ascending=[False, True], kind="mergesort"
    )
    .reset_index(drop=True)
)


print(f"\nTop {TOP_N} pages by influence (with decay):")
result.head(TOP_N)


Top 10 pages by influence (with decay):


Unnamed: 0,page_name,page_url,total_weight
0,Science | How to back up your photos,/articles/how-to-back-up-your-photos,4.526799
1,World | Winners and losers from the tax changes,/articles/winners-and-losers-from-the-tax-changes,4.411103
2,Tech | Scientists map deep-sea coral reefs,/articles/scientists-map-deep-sea-coral-reefs,3.49089
3,Lifestyle | The best hikes near London,/articles/the-best-hikes-near-london,3.161852
4,Sport | What next for interest rates?,/articles/what-next-for-interest-rates,3.020577
5,Science | Protein-rich lunches you can make fast,/articles/protein-rich-lunches-you-can-make-fast,2.900414
6,Culture | Volcano eruption prompts evacuations,/articles/volcano-eruption-prompts-evacuations,2.760411
7,Travel | Investor guide to dividend stocks,/articles/investor-guide-to-dividend-stocks,2.5725
8,Tech | Championship title race down to the wire,/articles/championship-title-race-down-to-the-...,1.892616
9,Tech | New smartphone chips promise longer bat...,/articles/new-smartphone-chips-promise-longer-...,1.726607
