# Libraries

In [1]:
import numpy as np
import pandas as pd
import json
import math

# Data

## Impressions

In [2]:
raw_behaviour = pd.read_csv(
    "MINDsmall_train/behaviors.tsv",
    sep="\t",
    names=["impressionId", "userId", "timestamp", "click_history", "impressions"],
)

print(f"The dataset originally consist of {len(raw_behaviour)} number of interactions.")
raw_behaviour.head()

The dataset originally consist of 156965 number of interactions.


Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [3]:
# Function to split the impressions and clicks into two seperate lists
def process_impression(impression_list):
    list_of_strings = impression_list.split()
    click = [x.split("-")[0] for x in list_of_strings if x.split("-")[1] == "1"]
    non_click = [x.split("-")[0] for x in list_of_strings if x.split("-")[1] == "0"]
    return click, non_click


# We can then indexize these two new columns:
raw_behaviour["click"], raw_behaviour["noclicks"] = zip(
    *raw_behaviour["impressions"].map(process_impression)
)

In [4]:
# Convert timestamp value to hours since epoch
raw_behaviour["epochhrs"] = (
    pd.to_datetime(raw_behaviour["timestamp"]).values.astype(np.int64)
    / (1e6)
    / 1000
    / 3600
)
raw_behaviour["epochhrs"] = raw_behaviour["epochhrs"].round()

### Click History

In [5]:
# If there exists several clicks in one session, expand to new observation
raw_behaviour = raw_behaviour.explode("click").reset_index(drop=True)

# Extract the clicks from the previous clicks
click_history = raw_behaviour[["userId", "click_history"]].drop_duplicates().dropna()
click_history["click_history"] = click_history.click_history.map(lambda x: x.split())
click_history = click_history.explode("click_history").rename(
    columns={"click_history": "click"}
)
# Dummy time set to earlies epochhrs in raw_behaviour as we don't know when these events took place.
click_history["epochhrs"] = raw_behaviour.epochhrs.min()
click_history["noclicks"] = pd.Series([[] for _ in range(len(click_history.index))])

# concatenate historical clicks with the raw_behaviour
raw_behaviour = pd.concat([raw_behaviour, click_history], axis=0).reset_index(drop=True)
print(
    f"The dataset after pre-processing consist of {len(raw_behaviour)} number of interactions."
)

The dataset after pre-processing consist of 1162402 number of interactions.


## Article

In [6]:
def extract_labels(row):
    try:
        if isinstance(row, str):
            entities = json.loads(row)
            labels = [entity["Label"] for entity in entities]
            return labels
        elif isinstance(row, float) and math.isnan(row):
            return []
        else:
            return []
    except (ValueError, KeyError):
        return []

In [7]:
news = pd.read_csv(
    "MINDsmall_train/news.tsv",
    sep="\t",
    names=[
        "itemId",
        "category",
        "subcategory",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)
print(f"The article data consist in total of {len(news)} number of articles.")

The article data consist in total of 51282 number of articles.


In [8]:
news["abstract_entities_labels"] = news["abstract_entities"].apply(extract_labels)
news["title_entities_labels"] = news["title_entities"].apply(extract_labels)

In [9]:
news.drop(
    ["url", "title", "abstract", "title_entities", "abstract_entities"],
    axis=1,
    inplace=True,
)

I couldnt really find any datasets with like an affiliate or allignment score, so for now i just randomly generated a score for it. Maybe I can use the unused data from the news dataset to come up with one based on the content?

In [10]:
news["affiliate_score"] = np.random.rand(len(news))

In [11]:
news.head()

Unnamed: 0,itemId,category,subcategory,abstract_entities_labels,title_entities_labels,affiliate_score
0,N55528,lifestyle,lifestyleroyals,[],"[Prince Philip, Duke of Edinburgh, Charles, Pr...",0.310886
1,N19639,health,weightloss,[Adipose tissue],[Adipose tissue],0.503125
2,N61837,news,newsworld,[Ukraine],[],0.933823
3,N53526,health,voices,[National Basketball Association],[],0.198382
4,N38324,health,medical,"[Skin tag, Dermatology, Reader's Digest]",[Skin tag],0.780608


## Removing entries with very few impressions

In [12]:
min_click_cutoff = 100
print(
    f"Number of items that have less than {min_click_cutoff} clicks make up",
    np.round(
        np.mean(raw_behaviour.groupby("click").size() < min_click_cutoff) * 100, 3
    ),
    "% of the total, and these will be removed.",
)

Number of items that have less than 100 clicks make up 93.852 % of the total, and these will be removed.


In [13]:
# remove items with less clicks than min_click_cutoff
raw_behaviour = raw_behaviour[
    raw_behaviour.groupby("click")["userId"].transform("size") >= min_click_cutoff
].reset_index(drop=True)
# Get a set with all the unique items
click_set = set(raw_behaviour["click"].unique())

# remove items for impressions that is not avaiable in the click set (the items that we will be training on)
raw_behaviour["noclicks"] = raw_behaviour["noclicks"].apply(
    lambda impressions: [
        impression for impression in impressions if impression in click_set
    ]
)

In [14]:
raw_behaviour.head()

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,click,noclicks,epochhrs
0,1.0,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N55689,[N35729],437073.0
1,2.0,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N17059,"[N20678, N39317, N58114, N20495, N42977, N1459...",437106.0
2,3.0,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N23814,"[N23877, N35389, N49712, N16844, N59685, N2344...",437143.0
3,4.0,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,N49685,"[N35729, N33632, N27581]",437069.0
4,6.0,U19739,11/11/2019 6:52:13 PM,N39074 N14343 N32607 N32320 N22007 N442 N19001...,N21119-1 N53696-0 N33619-1 N25722-0 N2869-0,N33619,[],437083.0


### Cleaning up behavior

In [15]:
behaviour = raw_behaviour[["userId", "click", "noclicks", "click_history"]].copy()

In [16]:
print("Number of interactions in the behaviour dataset:", behaviour.shape[0])
print("Number of users in the behaviour dataset:", behaviour.userId.nunique())
print("Number of articles in the behaviour dataset:", behaviour.click.nunique())

Number of interactions in the behaviour dataset: 781871
Number of users in the behaviour dataset: 49832
Number of articles in the behaviour dataset: 2451


In [17]:
behaviour.head()

Unnamed: 0,userId,click,noclicks,click_history
0,U13740,N55689,[N35729],N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U91836,N17059,"[N20678, N39317, N58114, N20495, N42977, N1459...",N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2,U73700,N23814,"[N23877, N35389, N49712, N16844, N59685, N2344...",N10732 N25792 N7563 N21087 N41087 N5445 N60384...
3,U34670,N49685,"[N35729, N33632, N27581]",N45729 N2203 N871 N53880 N41375 N43142 N33013 ...
4,U19739,N33619,[],N39074 N14343 N32607 N32320 N22007 N442 N19001...


# Models

## Epsilon greedy(biased)

In [18]:
class BiasedBandit:
    def __init__(self, news_df, behavior_df, num_articles=10, epsilon=0.1):
        self.news_df = news_df
        self.behavior_df = behavior_df
        self.num_articles = num_articles
        self.epsilon = epsilon
        self.article_values = np.zeros(num_articles)
        self.article_clicks = np.zeros(num_articles)
        self.article_ids = self.news_df["itemId"].tolist()
        self.affiliate_scores = self.news_df["affiliate_score"].tolist()

    def select_article(self):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_articles)
        else:
            return np.argmax(self.article_values)

    def update_values(self, article_id, click):
        self.article_clicks[article_id] += click
        total_impressions = 0
        for click_history in self.behavior_df["click_history"]:
            if (
                isinstance(click_history, str)
                and str(self.article_ids[article_id]) in click_history
            ):
                total_impressions += 1
        self.article_values[article_id] = (
            self.article_clicks[article_id] / total_impressions
        ) * self.affiliate_scores[article_id]

    def run_optimization(self, num_iterations):
        for _ in range(num_iterations):
            article_id = self.select_article()
            click = (
                1
                if any(
                    str(self.article_ids[article_id]) in str(x)
                    for x in self.behavior_df["click_history"]
                )
                else 0
            )
            self.update_values(article_id, click)

        return self.article_values

In [19]:
def print_top_articles(article_values, news_df, n=5):
    top_article_ids = np.argsort(-article_values)[:n]
    print(f"Top {n} most valuable articles:")
    for i in top_article_ids:
        print(
            f"Article ID: {news_df.iloc[i]['itemId']}, Value: {article_values[i]:.3f}"
        )

In [20]:
optimizer = BiasedBandit(news, behaviour, num_articles=10, epsilon=0.1)

In [21]:
article_values = optimizer.run_optimization(1000)

  self.article_clicks[article_id] / total_impressions


In [22]:
print_top_articles(article_values, news, n=3)

Top 3 most valuable articles:
Article ID: N24510, Value: 0.876
Article ID: N55528, Value: 0.808
Article ID: N38324, Value: 0.260


## Epsilon Greedy(unbiased)

In [23]:
class UnbiasedBandit:
    def __init__(self, news_df, behavior_df, num_articles=10, epsilon=0.1):
        self.news_df = news_df
        self.behavior_df = behavior_df
        self.num_articles = num_articles
        self.epsilon = epsilon
        self.article_values = np.zeros(num_articles)
        self.article_clicks = np.zeros(num_articles)
        self.article_ids = self.news_df["itemId"].tolist()

    def select_article(self):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_articles)
        else:
            return np.argmax(self.article_values)

    def update_values(self, article_id, click):
        self.article_clicks[article_id] += click
        total_impressions = 0
        for click_history in self.behavior_df["click_history"]:
            if (
                isinstance(click_history, str)
                and str(self.article_ids[article_id]) in click_history
            ):
                total_impressions += 1
        self.article_values[article_id] = (
            self.article_clicks[article_id] / total_impressions
        )

    def run_optimization(self, num_iterations):
        for _ in range(num_iterations):
            article_id = self.select_article()
            click = (
                1
                if any(
                    str(self.article_ids[article_id]) in str(x)
                    for x in self.behavior_df["click_history"]
                )
                else 0
            )
            self.update_values(article_id, click)

        return self.article_values

In [24]:
optimizer = UnbiasedBandit(news, behaviour, num_articles=10, epsilon=0.1)

In [25]:
article_values = optimizer.run_optimization(1000)

  self.article_clicks[article_id] / total_impressions


In [26]:
print_top_articles(article_values, news, n=3)

Top 3 most valuable articles:
Article ID: N24510, Value: 3.250
Article ID: N55528, Value: 3.200
Article ID: N59295, Value: 0.625
