In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from IPython.display import display, Markdown
from pathlib import Path

data_dir = Path('/kaggle/input/learning-equality-curriculum-recommendations')

In [2]:
# load the data into pandas dataframes
topics_df = pd.read_csv(data_dir / "topics.csv", index_col=0).fillna({"title": "", "description": ""})
content_df = pd.read_csv(data_dir / "content.csv", index_col=0).fillna("")
correlations_df = pd.read_csv(data_dir / "correlations.csv", index_col=0)

In [3]:
# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    display(Markdown(md))

class Topic:
    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        else:
            return Topic(parent_id)

    @property
    def ancestors(self):
        ancestors = []
        parent = self.parent
        while parent is not None:
            ancestors.append(parent)
            parent = parent.parent
        return ancestors

    @property
    def siblings(self):
        if not self.parent:
            return []
        else:
            return [topic for topic in self.parent.children if topic != self]

    @property
    def content(self):
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        else:
            return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __getattr__(self, name):
        return topics_df.loc[self.id][name]

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        return [Topic(topic_id) for topic_id in topics_df.loc[correlations_df[correlations_df.content_ids.str.contains(self.id)].index].index]

    def __getattr__(self, name):
        return content_df.loc[self.id][name]

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs

## Naive inference example

As a naive example of how we might approach the problem, let's just get a raw embedding of the
topic's breadcrumbs and match those to the content titles (using a pretrained multilingual model).

In [4]:
# setup and imports

!pip install tqdm sentence_transformers

from sentence_transformers import SentenceTransformer
from sklearn.metrics import fbeta_score
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
from annoy import AnnoyIndex

model = SentenceTransformer("distiluse-base-multilingual-cased-v2")

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=09a05521dbaed263f3b29bfa1de4ef952f905b4de18516d7e8ae711600562e3f
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

Downloading:   0%|          | 0.00/744 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [5]:
# generate the embeddings for the topics and content items
# note: you'll want to ensure you have a GPU accelerator enabled in your notebook for this

print("Embedding topics...")
topic_ids = topics_df[topics_df.has_content].index
topic_embeddings = model.encode([Topic(topic_id).get_breadcrumbs() for topic_id in topic_ids])

print("Embedding content...")
content_ids = content_df.index
content_embeddings = model.encode([ContentItem(content_id).title for content_id in content_ids])

Embedding topics...


Batches:   0%|          | 0/1923 [00:00<?, ?it/s]

Embedding content...


Batches:   0%|          | 0/4814 [00:00<?, ?it/s]

In [6]:
# train a nearest neighbors model on the content embeddings
nbrs = NearestNeighbors(n_neighbors=35, algorithm='ball_tree').fit(content_embeddings)

In [7]:
# find the nearest neighbors for a specific sample topic, and calculate performance
# 使用最近邻方法找到最相关的主题，并计算得分
# specify the index of the target topic to use
topic_index = 7888

# calculate the nearest neighbors for the target topic
dist, nb = nbrs.kneighbors([topic_embeddings[topic_index]])
topic = Topic(topic_ids[topic_index])
print("Topic:", topic.get_breadcrumbs())

# get the set of ground truth content IDs correlated to the target topic
true_content_ids = set(correlations_df.loc[topic.id].content_ids.split())

# get the set of content IDs returned by the nearest neighbors model
# (skipping over any content items where the language does not match)
pred_content_ids = []
for cindex in nb[0]:
    cid = content_ids[cindex]
    content = ContentItem(cid)
    if content.language == topic.language:
        pred_content_ids.append(cid)
    # else:
    #     print("Skipping content item with mismatched language:", content.title)

# trim to only the top 20 results
pred_content_ids = set(pred_content_ids[:20])

# display the ground truth and predicted content item titles
print("True content:")
for cid in true_content_ids:
    print("  ", cid, "\t", ContentItem(cid).title)
print("Predicted content:")
if pred_content_ids:
    for cid in pred_content_ids:
        print("  ", cid, "\t", ContentItem(cid).title)
else:
    print("   [None]")

# calculate the confusion matrix variables
tp = len(true_content_ids.intersection(pred_content_ids))
fp = len(pred_content_ids - true_content_ids)
fn = len(true_content_ids - pred_content_ids)

print("Ground truth count:", len(true_content_ids))
print("Predicted count:", len(pred_content_ids))
print("True positives:", tp)
print("False positives:", fp)

# calculate the F2 score
if tp or (fp and fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f2 = tp / (tp + 0.2 * fp + 0.8*fn)
    print("F2:", f2)

Topic: Khan Academy (Español) >> Matemáticas por grado (Perú) >> 3° Secundaria >> Números >> Reescribir números racionales y operaciones de adición y sustracción
True content:
   c_7958da2706e4 	 Sumar y restar números racionales
   c_3b9a21d42875 	 Convertir fracciones a decimales
   c_724886d72d5d 	 Expresiones con números racionales
   c_5ec419614d8b 	 Sumar y restar números racionales: 79% - 79.1 - 58 1/10
   c_a0cce4a12098 	 Desafío de reescribir decimales como fracciones
   c_0f009f729211 	 Sumar y restar números racionales: 0.79 - 4/3 - 1/2 + 150%
Predicted content:
   c_a88e2237af1d 	 IIT JEE Trigonometría peliaguda y álgebra (parte 3)
   c_7958da2706e4 	 Sumar y restar números racionales
Ground truth count: 6
Predicted count: 2
True positives: 1
False positives: 1
F2: 0.1923076923076923


In [8]:
# calculate the mean F2 over a random sampling of 500 topics

f2_scores = []

indices = np.random.choice(len(topic_ids), 500, replace=False)

for topic_index in tqdm(indices):

    # calculate the nearest neighbors for the target topic
    dist, nb = nbrs.kneighbors([topic_embeddings[topic_index]])
    topic = Topic(topic_ids[topic_index])

    # get the set of ground truth content IDs correlated to the target topic
    true_content_ids = set(correlations_df.loc[topic.id].content_ids.split())

    # get the set of content IDs returned by the nearest neighbors model
    # (skipping over any content items where the language does not match)
    pred_content_ids = []
    for cindex in nb[0]:
        cid = content_ids[cindex]
        content = ContentItem(cid)
        if content.language == topic.language:
            pred_content_ids.append(cid)

    # trim to only the top 20 results
    pred_content_ids = set(pred_content_ids[-20:])

    # calculate the confusion matrix variables
    tp = len(true_content_ids.intersection(pred_content_ids))
    fp = len(pred_content_ids - true_content_ids)
    fn = len(true_content_ids - pred_content_ids)

    # calculate the F2 score
    if pred_content_ids:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f2 = tp / (tp + 0.2 * fp + 0.8*fn)
    else:
        f2 = 0

    f2_scores.append(f2)

print("Average F2:", np.mean(f2_scores))

  0%|          | 0/500 [00:00<?, ?it/s]

Average F2: 0.0891591852619031
