In [186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.spatial
import json
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import hamming, pdist, squareform
from sklearn.metrics import jaccard_score

In [187]:
df = pd.read_parquet("wbmasterf.parquet")

In [188]:
df2 = pd.read_csv("checkdat.csv")
wordsz = pd.read_parquet("words.parquet")

# **Part 4B: Evaluating the recommendation model**

From a usability standpoint, presenting our users all 680 words for assessment is not ideal. We are therefore only able to collect data on which words the child is able to speak or not for 40 items. Consequently, there is a very large likelihood that the words then recommended to our user will be ones that they have already learned, especially considering that the nature of this CF model means there is a built-in "popularity bias".

However, this problem does present an new approach for its evaluation: 

- To what extent are the words recommended to our user ones that they have already learned?

Read in json file to extract the chosen assessment words.

In [189]:
with open('output1.json', 'r') as file:
    data = json.load(file)

In [190]:
words = [(item["name"], item["category"]) for item in data if item["is_initial_assessment"]]

In [191]:
ids = []

for word, category in words:
    match = df2[(df2["item_definition"] == word) & (df2["category"] == category)]
    ids.extend(match["item_id"].tolist())

First, I will extract children who are `male` and belong to **`age group`** `5`, just like our test user in the demo. The items considered as assessment words in this evaluation are also those selected for this specific age x sex combo.

In [192]:
testusers = df.drop_duplicates(subset="child_id")
testusers.shape

(4831, 9)

In [193]:
testusers = testusers.groupby(["ageg", "sex"])["child_id"].sample(frac=0.2)
testusers.shape

(967,)

Create a new df with only the information of our chosen test users; remove the test users from our "training data".

In [195]:
mask = df["child_id"].isin(testusers)
testdata = df[mask]
df = df[~mask]
df["child_id"].nunique()

3864

Extract the interaction history of our test users.

In [196]:
interaction_history = pd.DataFrame(columns=['child_id',"item_id"])

interaction_history['child_id'] = testusers
interaction_history["item_id"] = df2["item_id"]

for user in testusers:
    user_df = testdata[(testdata["child_id"] == user) & (testdata["item_id"].isin(ids))]
    interaction_history.loc[interaction_history['child_id'] == user, user_df['item_id']] = user_df['value'].values

interaction_history = interaction_history.fillna(0)

interaction_history.reset_index(drop=True, inplace=True)

Create a df with our test users' interaction history. All items are initialized with `0`, and only switched to `1` if the user has already interacted with the item/learned the word **and** it is also included in our assessment (the list **`ids`**). This mimics the information we would be able to receive in a real-use scenario.

In [197]:
all_item_ids = df2["item_id"].tolist()
interaction_history = pd.DataFrame(columns=["child_id"] + all_item_ids)

interaction_history['child_id'] = testusers

# sloooow, replace w list comp

for user in testusers:
    user_df = testdata[(testdata["child_id"] == user) & (testdata["item_id"].isin(ids))]
    interaction_history.loc[interaction_history['child_id'] == user, user_df['item_id']] = user_df['value'].values

interaction_history = interaction_history.fillna(0)

interaction_history.reset_index(drop=True, inplace=True)

Business as usual.

In [198]:
user_item_matrix = df.pivot(index="child_id", columns="item_id", values="value")
itemsorted = sorted(user_item_matrix.columns, key=lambda x: int(x.split("_")[1]))
user_item_matrix = user_item_matrix[itemsorted]
user_item_matrix.columns = [f"item_{int(col.split('_')[1]) - 1}" if col.startswith("item_") else col for col in user_item_matrix.columns]
target_item_matrix = user_item_matrix.T
itemsim = (scipy.spatial.distance.cdist(target_item_matrix.values, target_item_matrix.values, metric='jaccard'))
knn = NearestNeighbors(n_neighbors=5, metric='jaccard', algorithm='brute')
knn.fit(user_item_matrix)

In [199]:
user_weight = 0.3 # value doesnt seem to matter
item_weight = 1 - user_weight

In [200]:
def recommendations_model(user_interactions):
    distance, neighbor_indices = knn.kneighbors([user_interactions])

    user_based_recommendations = []
    for neighbor_index in neighbor_indices[0]:
        neighbor_interactions = user_item_matrix.iloc[neighbor_index] 
                                # change target_matrix to user_item_matrix to drop segmentation
        
        user_based_recommendations.extend([int(item.split('_')[1]) for item in neighbor_interactions[neighbor_interactions == 1].index])

    user_based_recommendations = [item for item in user_based_recommendations if user_interactions[item] == 0]
    user_based_recommendations = list(set(user_based_recommendations))

    item_scores = {
        item_id: sum(
            itemsim[item_id, user_interaction]
            for user_interaction, interaction_value in enumerate(user_interactions)
            if interaction_value == 1
        ) / sum(user_interactions)
        for item_id in user_based_recommendations
    }

    final_scores = {
        item_id: combined_score
        for item_id, neighbor_interaction in zip(item_scores.keys(), neighbor_interactions)
        for combined_score in [(distance * user_weight + item_scores[item_id] * item_weight)/2]
        if item_id not in combined_score or combined_score < final_scores[item_id]
    }

    final_scores = {key: np.min(value) for key, value in final_scores.items()}
    final_scores = {key+1: value for key, value in final_scores.items()}
    final_scores = dict(sorted(final_scores.items(), key=lambda item: item[1])[:6])

    return final_scores

def apply_recommendations_model(row):
    user_interactions = row.values
    return recommendations_model(user_interactions)

In [201]:
inthist = interaction_history.set_index("child_id")

In [202]:
result = inthist.apply(apply_recommendations_model, axis=1)



























































































In [203]:
clean = result.apply(lambda x: [int(key) for key in x.keys()])
clean = clean.to_dict()

In [205]:
fazit = []

for user_id, recommendation in clean.items():
    # Assuming recommendation is a list of item IDs
    for item_id in recommendation:
        item = f"item_{item_id}"
        value = testdata[(testdata["child_id"] == user_id) & (testdata["item_id"] == item)]["value"].values
        fazit.append((user_id, item, value[0]))

fazit = pd.DataFrame(fazit, columns = ["user_id", "item", "redrecs"])
fazit = fazit.groupby("user_id")["redrecs"].sum().reset_index()
fazit["percentage"] = fazit["redrecs"]/6

The table below displays both the count (**`redrecs`**) and **`percentage`** of "redundant recommendations" made for each 5 of our test users. A recommendation is considered redundant when it is a word that the child has actually already learned. As a reminder, the total number of recommendations for each user was 6. 

It's important to note that the prototype allows only specific words to be considered as "exercise words" due to limitations such as 1) the inability to represent all words visually and 2) the absence of audio data for some words. In contrast, I decided to allow all possible words to be suggested as exercise words for this evaluation.

In [206]:
fazit.head(5)

Unnamed: 0,user_id,redrecs,percentage
0,2,5,0.833333
1,4,6,1.0
2,8,6,1.0
3,12,4,0.666667
4,17,1,0.166667


In [207]:
fazit["percentage"].mean()

0.7595291479820628

In [208]:
fazit["redrecs"].mean()

4.557174887892376

As expected, the results are not great, although this notebook (much like **Part 4a**) does not necessarily serve as a "proper" evaluation, but more as a proof of concept. Also, changing weights does not impact the results at all, so I think there might be an error in my code ^^

If properly conducted, however, this method of evaluation could...

- help inform the selection of assessment words
- help inform the selection of weights (user vs. item)
- or even help choose between a user, item or hybrid CF approach