In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.spatial
import json
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import hamming, pdist, squareform
from sklearn.metrics import jaccard_score

In [38]:
df = pd.read_csv("wbmaster.csv")

In [5]:
df2 = pd.read_csv("checkdat.csv")

# **Part 4B: Evaluating the recommendation model**

Read in json file to extract the chosen assessment words.

In [3]:
with open('output1.json', 'r') as file:
    data = json.load(file)

In [22]:
words = [(item["name"], item["category"]) for item in data if item["is_initial_assessment"]]

In [31]:
ids = []

for word, category in words:
    match = df2[(df2["item_definition"] == word) & (df2["category"] == category)]
    ids.extend(match["item_id"].tolist())

From a usability standpoint, presenting our users all 680 words for assessment is not ideal. We are therefore only able to collect data on which words the child is able to speak or not for 40 items. Consequently, there is a very large likelihood that the words then recommended to our user will be ones that they have already learned, especially considering that the nature of this CF model means there is a built-in "popularity bias".

However, this problem does present an new approach for its evaluation: 

- To what extent are the words recommended to our user ones that they have already learned?

First, I will extract 5 children who are `male` and belong to **`age group`** `5`, just like our test user in the demo.

In [9]:
testusers = df[(df["ageg"] == 3) & (df["sex"] == "Male")]["child_id"].sample(5)

Create a new df with only the information of our chosen test users; remove the test users from our "training data".

In [10]:
mask = df["child_id"].isin(testusers)
testdata = df[mask]
df = df[~mask]

Extract the interaction history of our test users.

In [11]:
interaction_history = pd.DataFrame(columns=['child_id',"item_id"])

interaction_history['child_id'] = testusers
interaction_history["item_id"] = df2["item_id"]

for user in testusers:
    user_df = testdata[(testdata["child_id"] == user) & (testdata["item_id"].isin(ids))]
    interaction_history.loc[interaction_history['child_id'] == user, user_df['item_id']] = user_df['value'].values

interaction_history = interaction_history.fillna(0)

interaction_history.reset_index(drop=True, inplace=True)

In [42]:
interaction_history

Unnamed: 0,child_id,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,...,item_671,item_672,item_673,item_674,item_675,item_676,item_677,item_678,item_679,item_680
0,1384,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,511,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,782,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1771,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,86342,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
df[df["child_id"] == 1384].iloc[:,1:]

Unnamed: 0,item_id,item_definition,child_id,age,value,production,sex,ageg
919360,item_1,baa baa,1384,24,1,317,Male,3
919361,item_2,choo choo,1384,24,0,317,Male,3
919362,item_3,cockadoodledoo,1384,24,0,317,Male,3
919363,item_4,grrr,1384,24,1,317,Male,3
919364,item_5,meow,1384,24,1,317,Male,3
...,...,...,...,...,...,...,...,...
920035,item_676,because,1384,24,0,317,Male,3
920036,item_677,but,1384,24,0,317,Male,3
920037,item_678,if,1384,24,0,317,Male,3
920038,item_679,so,1384,24,0,317,Male,3


Create a df with our test users' interaction history. All items are initialized with `0`, and only switched to `1` if the user has already interacted with the item/learned the word **and** it is also included in our assessment (the list **`ids`**). This mimics the information we would be able to receive in a real-use scenario.

In [48]:
all_item_ids = df2["item_id"].tolist()
interaction_history = pd.DataFrame(columns=["child_id"] + all_item_ids)

interaction_history['child_id'] = testusers

for user in testusers:
    user_df = testdata[(testdata["child_id"] == user) & (testdata["item_id"].isin(ids))]
    interaction_history.loc[interaction_history['child_id'] == user, user_df['item_id']] = user_df['value'].values

interaction_history = interaction_history.fillna(0)

interaction_history.reset_index(drop=True, inplace=True)

In [54]:
interaction_history

Unnamed: 0,child_id,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,...,item_671,item_672,item_673,item_674,item_675,item_676,item_677,item_678,item_679,item_680
0,1384,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,511,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,782,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1771,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,86342,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Bus as usual.

In [50]:
user_item_matrix = df.pivot(index="child_id", columns="item_id", values="value")
itemsorted = sorted(user_item_matrix.columns, key=lambda x: int(x.split("_")[1]))
user_item_matrix = user_item_matrix[itemsorted]
user_item_matrix.columns = [f"item_{int(col.split('_')[1]) - 1}" if col.startswith("item_") else col for col in user_item_matrix.columns]
target_item_matrix = user_item_matrix.T
itemsim = (scipy.spatial.distance.cdist(target_item_matrix.values, target_item_matrix.values, metric='jaccard'))
knn = NearestNeighbors(n_neighbors=5, metric='jaccard', algorithm='brute')
knn.fit(user_item_matrix)

Create a for loop, in which the interaction history of our test users is used to determine which of our training users are most similar to them.

In [52]:
interactionhistory = {}

for user in testusers:
    user_interactions = interaction_history[interaction_history["child_id"] == user].iloc[:,1:].values
    distance, neighbor_indices = knn.kneighbors(user_interactions)
    neighbor_indices = neighbor_indices[0]
    neighbor_id = user_item_matrix.index[neighbor_indices]
    neighbors_info = legend[legend["child_id"].isin(neighbor_id)][["child_id", "ageg", "sex"]]
    interactionhistory[user] = distance, neighbor_id, neighbors_info



In [69]:
user_recommendations = {}

for index, row in interaction_history.iterrows():
    user_interactions = row[1:] 
    user_id = row['child_id']

    distance, neighbor_indices = knn.kneighbors([user_interactions])

    user_based_recommendations = []
    for neighbor_index in neighbor_indices[0]:
        neighbor_interactions = user_item_matrix.iloc[neighbor_index]

        user_based_recommendations.extend([int(item.split('_')[1]) for item in neighbor_interactions[neighbor_interactions == 1].index])

    user_based_recommendations = [item for item in user_based_recommendations if 0 <= item <= len(user_interactions) and user_interactions[item] == 0]
    user_based_recommendations = list(set(user_based_recommendations))

    item_scores = {}
    for item_id in range(len(itemsim)):
        similar_items = np.argsort(itemsim[item_id])[::-1]
        similar_items = [item for item in similar_items if user_interactions[item] == 0]
        similarity_score = sum(itemsim[item_id][similar_items])
        item_scores[item_id] = similarity_score

    sorted_items = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)
    item_based_recommendations = [item_id for item_id, _ in sorted_items]
    item_based_recommendations = list(set(item_based_recommendations))

    user_weight = 0.3 
    item_weight = 1 - user_weight

    combined_recommendations = []
    for item_id in user_based_recommendations:
        user_score = 1 
        item_score = item_scores.get(item_id, 0)
        combined_score = (user_weight * user_score) + (item_weight * item_score)

        combined_recommendations.append((item_id, combined_score))

    combined_recommendations.sort(key=lambda x: x[1], reverse=True)

    combined_recommendations = [(item_id + 1, combined_score) for item_id, combined_score in combined_recommendations]

    final_items = [item_id for item_id, _ in combined_recommendations][0:6]

    user_recommendations[user_id] = final_items

# Print the recommendations for each user (optional)
#for user_id, recommendations in user_recommendations.items():
    #print(f"Final Recommendations for User {user_id} (w/ Scores):", recommendations)



Final Recommendations for User 1384 (w/ Scores): [103, 64, 152, 151, 156, 68]
Final Recommendations for User 511 (w/ Scores): [366, 356, 371, 9, 10, 4]
Final Recommendations for User 782 (w/ Scores): [103, 204, 352, 168, 366, 356]
Final Recommendations for User 1771 (w/ Scores): [40, 610, 204, 72, 366, 356]
Final Recommendations for User 86342 (w/ Scores): [68, 366, 356, 9, 10, 70]


In [122]:
fazit = []

for user_id, recommendation in user_recommendations.items():
    # Assuming recommendation is a list of item IDs
    for item_id in recommendation:
        item = f"item_{item_id}"
        value = testdata[(testdata["child_id"] == user_id) & (testdata["item_id"] == item)]["value"].values
        fazit.append((user_id, item, value[0]))

fazit = pd.DataFrame(fazit, columns = ["user_id", "item", "redrecs"])
fazit = fazit.groupby("user_id")["redrecs"].sum().reset_index()
fazit["percentage"] = fazit["redrecs"]/6

The table below displays both the count (**`redrecs`**) and **`percentage`** of "redundant recommendations" made for each 5 of our test users. A recommendation is considered redundant when it is a word that the child has actually already learned. As a reminder, the total number of recommendations for each user was 6. 

It's important to note that the prototype allows only specific words to be considered as "exercise words" due to limitations such as 1) the inability to represent all words visually and 2) the absence of audio data for some words. In contrast, I decided to allow all possible words to be suggested as exercise words for this evaluation.

In [123]:
fazit

Unnamed: 0,user_id,redrecs,percentage
0,511,3,0.5
1,782,3,0.5
2,1384,0,0.0
3,1771,5,0.833333
4,86342,5,0.833333


As expected, the results are not great, although this notebook (much like **Part 4a**) does not necessarily serve as a "proper" evaluation, but more as a proof of concept. The number of test users was intentionally kept small and chosen arbitrarily; the test users were limited to only one age x sex combination; and no statistical metrics were calculated, so the magnitude of this problem cannot be estimated properly.

If properly conducted, however, this method of evaluation could...

- help inform the selection of assessment words
- help inform the selection of weights (user vs. item)
- or even help choose between a user, item or hybrid CF approach