In [1]:
import pandas as pd
import random
import ast
import json
import torch
from sentence_transformers import SentenceTransformer, util

In [None]:
# ASSISTments data variables
# Raw interaction log that is loaded into `assistments_data_whole`.
RAW_DATA_PATH = "../data/ASSISTments/ASSISTments_2017.csv"
# Unique ASSISTments skill names; written by this notebook and read back before embedding.
ASSISTMENTs_SKILL_PATH = "../data/ASSISTments/ASSISTments_skills.csv"
# Interaction data after ASSISTments skills have been replaced by SL skills.
SKILL_MAPPED_DATA_PATH = "../data/ASSISTments/ASSISTments_skill_mapped_data.csv"
# Final JSON output holding the per-user time / skill / response sequences.
PROCESSED_DATA_PATH = "../data/ASSISTments/ASSISTments_processed_data.json"

# SL data variables
# Input list of SL skill names.
SL_SKILL_PATH = "../data/SL/sl_skills.csv"
# ASSISTments-skill -> SL-skill mapping produced by the embedding match.
SKILL_MAP_PATH = "../data/SL/skill_map.csv"

# pytorch/ml variables
# Use the GPU for sentence encoding when one is available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Name of the sentence-transformers model used to embed skill names.
SKILL_EMBEDDING_MODEL = "all-MiniLM-L6-v2"

In [3]:
# Load the full raw ASSISTments CSV into memory (the overview cell below reports ~6.1M rows).
assistments_data_whole = pd.read_csv(RAW_DATA_PATH)

In [4]:
# Lift pandas' truncation limits so printed frames show every column and row.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [5]:
# Quick overview of the raw frame: row count, column names, first rows.
print(
    len(assistments_data_whole),
    assistments_data_whole.columns,
    assistments_data_whole.head(),
    sep="\n",
)

6123270
Index(['problem_log_id', 'skill', 'problem_id', 'user_id', 'assignment_id',
       'assistment_id', 'start_time', 'end_time', 'problem_type', 'original',
       'correct', 'bottom_hint', 'hint_count', 'actions', 'attempt_count',
       'ms_first_response', 'tutor_mode', 'sequence_id', 'student_class_id',
       'position', 'type', 'base_sequence_id', 'skill_id', 'teacher_id',
       'school_id', 'overlap_time', 'template_id', 'answer_id', 'answer_text',
       'first_action', 'problemlogid', 'Average_confidence(FRUSTRATED)',
       'Average_confidence(CONFUSED)', 'Average_confidence(CONCENTRATING)',
       'Average_confidence(BORED)'],
      dtype='object')
   problem_log_id                                 skill  problem_id  user_id  \
0       137792159                                   NaN      557460    61394   
1       138083797                              Rounding      365981    61394   
2       142332619  Multiplication and Division Integers      426415    61394   
3     

In [37]:
# Parse both timestamp columns; "mixed" lets pandas infer the format per value.
for timestamp_column in ("start_time", "end_time"):
    assistments_data_whole[timestamp_column] = pd.to_datetime(
        assistments_data_whole[timestamp_column], format="mixed"
    )

# Seconds spent on each logged problem = end_time - start_time.
elapsed = assistments_data_whole["end_time"] - assistments_data_whole["start_time"]
assistments_data_whole["time_taken"] = elapsed.dt.total_seconds()

In [38]:
# Columns kept for the rest of the pipeline; everything else in the raw log is dropped.
selected_columns = ["user_id", "skill_id", "correct", "problem_type", "start_time", "time_taken", "skill"]
assistments_data = assistments_data_whole[selected_columns]

In [39]:
# Per-column null counts, dtypes, and a preview of the selected columns.
print(
    assistments_data.isnull().sum(),
    assistments_data.dtypes,
    assistments_data.head(),
    sep="\n",
)

user_id               0
skill_id        3411457
correct               0
problem_type          0
start_time            0
time_taken            0
skill           3493190
dtype: int64
user_id                  int64
skill_id               float64
correct                float64
problem_type            object
start_time      datetime64[ns]
time_taken             float64
skill                   object
dtype: object
   user_id  skill_id  correct problem_type          start_time  time_taken  \
0    61394       NaN      1.0     choose_1 2012-09-28 15:11:27       9.856   
1    61394      54.0      1.0      algebra 2012-10-09 11:01:52      21.182   
2    61394     279.0      0.0      algebra 2013-03-07 10:53:20       8.661   
3    61394      79.0      1.0      algebra 2013-08-20 19:54:56      25.753   
4    76592       NaN      1.0     choose_1 2012-09-10 17:20:10     286.579   

                                  skill  
0                                   NaN  
1                              Roun

In [40]:
# Which problem types do the rows without a skill_id belong to?
missing_skill_mask = assistments_data["skill_id"].isnull()
assistments_data.loc[missing_skill_mask, "problem_type"].value_counts()

problem_type
algebra          1507058
choose_1         1492952
fill_in_1         384537
open_response      16869
choose_n            8556
rank                1485
Name: count, dtype: int64

In [41]:
'''
The null values in the data exist because Worcester Polytechnic Institute did not initially map every
 question to a skill, and those questions were left unmapped. Since we cannot assign skills ourselves
 without the question texts, the only practical option is to drop the rows that contain nulls.
'''

# Discard every row that has a null in any of the selected columns.
assistments_data = assistments_data.dropna(axis=0, how="any")

# Number of rows that survive the drop.
print(assistments_data.shape[0])

2630080


In [42]:
# Re-encode users and skills as dense 0-based integer codes (order of first appearance).
user_codes, _ = pd.factorize(assistments_data["user_id"])
skill_codes, _ = pd.factorize(assistments_data["skill"])
assistments_data = assistments_data.assign(user_id=user_codes, skill_id=skill_codes)

In [43]:
# Persist the distinct ASSISTments skill names as a single-column CSV ("skill_name").
unique_skill_names = assistments_data["skill"].unique()
assistments_skills = pd.Series(unique_skill_names, name="skill_name")

assistments_skills.to_csv(ASSISTMENTs_SKILL_PATH, index=False)

In [44]:
# Load both skill vocabularies as plain Python lists of names.
# squeeze("columns") collapses the single-column frame to a Series but never to a scalar:
# a bare .squeeze() on a CSV holding exactly one skill returns a str, and .tolist() then fails.
sl_skills = pd.read_csv(SL_SKILL_PATH).squeeze("columns").tolist()
assistments_skills = pd.read_csv(ASSISTMENTs_SKILL_PATH).squeeze("columns").tolist()

In [45]:
# Sentence-embedding model used to compare skill names from the two vocabularies.
skill_embedding_model = SentenceTransformer(SKILL_EMBEDDING_MODEL)

# Both vocabularies are encoded with the same options so the embeddings are comparable.
encode_options = {"convert_to_tensor": True, "device": DEVICE}
assistments_skill_embeddings = skill_embedding_model.encode(assistments_skills, **encode_options)
sl_skill_embeddings = skill_embedding_model.encode(sl_skills, **encode_options)

# cosine_scores[i][j] = similarity between ASSISTments skill i and SL skill j.
cosine_scores = util.pytorch_cos_sim(assistments_skill_embeddings, sl_skill_embeddings)

In [46]:
# Skill mapping (ASSISTments -> SL, then SL -> ASSISTments for SL skills left unmapped).
#
# Pass 1: every ASSISTments skill is paired with the SL skill of highest cosine similarity.
# Pass 2: every SL skill that pass 1 never selected is attached to the ASSISTments skill
#         it is most similar to, so each SL skill shows up in the map at least once.
# Result: `final_mapping`, one row per ASSISTments skill whose "sl_skill" column is a
#         list of {"skill", "cosine_score"} candidates.

MAPPING_COLUMNS = ["assistments_skill", "sl_skill"]

mapping_result = []

for i, skill in enumerate(assistments_skills):

    # .item() turns the 0-dim tensor into a plain int so it can index Python lists.
    best_match = torch.argmax(cosine_scores[i]).item()
    best_score = cosine_scores[i][best_match].item()

    result = {
        "assistments_skill" : skill,
        "sl_skill" : [
            {
                "skill" : sl_skills[best_match],
                "cosine_score" : best_score
            }
        ]
    }

    mapping_result.append(result)

mapping_result = pd.DataFrame(mapping_result, columns = MAPPING_COLUMNS)


# Mapping in reverse for unmapped sl skills

already_mapped_sl_skills = {candidates[0]["skill"] for candidates in mapping_result["sl_skill"]}

# dict.fromkeys de-duplicates while keeping file order; a set difference would give a
# run-dependent order and therefore a run-dependent candidate order in the final map.
unmapped_sl_skills = [
    sl_skill for sl_skill in dict.fromkeys(sl_skills)
    if sl_skill not in already_mapped_sl_skills
]

targeted_mapping_result = []

# Nothing to encode or compare when pass 1 already covered every SL skill.
if unmapped_sl_skills:
    umapped_sl_skill_embeddings = skill_embedding_model.encode(unmapped_sl_skills, convert_to_tensor = True, device = DEVICE)

    reversed_cosine_scores = util.pytorch_cos_sim(umapped_sl_skill_embeddings, assistments_skill_embeddings)

    for i, skill in enumerate(unmapped_sl_skills):

        best_match = torch.argmax(reversed_cosine_scores[i]).item()
        best_score = reversed_cosine_scores[i][best_match].item()

        result = {
            "assistments_skill" : assistments_skills[best_match],
            "sl_skill" : [
                {
                    "skill" : skill,
                    "cosine_score" : best_score
                }
            ]
        }

        targeted_mapping_result.append(result)

targeted_mapping_result = pd.DataFrame(targeted_mapping_result, columns = MAPPING_COLUMNS)


# Only concatenate frames that actually hold rows.
non_empty_results = [frame for frame in (mapping_result, targeted_mapping_result) if not frame.empty]
combined = pd.concat(non_empty_results, ignore_index=True) if non_empty_results else mapping_result

# Merge the candidate lists of rows sharing an ASSISTments skill (linear-time flatten).
final_mapping = combined.groupby("assistments_skill", as_index=False).agg(
    {"sl_skill": lambda candidate_lists: [match for candidates in candidate_lists for match in candidates]}
)

In [47]:
# Persist the skill map. index=False keeps the meaningless RangeIndex out of the file;
# otherwise it comes back as an "Unnamed: 0" column when the CSV is reloaded.
final_mapping.to_csv(SKILL_MAP_PATH, index=False)


In [8]:
# Reload the saved skill map. The "sl_skill" column is stored as the text repr of a
# list of dicts, so it is parsed back into Python objects with ast.literal_eval.
final_mapping = pd.read_csv(SKILL_MAP_PATH).assign(
    sl_skill=lambda frame: frame["sl_skill"].map(ast.literal_eval)
)

# Lookup table: ASSISTments skill name -> {column name: value} for its mapping row.
skill_map_dict = final_mapping.set_index("assistments_skill").to_dict(orient="index")

In [9]:

def probabilistic_skill_map(original_skill, skill_map=None, rng=None):
    """Translate an ASSISTments skill name into an SL skill name.

    When the skill has several candidate SL skills, one is drawn at random with
    probability proportional to its cosine score.

    Args:
        original_skill: ASSISTments skill name to translate.
        skill_map: Mapping of skill name -> row dict whose "sl_skill" entry is a list of
            {"skill": str, "cosine_score": float} candidates. Defaults to the
            notebook-level ``skill_map_dict``.
        rng: Object exposing ``choices(population, weights=...)`` (e.g. a
            ``random.Random`` instance). Defaults to the ``random`` module.

    Returns:
        The chosen SL skill name, or ``original_skill`` unchanged when the skill is not
        in the map or has no candidates.
    """
    if skill_map is None:
        skill_map = skill_map_dict
    if rng is None:
        rng = random

    row = skill_map.get(original_skill)
    if row is None:
        return original_skill

    matches = row.get("sl_skill", [])
    if not matches:
        return original_skill

    if len(matches) == 1:
        return matches[0]["skill"]

    skills = [m["skill"] for m in matches]
    # Cosine similarity may be negative; random.choices needs non-negative weights.
    weights = [max(float(m["cosine_score"]), 0.0) for m in matches]

    # random.choices raises when the weights sum to zero, so fall back to the
    # single most similar candidate instead of sampling.
    if sum(weights) <= 0:
        return max(matches, key=lambda m: m["cosine_score"])["skill"]

    return rng.choices(skills, weights=weights)[0]


In [10]:
# The mapping draws from Python's global RNG, so seed it to make the mapped
# dataset reproducible from a fresh kernel.
SKILL_MAP_SEED = 42
random.seed(SKILL_MAP_SEED)

# Replace each row's ASSISTments skill with an SL skill (drawn independently per row).
# NOTE: re-running this cell feeds already-mapped names through the mapper again; names
# that are not keys of the skill map are returned unchanged.
assistments_data["skill"] = assistments_data["skill"].apply(probabilistic_skill_map)

# Re-number skills as 1-based ids in order of first appearance.
assistments_data["skill_id"] = pd.factorize(assistments_data["skill"])[0] + 1

In [11]:
# Number of distinct skill names remaining after the mapping step.
print(assistments_data["skill"].nunique())

44


In [12]:
# Checkpoint the skill-mapped data. index=False avoids writing the leftover row index,
# which would otherwise reappear as an "Unnamed: 0" column when the file is read back.
assistments_data.to_csv(SKILL_MAPPED_DATA_PATH, index=False)

In [13]:
# Reload the skill-mapped checkpoint.
assistments_data = pd.read_csv(SKILL_MAPPED_DATA_PATH)

# CSV stores timestamps as text; restore the datetime dtype so the later sort by
# start_time is chronological rather than dependent on string ordering.
assistments_data["start_time"] = pd.to_datetime(assistments_data["start_time"], format="mixed")

In [6]:
# Sanity check on the reloaded data: number of distinct skill ids.
assistments_data["skill_id"].nunique()

44

In [14]:
# Show the distinct skill ids themselves to confirm they form the expected 1-based range.
assistments_data["skill_id"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44])

In [56]:
# Build one sequence per user, ordered by user and then by start time.
# Reassign instead of sorting in place so the cell does not mutate a frame behind
# other references to it.
assistments_data = assistments_data.sort_values(by = ["user_id", "start_time"])

# Bound response times to [0, 600] seconds so negative or extreme durations
# do not dominate the time sequences.
assistments_data["time_taken"] = assistments_data["time_taken"].clip(lower = 0, upper = 600)

grouped_by_user_id = assistments_data.groupby("user_id")

# Single pass over the groups: iterating the groupby once per output list would
# materialise every per-user frame three times.
t_sequences, q_sequences, r_sequences = [], [], []
for _, user_data in grouped_by_user_id:
    t_sequences.append(user_data["time_taken"].tolist())   # seconds per interaction
    q_sequences.append(user_data["skill_id"].tolist())     # skill id per interaction
    r_sequences.append(user_data["correct"].tolist())      # correctness per interaction

In [60]:
# Bundle the three parallel per-user sequence lists under their output keys.
sequence_names = ("t_sequences", "q_sequences", "r_sequences")
sequence_values = (t_sequences, q_sequences, r_sequences)
processed_data = dict(zip(sequence_names, sequence_values))

# Serialise the bundle as a single JSON document.
with open(PROCESSED_DATA_PATH, 'w') as output_file:
    json.dump(processed_data, output_file)