# **Transformers and Natural Language Processing for Knowledge Tracing:**

This notebook will contain an implementation of applying TF-IDF to problems textual descriptions, after necessary data preprocessing and of using consine similarity on TF-IDF values to calculate similarity between problems and predict result for following problem. 

# Import necessary libraries:

In [4]:
#import needed libraries 
from urllib.request import urlretrieve
import zipfile, os
import time, sys, copy
import pandas as pd
import scipy.sparse as sps
import numpy as np
from collections import defaultdict
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import hunspell
import psutil
import gc
from sklearn.metrics import roc_auc_score, accuracy_score


# Clone github repositories from previous works

In [6]:
#clone repositeries
!git clone https://github.com/shalini1194/RKT
#!git clone https://github.com/lyf-1/PEBG.git
!git clone https://github.com/jhljx/GKT.git   
!git clone https://github.com/Shivanandmn/Knowledge-Tracing-SAINT.git
!git clone https://github.com/MaurizioFD/RecSys_Course_AT_PoliMi
#copy repositories in working directory
!cp -r ./RKT/* ./
!cp -r ./RecSys_Course_AT_PoliMi/* ./

!pip install hunspell
"""!pip install -r ./RecSys_Course_AT_PoliMi/requirements.txt
!python ./RecSys_Course_AT_PoliMi/run_compile_all_cython.py
"""

Cloning into 'RKT'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 53 (delta 9), reused 44 (delta 7), pack-reused 0[K
Unpacking objects: 100% (53/53), done.
Cloning into 'GKT'...
remote: Enumerating objects: 357, done.[K
remote: Counting objects: 100% (357/357), done.[K
remote: Compressing objects: 100% (247/247), done.[K
remote: Total 357 (delta 216), reused 236 (delta 107), pack-reused 0[K
Receiving objects: 100% (357/357), 17.02 MiB | 10.95 MiB/s, done.
Resolving deltas: 100% (216/216), done.
Cloning into 'Knowledge-Tracing-SAINT'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 19 (delta 6), reused 12 (delta 3), pack-reused 0[K
Unpacking objects: 100% (19/19), done.
Cloning into 'RecSys_Course_AT_PoliMi'...
remote: Enumerating objects: 969, done.[K
remote:

'!pip install -r ./RecSys_Course_AT_PoliMi/requirements.txt\n!python ./RecSys_Course_AT_PoliMi/run_compile_all_cython.py\n'

In [7]:
from Base.Similarity.Compute_Similarity import Compute_Similarity

# Import datasets

Assistments 2012/13

In [None]:
input_folder = '../input/'
# Assistments 2012/13 dataset with problems textual descriptions 
# Data folder + File name must be the path to dataset. 
file_name = 'assesments-12-13-precessed-data/ASSISTments2012DataSet-ProblemBodies.csv'
df = pd.read_csv(os.path.join(input_folder, file_name),low_memory=False)

Junyi

In [None]:
input_folder = '../input/'
# Junyi with problems textual descriptions 
# Data folder + File name must be the path to dataset. 
file_name = 'junyi-dataset/junyi_question_text.txt'
df = pd.read_csv(os.path.join(input_folder, file_name),low_memory=False, sep = '#')

Peking Online Judge (POJ)

In [8]:
input_folder = '../input/'
# POJ with problems textual descriptions 
# Data folder + File name must be the path to dataset. 
file_name = 'poj-dataset/poj_question_text.txt'
df = pd.read_csv(os.path.join(input_folder, file_name),low_memory=False, sep = '\n', names=["data"])

# Texts preprocessing and cleaning:

In [None]:
eng_dict = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')

problem_ids, assistment_ids, bodies = df['problem_id'], df['assistment_id'], df['body']
texts=[]
texts_no_numbers=[]
texts_only_existing_words=[]

nltk.download('stopwords')
for body in bodies:
    words_set = set({})
    words_set_no_numbers = set({})
    words_set_only_existing_words = set({})
    text = str(body).replace(' ', '#').replace('/', '#slash#').replace('<', '#lessthan#').replace('>', '#morethan#').replace(",", "#comma#").replace(";", "#semicolon#").replace(".", "#point#").replace("?", "#questionmark#").replace("!", "exclamationpoint").replace("=", "#equal#").replace("\\", "#slash#").replace("%", "#percentage#").replace("\\t", "#").replace("\\n", "#").replace("\t", "#").replace("\n", "#").replace('\"', "#quotationmark#").replace("(", "#openroundbracket#").replace(")", "#closeroundbracket#").replace("[", "#opensquarebracket#").replace("]", "#closesquarebracket#").replace("_", "#underscore#").replace("&", "#ampersand#").replace("}", "#closebrace#").replace("{", "#openbrace#").replace("+", "#plus#").replace("-","#minus#").replace("*", "#multiplication#").replace("€","#euros#").replace("$","#dollar#").replace("^","#powerof#exponent#")
    text = str(text).split('#')
    for i in range(0, len(text)):
        text[i].lower()
        if eng_dict.spell(text[i]):
            words_set_only_existing_words.add(text[i])
        """if len(text[i])<20:
            words_set.add(text[i])
            if not (text[i].isdecimal()):
                result = ''.join(el for el in text[i] if not el.isdigit())
                words_set_no_numbers.add(result)"""
    """text = list(words_set)
    text_no_numbers = list(words_set_no_numbers)"""
    text_only_existing_words = list(words_set_only_existing_words)
    for i in stopwords.words('english'):
        i = i.lower()
        """if text.count(i)>0:
            text.remove(i)
        if text_no_numbers.count(i)>0:
            text_no_numbers.remove(i)"""
        if text_only_existing_words.count(i)>0:
            text_only_existing_words.remove(i)
            
    """texts.append(text)
    texts_no_numbers.append(text_no_numbers)"""
    texts_only_existing_words.append(text_only_existing_words)
print(texts[100:110])
print(texts_only_existing_words[100:110])


In [None]:
eng_dict = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')

problem_names, questions, question_descriptions = df['question_name'], df['chinese_question'], df['chinese_question_desc']
texts_only_existing_words=[]

nltk.download('stopwords')
for index in range(0, len(questions)):
    words_set = set({})
    text = str(questions[index]).replace(' ', '#').replace('/', '#slash#').replace('<', '#lessthan#').replace('>', '#morethan#').replace(",", "#comma#").replace(";", "#semicolon#").replace(".", "#point#").replace("?", "#questionmark#").replace("!", "exclamationpoint").replace("=", "#equal#").replace("\\", "#slash#").replace("%", "#percentage#").replace("\\t", "#").replace("\\n", "#").replace("\t", "#").replace("\n", "#").replace('\"', "#quotationmark#").replace("(", "#openroundbracket#").replace(")", "#closeroundbracket#").replace("[", "#opensquarebracket#").replace("]", "#closesquarebracket#").replace("_", "#underscore#").replace("&", "#ampersand#").replace("}", "#closebrace#").replace("{", "#openbrace#").replace("+", "#plus#").replace("-","#minus#").replace("*", "#multiplication#").replace("€","#euros#").replace("$","#dollar#").replace("^","#powerof#exponent#")
    text = str(text).split('#')
    text_desc = str(question_descriptions[index]).replace(' ', '#').replace('/', '#slash#').replace('<', '#lessthan#').replace('>', '#morethan#').replace(",", "#comma#").replace(";", "#semicolon#").replace(".", "#point#").replace("?", "#questionmark#").replace("!", "exclamationpoint").replace("=", "#equal#").replace("\\", "#slash#").replace("%", "#percentage#").replace("\\t", "#").replace("\\n", "#").replace("\t", "#").replace("\n", "#").replace('\"', "#quotationmark#").replace("(", "#openroundbracket#").replace(")", "#closeroundbracket#").replace("[", "#opensquarebracket#").replace("]", "#closesquarebracket#").replace("_", "#underscore#").replace("&", "#ampersand#").replace("}", "#closebrace#").replace("{", "#openbrace#").replace("+", "#plus#").replace("-","#minus#").replace("*", "#multiplication#").replace("€","#euros#").replace("$","#dollar#").replace("^","#powerof#exponent#")
    text_desc = str(text_desc).split('#')
    text = text
    text = list(set(text) | set(text_desc))
    for i in range(0, len(text)):
        text[i].lower()
        if eng_dict.spell(text[i]):
            words_set.add(text[i])
    text_only_existing_words = list(words_set)
    for i in stopwords.words('english'):
        i = i.lower()
        if text_only_existing_words.count(i)>0:
            text_only_existing_words.remove(i)
    if text_only_existing_words.count('TIMEOUT')>0:
        text_only_existing_words.remove('TIMEOUT')
    if text_only_existing_words.count('ISSUE')>0:
        text_only_existing_words.remove('ISSUE')
    if text_only_existing_words.count('underscore')>0:
        text_only_existing_words.remove('underscore')
    texts_only_existing_words.append(text_only_existing_words)
print(texts_only_existing_words)

# Calculate TF-IDF using libraries from scikit:

In [9]:
eng_dict = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
number_to_index =dict({})
questions = []
index = 0
questions.append([])
for row in df['data']:
    if '#' in row:
        array = row.split('#')
        if array[0].isdigit():
            questions.append([])
            index = index +1
            number = int(array[0])
            number_to_index[number] = index
            questions[index] = array[1]
            
    else:
        new = str(questions[index])+str(row)
        questions[index] = new
#print(phrases)
texts_only_existing_words=[]

nltk.download('stopwords')
for index in range(0, len(questions)):
    words_set = set({})
    text = str(questions[index]).replace(' ', '#').replace('/', '#slash#').replace('<', '#lessthan#').replace('>', '#morethan#').replace(",", "#comma#").replace(";", "#semicolon#").replace(".", "#point#").replace("?", "#questionmark#").replace("!", "exclamationpoint").replace("=", "#equal#").replace("\\", "#slash#").replace("%", "#percentage#").replace("\\t", "#").replace("\\n", "#").replace("\t", "#").replace("\n", "#").replace('\"', "#quotationmark#").replace("(", "#openroundbracket#").replace(")", "#closeroundbracket#").replace("[", "#opensquarebracket#").replace("]", "#closesquarebracket#").replace("_", "#underscore#").replace("&", "#ampersand#").replace("}", "#closebrace#").replace("{", "#openbrace#").replace("+", "#plus#").replace("-","#minus#").replace("*", "#multiplication#").replace("€","#euros#").replace("$","#dollar#").replace("^","#powerof#exponent#")
    text = str(text).split('#')
    for i in range(0, len(text)):
        text[i].lower()
        if eng_dict.spell(text[i]):
            words_set.add(text[i])
    text_only_existing_words = list(words_set)
    for i in stopwords.words('english'):
        i = i.lower()
        if text_only_existing_words.count(i)>0:
            text_only_existing_words.remove(i)
    if text_only_existing_words.count('TIMEOUT')>0:
        text_only_existing_words.remove('TIMEOUT')
    if text_only_existing_words.count('ISSUE')>0:
        text_only_existing_words.remove('ISSUE')
    if text_only_existing_words.count('underscore')>0:
        text_only_existing_words.remove('underscore')
    texts_only_existing_words.append(text_only_existing_words)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def identity_tokenizer(text):
    return text
tfidf_vectorizer_existing_words_only = []
tfidf_vectorizer_existing_words_only = TfidfVectorizer(
    analyzer='word',
    tokenizer=identity_tokenizer,
    preprocessor=identity_tokenizer,
    token_pattern=None,
    use_idf = True)
tfidf_vectorizer_vectors_existing_words_only = tfidf_vectorizer_existing_words_only.fit_transform(texts_only_existing_words)
df_tf_idf_existing_words_only = pd.DataFrame.sparse.from_spmatrix(tfidf_vectorizer_vectors_existing_words_only)
print(df_tf_idf_existing_words_only.shape)

"""tfIdfVectorizer=TfidfVectorizer(use_idf=True, stop_words= 'english')
tfIdf = tfIdfVectorizer.fit_transform(texts_no_split)
df_tf_idf = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df_tf_idf = df_tf_idf.sort_values('TF-IDF', ascending=False)
#df_tf_idf = df_tf_idf[df_tf_idf>0 and (not math.isnan(df_tf_idf))]
print (df_tf_idf.head(50))"""
#Now we change the name of the datasets available:
sparse_tf_idf = tfidf_vectorizer_vectors_existing_words_only
dataframe_tf_idf = df_tf_idf_existing_words_only

words_unique = tfidf_vectorizer_existing_words_only.get_feature_names()
#Save sparse matrix in current directory
data_folder = './'

sps.save_npz(os.path.join(data_folder, 'pro_words_existing_words_only.npz'), sparse_tf_idf)
#sps.save_npz(os.path.join(data_folder, 'pro_words_removed_digits.npz'), tfidf_vectorizer_vectors_no_numbers)

words_dict = dict({})
for i in range(0, len(words_unique)):
    words_dict[str(i)] = words_unique[i]
print(words_dict)

def write_txt(file, data):
    with open(file, 'w') as f:
        for dd in data:
            f.write(str(dd)+'\n')
                    
write_txt(os.path.join(data_folder, 'words_set.txt'), words_unique)

(905, 9513)


shape of pro_word matrix before escaping any special character: (179950, 82084)

In the end I considered that escaping a symbol is better than removing, so I escape all the symbols, including { } * + - _ too, infact now symbols and words are cleaner, despite number has remained the same:    (179950, 62198)

Eventually we can consider removing words with digits to reduce number to: (179950, 44642)   --> 18000 less

A good alternative is to use Hunspell library to check if the word exist, removing not existing ones. In this cas we reduce number of words to: (179950, 27742)   --> 17000 less  --> very good!!

If we consider only existing words and remove digits too we obtain:

# Available dataset with TF-IDF values:

In [11]:
pro_num = dataframe_tf_idf.shape[0]
words_num = dataframe_tf_idf.shape[1]
#print(words_unique) 
dataframe_tf_idf    #dense pandas dataframe
sparse_tf_idf       #sparse matrix


<905x9513 sparse matrix of type '<class 'numpy.float64'>'
	with 68994 stored elements in Compressed Sparse Row format>

# Calculate cosine similarity between questions from TF-IDF dataset

In [12]:
shrink = 10
topK = 100
normalize = True
similarity = "cosine"
similarity_matrix = Compute_Similarity(sparse_tf_idf.T, shrink=shrink, topK=topK, normalize=normalize, similarity = similarity).compute_similarity()
print(similarity_matrix[0])

sps.save_npz(os.path.join(data_folder, 'TF_IDF_pro_pro.npz'), similarity_matrix)



Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 905 ( 100 % ), 5277.35 column/sec, elapsed time 0.00 min
  (0, 219)	0.0043383874
  (0, 222)	0.009828101
  (0, 233)	0.0037593378
  (0, 278)	0.0065368856
  (0, 313)	0.008974723
  (0, 315)	0.008524718
  (0, 316)	0.014030246
  (0, 317)	0.012027885
  (0, 318)	0.008133279
  (0, 319)	0.004235424
  (0, 320)	0.012255359
  (0, 321)	0.01126293
  (0, 324)	0.03237601
  (0, 357)	0.00607282
  (0, 370)	0.0048625167
  (0, 376)	0.00737638
  (0, 408)	0.007335538
  (0, 409)	0.003706918
  (0, 425)	0.0032119956
  (0, 546)	0.005101904
  (0, 588)	0.005023346
  (0, 589)	0.003869648
  (0, 613)	0.0036262022
  (0, 640)	0.0047958866
  (0, 643)	0.0048505855
  (0, 697)	0.0069442517
  (0, 713)	0.0030633395
  (0, 771)	0.029231397
  (0, 773)	0.01187474
  (0, 775)	0.01068079
  (0, 843)	0.0055784094


In [None]:
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
def get_URM(path):
    process = psutil.Process(os.getpid())
    gc.enable()
    data = pd.read_csv(path, low_memory=False, encoding="ISO-8859-1", dtype={"user_id": int, "problem_id": int, "correct": float})
    user_list = data.user_id.to_list()
    problem_list = data.problem_id.to_list()
    y_list = data.correct.to_list()
    for n, i in enumerate(y_list):
        if i == 0.0:
            y_list[n] = -1.0
    del data
    print(process.memory_info().rss)
    URM_all = sps.coo_matrix((y_list, (user_list, problem_list)))
    URM_all = URM_all.tocsr()
    del y_list
    del user_list
    del problem_list
    return URM_all
data_path = '../input/skillbuilder-data-2009-2010/2012-2013-data-with-predictions-4-final.csv'
URM = get_URM(data_path)
print(URM)

In [None]:
similarity_matrix = sps.load_npz('../input/assesments-12-13-precessed-data/TF_IDF_pro_pro.npz')

In [None]:
sparse_cb_similarity_matrix = sps.load_npz('../input/assesments-12-13-precessed-data/pro_pro_.npz')
sparse_cb_similarity_matrix2 = sps.load_npz('../input/assesments-12-13-precessed-data/pro_pro_existing_words_only.npz')

#CBF.save_model(folder_path='./')


# **Evaluation of TF-IDF + cosine similarity**

In [13]:
def _compute_problem_score_from_pro_pro_matrix(sparse_similarity_matrix, user_profile_array, correct, target_problem):
        """
        
        """
        item_scores = sparse_similarity_matrix.tocsr()[user_profile_array, :].dot(sparse_similarity_matrix.tocsr().getrow(target_problem).transpose())
        item_scores = item_scores.transpose().todense().dot(correct)
        return item_scores



In [None]:
data = np.load('../input/assesments-12-13-precessed-data/2012-2013-data-with-predictions-4-final.csv.npz')
y, problems, real_lens = data['y'], data['problem'], data['real_len']
pro_num = data['problem_num']
corrects = np.where(y==-1.0, -100.0, y)
corrects = np.where(corrects==0.0, -1.0, corrects)
corrects = np.where(corrects ==-100.0, 0.0, corrects)
print(problems[0])
print(real_lens[0])

In [None]:
data = pd.read_csv('../input/junyi-dataset/junyi.csv', sep = '\n', names =['data'])
data = data['data']
print(data)
index = range(0, len(data)//4)
real_len_index = [el*4 for el in index]
real_lens = [int(data[x]) for x in real_len_index]
problem_index = [el*4+1 for el in index]
problem_data = [data[x].split(',') for x in problem_index]
corrects_index = [el*4+2 for el in index]
corrects_data = [data[x].split(',') for x in corrects_index]
labels =[]
predictions =[]
for problem, correct, real_len in list(zip(*(problem_data, corrects_data, real_lens))):
    correct2 = [float(x) for x in correct]
    problem2 = [int(x) for x in problem]
    problem = problem2
    correct = np.where(correct2==-1.0, -100.0, correct2)
    correct = np.where(correct==0.0, -1.0, correct)
    correct = np.where(correct ==-100.0, 0.0, correct)
    prediction = _compute_problem_score_from_pro_pro_matrix(similarity_matrix, problem[0:real_len-1], correct[0:real_len-1], problem[real_len-1])  
    if prediction >0:
        predictions.append(1)
    else:
        predictions.append(0)
    if correct[real_len-1] == 1.0:
        labels.append(1)
    else:
        labels.append(0)
    i+=1
    if i%1000==0:
        print(i)
print(predictions)

pro_num = len(problem)



In [31]:
data = pd.read_csv('../input/poj-dataset/poj_log.csv')
users = set(data['User'])
print(len(users))
real_lens = []
problems = []
corrects = []
for user, problem in data.groupby("User"):
    correct_answer = problem['Result']
    correct = []
    problem_df = problem["Problem"]
    problem_list = []
    k = 0
    for p, c in list(zip(*(problem_df, correct_answer))):
        if p in number_to_index:
            problem_list.append(number_to_index[p])
            if c == "Accepted":
                correct.append(1.0)
            else:
                correct.append(-1.0)
            k+=1
    if k>1:
        real_lens.append(k)
        problems.append(problem_list)
        corrects.append(correct)
print(problems[1])
print(corrects[1])
print(real_lens[1])

22916
[273, 273, 273, 273, 273, 273, 273, 273, 273, 219, 219]
[-1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0]
11


In [32]:
i=0
predictions =[]
labels = []
i=0
print(len(problems))
for problem, correct, real_len in list(zip(*(problems, corrects, real_lens))):
    prediction = _compute_problem_score_from_pro_pro_matrix(similarity_matrix, problem[0:real_len-1], correct[0:real_len-1], problem[real_len-1])  
    if prediction >0:
        predictions.append(1)
    else:
        predictions.append(0)
    if correct[real_len-1] == 1.0:
        labels.append(1)
    else:
        labels.append(0)
    i+=1
    if i%1000==0:
        print(i)
print(predictions)

12517
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
[1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 

In [37]:
def write_txt(file, data):
    with open(file, 'w') as file2:
        for d in data:
            file2.write(str(float(d)))
            file2.write(' ')
                    
write_txt(os.path.join(data_folder, 'predictions.txt'), predictions)

In [34]:
acc = accuracy_score(labels, predictions)
print(acc)

0.609411200766957
