In [1]:
import pandas as pd
from skimpy import skim
from scipy import sparse
import numpy as np
import json
import os

In [2]:
path = r'C:\Users\pcteste1\Downloads\2012-2013-data-with-predictions-4-final'
file = '2012-2013-data-with-predictions-4-final.csv'
file_path = os.path.join(path, file)

In [12]:
df = pd.read_csv(file_path)

In [12]:
df.shape

(6123270, 35)

In [15]:
skim(df)

In [29]:
df.iloc[5]['actions']

'--- \n- - start\n  - 1355364055251\n  - "697389"\n- - answer\n  - 12522\n  - true\n  - "1024"\n  - \n- - end\n'

In [17]:
df.columns

Index(['problem_log_id', 'skill', 'problem_id', 'user_id', 'assignment_id',
       'assistment_id', 'start_time', 'end_time', 'problem_type', 'original',
       'correct', 'bottom_hint', 'hint_count', 'actions', 'attempt_count',
       'ms_first_response', 'tutor_mode', 'sequence_id', 'student_class_id',
       'position', 'type', 'base_sequence_id', 'skill_id', 'teacher_id',
       'school_id', 'overlap_time', 'template_id', 'answer_id', 'answer_text',
       'first_action', 'problemlogid', 'Average_confidence(FRUSTRATED)',
       'Average_confidence(CONFUSED)', 'Average_confidence(CONCENTRATING)',
       'Average_confidence(BORED)'],
      dtype='object')

In [36]:
df['position']

0           4
1           5
2          58
3          21
4           3
           ..
6123265     1
6123266     1
6123267     1
6123268     1
6123269     1
Name: position, Length: 6123270, dtype: int64

In [37]:
# remove features witht high number of missing values
columns = ['student_class_id', 'user_id', 'problem_log_id', 'problem_id', 'assignment_id', 'actions', \
        'Average_confidence(FRUSTRATED)',
       'Average_confidence(CONFUSED)', 'Average_confidence(CONCENTRATING)',
       'Average_confidence(BORED)', 'problemlogid', 'skill', 'assistment_id', 'student_class_id', 
       'position', 'base_sequence_id', 'skill_id', 'teacher_id', 'template_id', 'answer_id',
       'school_id']

In [38]:
df = df.drop(columns, axis='columns')

In [39]:
df.columns

Index(['start_time', 'end_time', 'problem_type', 'original', 'correct',
       'bottom_hint', 'hint_count', 'attempt_count', 'ms_first_response',
       'tutor_mode', 'sequence_id', 'type', 'overlap_time', 'answer_text',
       'first_action'],
      dtype='object')

In [40]:
df.shape

(6123270, 15)

In [5]:
def prepare_assistments12(min_interactions_per_user, remove_nan_skills, verbose):
	"""Preprocess ASSISTments 2012-2013 dataset.

	Arguments:
	min_interactions_per_user -- minimum number of interactions per student
	remove_nan_skills -- if True, remove interactions with no skill tag

	Outputs:
	df -- preprocessed ASSISTments dataset (pandas DataFrame)
	Q_mat -- corresponding q-matrix (item-skill relationships sparse array)
	"""
	df = pd.read_csv(file_path)
	if verbose:
		initial_shape = df.shape[0]
		print("Opened ASSISTments 2012 data. Output: {} samples.".format(initial_shape))
	
	df["timestamp"] = df["start_time"]
	df["timestamp"] = pd.to_datetime(df["timestamp"], format='mixed', dayfirst=True)
	df["timestamp"] = df["timestamp"] - df["timestamp"].min()
	df["timestamp"] = df["timestamp"].apply(lambda x: x.total_seconds()).astype(np.int64)
	#df.sort_values(by="timestamp", inplace=True)
	#df.reset_index(inplace=True, drop=True)
	if remove_nan_skills:
		df = df[~df["skill_id"].isnull()]
		if verbose:
			print("Removed {} samples with NaN skills.".format(df.shape[0]-initial_shape))
			initial_shape = df.shape[0]
	else:
		df.loc[df["skill_id"].isnull(), "skill_id"] = -1

	df = df[df.correct.isin([0,1])] # Remove potential continuous outcomes
	if verbose:
		print("Removed {} samples with non-binary outcomes.".format(df.shape[0]-initial_shape))
		initial_shape = df.shape[0]
	df['correct'] = df['correct'].astype(np.int32) # Cast outcome as int32

	df = df.groupby("user_id").filter(lambda x: len(x) >= min_interactions_per_user)
	if verbose:
		print(f'Removed {df.shape[0]-initial_shape} samples (users with less than {min_interactions_per_user} interactions)')
		initial_shape = df.shape[0]

	df["user_id"] = np.unique(df["user_id"], return_inverse=True)[1]
	df["item_id"] = np.unique(df["problem_id"], return_inverse=True)[1]
	df["skill_id"] = np.unique(df["skill_id"], return_inverse=True)[1]
	
	#df.reset_index(inplace=True, drop=True) # Add unique identifier of the row
	#df["inter_id"] = df.index

	# Build Q-matrix
	Q_mat = np.zeros((df["item_id"].nunique(), df["skill_id"].nunique()))
	item_skill = np.array(df[["item_id", "skill_id"]])
	for i in range(len(item_skill)):
		Q_mat[item_skill[i,0],item_skill[i,1]] = 1
	if verbose:
		print("Computed q-matrix. Shape: {}.".format(Q_mat.shape))

	#df = df[['user_id', 'item_id', 'timestamp', 'correct', "inter_id"]]
	df = df[['user_id', 'item_id', 'timestamp', 'correct']]
	# Remove potential duplicates
	df.drop_duplicates(inplace=True)
	if verbose:
		print("Removed {} duplicated samples.".format(df.shape[0] - initial_shape))
		initial_shape = df.shape[0]

	df.sort_values(by="timestamp", inplace=True)
	df.reset_index(inplace=True, drop=True)
	print("Data preprocessing done. Final output: {} samples.".format((df.shape[0])))
	# Save data
	sparse.save_npz("data/assistments12/q_mat.npz", sparse.csr_matrix(Q_mat))
	df.to_csv("data/assistments12/preprocessed_data.csv", index=False)

	with open('data/assistments12/config.json', 'w') as f:
		f.write(json.dumps({
			'n_users': df.user_id.nunique(),
			'n_items': df.item_id.nunique(),
			'n_skills': Q_mat.shape[1]
			}, indent=4))

	return df, Q_mat

In [13]:
df.iloc[696]['start_time']

'2012-09-03 14:18:36.278543'

In [6]:
df, Q_mat = prepare_assistments12(min_interactions_per_user=10,
										  remove_nan_skills=True,
										  verbose=True)

Opened ASSISTments 2012 data. Output: 6123270 samples.
Removed -3411457 samples with NaN skills.
Removed -211 samples with non-binary outcomes.
Removed -29391 samples (users with less than 10 interactions)
Computed q-matrix. Shape: (52850, 265).
Removed 0 duplicated samples.
Data preprocessing done. Final output: 2682211 samples.


FileNotFoundError: [Errno 2] No such file or directory: 'data/assistments12/q_mat.npz'