In [1]:
import os

import dill as pkl
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

## Combine EdNet KT1 user files into single csv

In [12]:
BASE_PATH = "/shared/new_am_data/ednet_kt1"
users = os.listdir(BASE_PATH)
len(users)

784309

In [19]:
user_csvs = []
for user_file in tqdm(users):
    uid = user_file.split("u")[1].split(".")[0]
    user_csv = pd.read_csv(
        f"{BASE_PATH}/{user_file}",
        header=0,
        names=[
            "timestamp",
            "question_id",
            "user_answer",
            "elapsed_time",
        ],
        usecols=[0, 2, 3, 4,],
    )
    user_csv.insert(loc=0, column="student_id", value=[int(uid)] * len(user_csv))
    user_csvs.append(user_csv)

HBox(children=(IntProgress(value=0, max=784309), HTML(value='')))




In [23]:
concat_data = pd.concat(user_csvs).reset_index(drop=True)

In [24]:
concat_data

Unnamed: 0,student_id,timestamp,question_id,user_answer,elapsed_time
0,291146,1540387236178,q6271,b,13000
1,291146,1540387250432,q6469,a,11000
2,291146,1540387276279,q4771,d,24000
3,291146,1540387310819,q6722,a,33000
4,291146,1540387327374,q6731,a,14000
5,291146,1540387370881,q8695,c,42000
6,291146,1540387403303,q5805,a,30000
7,291146,1540387455910,q4025,a,51000
8,526792,1551936109338,q4482,a,24000
9,526792,1551936146150,q6263,b,32000


In [47]:
questions = pd.read_csv(f"/shared/new_am_data/ednet_contents/questions.csv", usecols=[0, 3, 4, 5])
questions

Unnamed: 0,question_id,correct_answer,part,tags
0,q1,b,1,1;2;179;181
1,q2,a,1,15;2;182
2,q3,b,1,14;2;179;183
3,q4,b,1,9;2;179;184
4,q5,c,1,8;2;179;181
5,q6,d,1,9;2;179;182
6,q7,d,1,11;7;179;183
7,q8,b,1,20;21;179;184
8,q9,c,1,13;2;179;183
9,q10,c,1,17;7;182


In [48]:
qid2ans = questions.set_index("question_id").correct_answer.to_dict()
qid2part = questions.set_index("question_id").part.to_dict()
qid2tags = questions.set_index("question_id").tags.to_dict()

In [49]:
concat_data["correct_answer"] = concat_data.question_id.map(qid2ans)
concat_data["part"] = concat_data.question_id.map(qid2part)
concat_data["tags"] = concat_data.question_id.map(qid2tags)

In [45]:
col = concat_data.pop("elapsed_time")
concat_data.insert(6, col.name, col)

In [50]:
concat_data

Unnamed: 0,student_id,timestamp,question_id,user_answer,correct_answer,part,elapsed_time,tags
0,291146,1540387236178,q6271,b,b,5,13000,74
1,291146,1540387250432,q6469,a,a,5,11000,81
2,291146,1540387276279,q4771,d,d,5,24000,119
3,291146,1540387310819,q6722,a,a,5,33000,77
4,291146,1540387327374,q6731,a,c,5,14000,77
5,291146,1540387370881,q8695,c,d,5,42000,119
6,291146,1540387403303,q5805,a,d,5,30000,74
7,291146,1540387455910,q4025,a,d,5,51000,76
8,526792,1551936109338,q4482,a,a,5,24000,92
9,526792,1551936146150,q6263,b,c,5,32000,71


In [26]:
time_limits = pd.read_csv("am_v2/tmp/time_limits.csv")
time_limits

Unnamed: 0,question_id,eq_id,part_number,question_count,time_limit_in_ms,audio_duration_in_ms
0,1,p1-121,1,1,26076.734694,18076.734694
1,2,p1-122,1,1,26599.183673,18599.183673
2,3,p1-123,1,1,28192.653061,20192.653061
3,4,p1-124,1,1,27069.387755,19069.387755
4,5,p1-125,1,1,27513.469388,19513.469388
5,6,p1-126,1,1,28166.530612,20166.530612
6,7,p1-127,1,1,29681.632653,21681.632653
7,8,p1-128,1,1,28166.530612,20166.530612
8,9,p1-129,1,1,30099.591837,22099.591837
9,10,p1-130,1,1,29002.448980,21002.448980


In [54]:
time_limits.question_id  = "q" + time_limits.question_id.astype(str)

In [55]:
time_limits

Unnamed: 0,question_id,eq_id,part_number,question_count,time_limit_in_ms,audio_duration_in_ms
0,q1,p1-121,1,1,26076.734694,18076.734694
1,q2,p1-122,1,1,26599.183673,18599.183673
2,q3,p1-123,1,1,28192.653061,20192.653061
3,q4,p1-124,1,1,27069.387755,19069.387755
4,q5,p1-125,1,1,27513.469388,19513.469388
5,q6,p1-126,1,1,28166.530612,20166.530612
6,q7,p1-127,1,1,29681.632653,21681.632653
7,q8,p1-128,1,1,28166.530612,20166.530612
8,q9,p1-129,1,1,30099.591837,22099.591837
9,q10,p1-130,1,1,29002.448980,21002.448980


In [56]:
qid2limit = time_limits.set_index("question_id").time_limit_in_ms.astype(int).to_dict()

In [68]:
avg_times = time_limits.groupby("part_number").mean().astype(int).time_limit_in_ms.to_dict()
qid_diff = set(qid2part.keys()) - set(time_limits.question_id.unique())
qid2avg = {qid: avg_times[qid2part[qid]] for qid in qid_diff}
qid2limit.update(qid2avg)

In [71]:
concat_data["time_limit_in_ms"] = concat_data.question_id.map(qid2limit)

In [72]:
concat_data

Unnamed: 0,student_id,timestamp,question_id,user_answer,correct_answer,part,elapsed_time,tags,time_limit_in_ms
0,291146,1540387236178,q6271,b,b,5,13000,74,25000
1,291146,1540387250432,q6469,a,a,5,11000,81,25000
2,291146,1540387276279,q4771,d,d,5,24000,119,25000
3,291146,1540387310819,q6722,a,a,5,33000,77,25000
4,291146,1540387327374,q6731,a,c,5,14000,77,25000
5,291146,1540387370881,q8695,c,d,5,42000,119,25000
6,291146,1540387403303,q5805,a,d,5,30000,74,25000
7,291146,1540387455910,q4025,a,d,5,51000,76,25000
8,526792,1551936109338,q4482,a,a,5,24000,92,25000
9,526792,1551936146150,q6263,b,c,5,32000,71,25000


In [74]:
# col = concat_data.pop("tags")
# concat_data.insert(8, col.name, col)
concat_data.rename({
#     "timestamp": "start_time", 
    "question_id": "content_id"
#     "elapsed_time": "elapsed_time_in_ms",
},axis='columns',inplace=True)
concat_data

Unnamed: 0,student_id,start_time,content_id,user_answer,correct_answer,part,elapsed_time_in_ms,time_limit_in_ms,tags
0,291146,1540387236178,q6271,b,b,5,13000,25000,74
1,291146,1540387250432,q6469,a,a,5,11000,25000,81
2,291146,1540387276279,q4771,d,d,5,24000,25000,119
3,291146,1540387310819,q6722,a,a,5,33000,25000,77
4,291146,1540387327374,q6731,a,c,5,14000,25000,77
5,291146,1540387370881,q8695,c,d,5,42000,25000,119
6,291146,1540387403303,q5805,a,d,5,30000,25000,74
7,291146,1540387455910,q4025,a,d,5,51000,25000,76
8,526792,1551936109338,q4482,a,a,5,24000,25000,92
9,526792,1551936146150,q6263,b,c,5,32000,25000,71


In [80]:
concat_data.content_id = concat_data.content_id.str.replace('q','')

In [81]:
concat_data

Unnamed: 0,student_id,start_time,content_id,user_answer,correct_answer,part,elapsed_time_in_ms,time_limit_in_ms,tags
0,291146,1540387236178,6271,b,b,5,13000,25000,74
1,291146,1540387250432,6469,a,a,5,11000,25000,81
2,291146,1540387276279,4771,d,d,5,24000,25000,119
3,291146,1540387310819,6722,a,a,5,33000,25000,77
4,291146,1540387327374,6731,a,c,5,14000,25000,77
5,291146,1540387370881,8695,c,d,5,42000,25000,119
6,291146,1540387403303,5805,a,d,5,30000,25000,74
7,291146,1540387455910,4025,a,d,5,51000,25000,76
8,526792,1551936109338,4482,a,a,5,24000,25000,92
9,526792,1551936146150,6263,b,c,5,32000,25000,71


In [83]:
concat_data.content_id = concat_data.content_id.astype(int)

In [85]:
concat_data.to_csv(f"/shared/new_am_data/ednet_kt1.csv", index=False)

## Create Mapping for EdNet Contents

In [97]:
questions = pd.read_csv(f"/shared/new_am_data/ednet_contents/questions.csv", usecols=[0])
mapping = np.arange(1, len(questions)+1)
questions["mapping"] = mapping
questions.question_id = questions.question_id.str.replace('q','')
questions.to_csv("am_v2/load/ednet_content_mapping.csv", index=False, header=None)
questions

Unnamed: 0,question_id,mapping
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7
7,8,8
8,9,9
9,10,10


## Split train/dev/test for EdNet users

In [58]:
max([len(ts.split(";")) for ts in ednet_kt1.tags.values])

7

In [13]:
(ednet_kt1.sample(100).elapsed_time_in_ms.values / 1000).tolist()

[14.0,
 17.0,
 17.333,
 28.0,
 21.0,
 25.0,
 21.0,
 12.0,
 28.0,
 8.0,
 10.0,
 52.0,
 26.667,
 3.0,
 24.333,
 26.0,
 19.0,
 39.0,
 58.0,
 35.333,
 19.0,
 24.0,
 12.0,
 26.0,
 22.0,
 22.0,
 109.333,
 25.666,
 32.0,
 13.0,
 3.0,
 9.0,
 20.0,
 39.0,
 9.0,
 14.0,
 49.0,
 25.0,
 14.0,
 17.0,
 24.0,
 30.0,
 19.0,
 22.0,
 17.0,
 17.0,
 59.6,
 59.8,
 12.0,
 24.0,
 30.0,
 12.0,
 30.0,
 8.666,
 9.0,
 15.0,
 20.666,
 22.0,
 21.0,
 14.0,
 87.75,
 7.0,
 15.0,
 4.0,
 46.6,
 21.0,
 6.0,
 52.0,
 51.0,
 20.0,
 16.0,
 6.0,
 31.0,
 23.0,
 21.0,
 57.75,
 12.0,
 25.333,
 25.0,
 109.5,
 35.333,
 14.0,
 33.0,
 47.0,
 4.666,
 7.0,
 11.0,
 11.0,
 11.5,
 19.0,
 49.5,
 27.666,
 48.0,
 20.0,
 19.0,
 52.0,
 26.5,
 336.0,
 42.2,
 3.0]

In [118]:
tags = [
    list(map(int, ts.split(";")))
    for ts in ednet_kt1.head(1000).tags.values
]
idx = np.random.randint(0, [len(ts) for ts in tags])
tags = np.array([ts + [0]*(7-len(ts)) for ts in tags])

In [119]:
tags[np.arange(30)]

array([[ 74,   0,   0,   0,   0,   0,   0],
       [ 81,   0,   0,   0,   0,   0,   0],
       [119,   0,   0,   0,   0,   0,   0],
       [ 77,   0,   0,   0,   0,   0,   0],
       [ 77,   0,   0,   0,   0,   0,   0],
       [119,   0,   0,   0,   0,   0,   0],
       [ 74,   0,   0,   0,   0,   0,   0],
       [ 76,   0,   0,   0,   0,   0,   0],
       [ 92,   0,   0,   0,   0,   0,   0],
       [ 71,   0,   0,   0,   0,   0,   0],
       [ 76,   0,   0,   0,   0,   0,   0],
       [ 85,   0,   0,   0,   0,   0,   0],
       [ 83,   0,   0,   0,   0,   0,   0],
       [ 78,   0,   0,   0,   0,   0,   0],
       [ 76,   0,   0,   0,   0,   0,   0],
       [128,   0,   0,   0,   0,   0,   0],
       [ 89,   0,   0,   0,   0,   0,   0],
       [107,   0,   0,   0,   0,   0,   0],
       [ 78,   0,   0,   0,   0,   0,   0],
       [120,   0,   0,   0,   0,   0,   0],
       [ 93,   0,   0,   0,   0,   0,   0],
       [ 88,   0,   0,   0,   0,   0,   0],
       [ 85,   0,   0,   0,   0,

In [120]:
tags[np.arange(1000), idx][:30]

array([ 74,  81, 119,  77,  77, 119,  74,  76,  92,  71,  76,  85,  83,
        78,  76, 128,  89, 107,  78, 120,  93,  88,  85, 100, 183, 184,
       182, 183, 179, 185])

In [24]:
ednet_kt1 = pd.read_csv("/shared/new_am_data/ednet_kt1.csv")

In [83]:
uids = ednet_kt1.student_id.unique()

In [84]:
print(len(uids))

784227


In [85]:
train_users, test_users = train_test_split(list(uids), test_size=0.05)
val_users, test_users = train_test_split(list(test_users), test_size=0.5)
print(len(train_users), len(val_users), len(test_users))

745015 19606 19606


In [86]:
with open("am_v2/load/ednet_user_split.pkl", "wb") as output_file:
    pkl.dump([train_users, val_users, test_users], output_file)

In [2]:
torch.Tensor(10),

(tensor([3.3215e-24, 4.5579e-41, 0.0000e+00, 0.0000e+00, 7.0625e-43, 0.0000e+00,
         2.8026e-45, 0.0000e+00, 3.3215e-24, 4.5579e-41]),)

In [123]:
import torch
import numpy as np

In [128]:
torch.tensor(3)

tensor(3)

In [129]:
torch.tensor([3])

tensor([3])

In [130]:
torch.Tensor(3.)

TypeError: new(): data must be a sequence (got float)

In [131]:
torch.Tensor(3)

tensor([ 1.2948e-10,  4.5580e-41, -1.4292e+16])

In [None]:
torch.Tensor()

In [137]:
a = [torch.Tensor([True,True]), torch.Tensor([False, True]), torch.Tensor([False, False])]
a = torch.stack(a)
a

tensor([[1., 1.],
        [0., 1.],
        [0., 0.]])

In [132]:
class DataSet(torch.utils.data.Dataset):
    def __len__(self):
        return 10000
    def __getitem__(self, index):
        return(
#             torch.Tensor([1,2,3,4,5]),
#             torch.Tensor([6,7,8,9,10]),
            torch.tensor([1,2,3,4,5]),
            torch.tensor([True, False, True, False, True]),
            2,
            True,
        )
    
x = torch.utils.data.DataLoader(
        dataset=DataSet(),
        shuffle=False,
        batch_size=4,
#         collate_fn=collate_fn
    )
for b in x:
    print(b)
    print()
    print(b[0])
    print(b[1])
    print(b[2])
    print(b[3])
    print(len(b))
    break


[tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]]), tensor([[ True, False,  True, False,  True],
        [ True, False,  True, False,  True],
        [ True, False,  True, False,  True],
        [ True, False,  True, False,  True]]), tensor([2, 2, 2, 2]), tensor([True, True, True, True])]

tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])
tensor([[ True, False,  True, False,  True],
        [ True, False,  True, False,  True],
        [ True, False,  True, False,  True],
        [ True, False,  True, False,  True]])
tensor([2, 2, 2, 2])
tensor([True, True, True, True])
4


In [135]:
class DataSet(torch.utils.data.Dataset):
    def __len__(self):
        return 10000
    def __getitem__(self, index):
        return(
#             torch.Tensor([1,2,3,4,5]),
#             torch.Tensor([6,7,8,9,10]),
            [1,2,3,4,5],
            [True, False, True, False, True],
            5,
            3,
        )
    
def collate_fn(batch):
    a, b, c, d = zip(*batch)
    u = np.array(c) > 1
    a = torch.stack(list(torch.Tensor(a)[u]))
    b = torch.stack(list(torch.Tensor(b)[u]))
    d = torch.stack(list(torch.Tensor(d)[u]))
#     a = print(torch.stack(list(np.array(a, dtype=object)[u])))
#     b = print(torch.stack(list(np.array(b, dtype=object)[u])))
    return a, b, d

x = torch.utils.data.DataLoader(
        dataset=DataSet(),
        shuffle=False,
        batch_size=5,
        collate_fn=collate_fn
    )
for b in x:
    print(b[0])
    print(b[1])
    print(b[2])
    break

tensor([[1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.]])
tensor([[1., 0., 1., 0., 1.],
        [1., 0., 1., 0., 1.],
        [1., 0., 1., 0., 1.],
        [1., 0., 1., 0., 1.],
        [1., 0., 1., 0., 1.]])
tensor([3., 3., 3., 3., 3.])


In [107]:
torch.__version__

'1.5.1'

In [106]:
torch.utils.data._utils.collate.default_collate

<function torch.utils.data._utils.collate.default_collate(batch)>

## Map EdNet User IDs to Vida User IDs

In [65]:
ednet_kt1 = pd.read_csv("/shared/new_am_data/ednet_kt1.csv")
ednet_kt1

Unnamed: 0,student_id,start_time,content_id,user_answer,correct_answer,part,elapsed_time_in_ms,time_limit_in_ms,tags
0,291146,1540387236178,6271,b,b,5,13000,25000,74
1,291146,1540387250432,6469,a,a,5,11000,25000,81
2,291146,1540387276279,4771,d,d,5,24000,25000,119
3,291146,1540387310819,6722,a,a,5,33000,25000,77
4,291146,1540387327374,6731,a,c,5,14000,25000,77
...,...,...,...,...,...,...,...,...,...
95293921,547317,1553949060051,987,a,a,2,9000,24613,27;24;40;181;183
95293922,547317,1553949082450,329,a,a,2,17000,24248,30;179;38;39;183;185
95293923,547317,1553949105804,309,a,c,2,17000,23856,28;29;183;184
95293924,547317,1553949137381,661,c,c,2,20000,23777,30;28;49;181;182


In [66]:
pk2uid = pd.read_csv("/shared/new_am_data/students_final_ver2.csv").set_index("pk").id.to_dict()
print(len(pk2uid))
pk2uid

1932104


{'1a455200-eea6-4286-beb0-243c419f7c7e': 1689134,
 'c4fbc4fe-5693-4979-b1ad-761632f83a5f': 1908674,
 '3efc21b1-0ebd-479e-a881-dd8bfa02f1fc': 1809050,
 '8cfba28b-e689-4e43-9cbe-27753f5f8ca4': 1815348,
 'ccfdeb8b-1404-4ef4-8c99-2b6e6169358e': 1526442,
 '79dcc26f-d7f4-4527-8dcc-7528c30bcb5d': 1613137,
 'e2c87a9d-5095-409f-8d68-9d721d84d76b': 1581623,
 '97ff3cc6-ac16-414f-880e-1eaaab6510c6': 1547243,
 '4a914f31-ff5f-4232-830b-a8a9b2605c47': 1573670,
 '523a872d-9f4c-4763-892b-e33d64cfb7e1': 1599528,
 '56dede28-adcd-46dd-b426-f8c7fe1144a2': 1681653,
 '49dd2c77-7df3-47da-9b28-9c36d0bd4ca4': 1700059,
 '84bcbfb3-3f66-42ed-b75f-42d75137da98': 1790835,
 '6b230653-ddcb-49c3-b0c4-84e5bc370a6a': 1651831,
 '70372d1f-045c-4138-b43b-1689d94adc24': 1855149,
 '3cf3bf2e-e033-4d73-96ad-21915c005ff2': 1526443,
 '994bae9e-a6fb-48a9-99a3-f906cc3f0150': 1618249,
 '4b1f5adc-4613-11ea-ab8c-9a282af7e099': 1533022,
 'f2c954fc-0ff5-4d42-b898-d55028bf77c4': 1526444,
 '23d7aa9c-8323-4a4d-bc83-446ff684468c': 1679224,


In [67]:
pk2eid = pd.read_csv("/shared/new_am_data/pk_table.csv")
pk2eid.uid = pk2eid.uid.str.replace('u', '').astype(int)
pk2eid

Unnamed: 0,pk,uid
0,8c76a82d-d419-40bf-a58b-4a9c70a29dd3,1
1,5c111fb2-9ede-11e7-93ea-5e144353e7c1,2
2,2c6ba86a-4bfe-4b70-b5f0-9584d3498111,3
3,64a6a41e-f41f-46e0-af2e-f42efbe8eb70,4
4,a09e1a60-e300-4a1a-b592-cb69318c246d,5
...,...,...
1919574,57eac089-c6d0-4b32-9bec-630f7db7149f,1919575
1919575,5924fc98-a94e-4a10-b056-b918ef160113,1919576
1919576,3f1feeba-dca5-4e4e-bc32-700006f004c5,1919577
1919577,24bc50bc-9c9e-4ed5-8707-1c533d5d759c,1919578


In [68]:
pk2eid = pk2eid.set_index("pk").uid.to_dict()
print(len(pk2eid))
pk2eid

1919579


{'8c76a82d-d419-40bf-a58b-4a9c70a29dd3': 1,
 '5c111fb2-9ede-11e7-93ea-5e144353e7c1': 2,
 '2c6ba86a-4bfe-4b70-b5f0-9584d3498111': 3,
 '64a6a41e-f41f-46e0-af2e-f42efbe8eb70': 4,
 'a09e1a60-e300-4a1a-b592-cb69318c246d': 5,
 '3b95e08c-416e-4a34-a5c1-09912fe69491': 6,
 '70157aa0-22f7-45e8-9778-f321f2b0a1ef': 7,
 'f132f138-afd5-4c0f-869f-687f69079c20': 8,
 'e030f9b4-d02e-48ef-9fd9-9d0782981b1b': 9,
 'c2a35b55-2b52-4740-ac8f-ac0cd77e902b': 10,
 '865b2bc1-a8f5-4130-9ed9-36177ee4c309': 11,
 '5cf25920-47c0-4365-a80b-fbaf5fb8168a': 12,
 'b127b6e7-3ada-43e6-a73e-71aa3a16e021': 13,
 '51b57f87-57fa-4f2a-9f65-3c5f171d15e0': 14,
 'd0122e16-6dfa-4ced-bd2e-c6c3ca922c21': 15,
 'bc687c16-2756-4505-bf59-b59d62a9d51e': 16,
 '8588b218-022c-47f2-9a47-6c17cec8818a': 17,
 '8588b218-022c-47f2-9a47-6c17cec88186': 18,
 'c8653650-8e4a-11e9-97e4-06a161f18377': 19,
 'b8870806-55b5-4c73-9406-fd636a190c12': 20,
 '4c81097a-d819-11e8-a472-76596ef8492b': 21,
 '38d99ae0-89f8-476d-a44b-3aebbfc5e2f5': 22,
 'e7d56569-2bc3-4ea

In [69]:
len(pk2uid)

1932104

In [70]:
pk2uid['8588b218-022c-47f2-9a47-6c17cec8818a']

KeyError: '8588b218-022c-47f2-9a47-6c17cec8818a'

In [71]:
eid2pk = {eid: pk for pk, eid in tqdm(pk2eid.items())}
print(len(eid2pk))
eid2pk

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=1919579.0), HTML(value='')))


1919579


{1: '8c76a82d-d419-40bf-a58b-4a9c70a29dd3',
 2: '5c111fb2-9ede-11e7-93ea-5e144353e7c1',
 3: '2c6ba86a-4bfe-4b70-b5f0-9584d3498111',
 4: '64a6a41e-f41f-46e0-af2e-f42efbe8eb70',
 5: 'a09e1a60-e300-4a1a-b592-cb69318c246d',
 6: '3b95e08c-416e-4a34-a5c1-09912fe69491',
 7: '70157aa0-22f7-45e8-9778-f321f2b0a1ef',
 8: 'f132f138-afd5-4c0f-869f-687f69079c20',
 9: 'e030f9b4-d02e-48ef-9fd9-9d0782981b1b',
 10: 'c2a35b55-2b52-4740-ac8f-ac0cd77e902b',
 11: '865b2bc1-a8f5-4130-9ed9-36177ee4c309',
 12: '5cf25920-47c0-4365-a80b-fbaf5fb8168a',
 13: 'b127b6e7-3ada-43e6-a73e-71aa3a16e021',
 14: '51b57f87-57fa-4f2a-9f65-3c5f171d15e0',
 15: 'd0122e16-6dfa-4ced-bd2e-c6c3ca922c21',
 16: 'bc687c16-2756-4505-bf59-b59d62a9d51e',
 17: '8588b218-022c-47f2-9a47-6c17cec8818a',
 18: '8588b218-022c-47f2-9a47-6c17cec88186',
 19: 'c8653650-8e4a-11e9-97e4-06a161f18377',
 20: 'b8870806-55b5-4c73-9406-fd636a190c12',
 21: '4c81097a-d819-11e8-a472-76596ef8492b',
 22: '38d99ae0-89f8-476d-a44b-3aebbfc5e2f5',
 23: 'e7d56569-2bc3

In [72]:
pkeid, pkuid = set(pk2eid.keys()), set(pk2uid.keys())

In [73]:
print("in EID, but NOT in UID:", len(pkeid-pkuid))
print("in UID, but NOT in EID:", len(pkuid-pkeid))
print("Length Difference:", len(pkuid)-len(pkeid))

in EID, but NOT in UID: 85
in UID, but NOT in EID: 12610
Length Difference: 12525


In [74]:
eid2uid = {eid: pk2uid[pk] for pk, eid in tqdm(pk2eid.items()) if pk in pkuid}
eid2uid

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=1919579.0), HTML(value='')))




{1: 1043805,
 2: 35257,
 3: 1091080,
 4: 1095233,
 5: 543341,
 6: 1097777,
 7: 755397,
 8: 1103352,
 9: 987771,
 10: 1050620,
 11: 1093065,
 12: 1068809,
 13: 1099884,
 14: 1092460,
 15: 520906,
 16: 927606,
 19: 896086,
 20: 614879,
 21: 371988,
 22: 896483,
 27: 369939,
 37: 508961,
 38: 27124,
 40: 1375,
 43: 697491,
 45: 914232,
 47: 918847,
 48: 918890,
 49: 897151,
 50: 385179,
 51: 899337,
 52: 919014,
 53: 918986,
 54: 919125,
 55: 307730,
 56: 919155,
 57: 261634,
 58: 172628,
 59: 919258,
 60: 919290,
 61: 919286,
 62: 919369,
 63: 919410,
 64: 919413,
 65: 919361,
 66: 919510,
 67: 595227,
 68: 919483,
 69: 919516,
 70: 919545,
 71: 919557,
 72: 919529,
 73: 919533,
 74: 919255,
 75: 919649,
 76: 914324,
 77: 919667,
 78: 919679,
 79: 919655,
 80: 647016,
 81: 656978,
 82: 791780,
 83: 919552,
 84: 551871,
 85: 919780,
 86: 919430,
 87: 919794,
 88: 919921,
 89: 919924,
 90: 919938,
 91: 919957,
 92: 919979,
 93: 919987,
 94: 920010,
 95: 91167,
 96: 920002,
 97: 919374,
 98

In [75]:
ednet_kt1.student_id = ednet_kt1.student_id.map(eid2uid)
ednet_kt1.student_id.dropna().astype(int)

0           369842
1           369842
2           369842
3           369842
4           369842
             ...  
95293921    771546
95293922    771546
95293923    771546
95293924    771546
95293925    771546
Name: student_id, Length: 95290122, dtype: int64

In [79]:
ednet_kt1 = ednet_kt1.dropna(subset=["student_id"])
ednet_kt1.student_id = ednet_kt1.student_id.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [80]:
ednet_kt1

Unnamed: 0,student_id,start_time,content_id,user_answer,correct_answer,part,elapsed_time_in_ms,time_limit_in_ms,tags
0,369842,1540387236178,6271,b,b,5,13000,25000,74
1,369842,1540387250432,6469,a,a,5,11000,25000,81
2,369842,1540387276279,4771,d,d,5,24000,25000,119
3,369842,1540387310819,6722,a,a,5,33000,25000,77
4,369842,1540387327374,6731,a,c,5,14000,25000,77
...,...,...,...,...,...,...,...,...,...
95293921,771546,1553949060051,987,a,a,2,9000,24613,27;24;40;181;183
95293922,771546,1553949082450,329,a,a,2,17000,24248,30;179;38;39;183;185
95293923,771546,1553949105804,309,a,c,2,17000,23856,28;29;183;184
95293924,771546,1553949137381,661,c,c,2,20000,23777,30;28;49;181;182


In [81]:
95293926 - 95290122

3804

In [82]:
ednet_kt1.to_csv(f"/shared/new_am_data/ednet_kt1.csv", index=False)