/
logfileMOOC.py
622 lines (543 loc) · 32.1 KB
/
logfileMOOC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
#!/usr/bin/env python
__author__ = 'IH'
__project__ = 'processMOOC'
import copy
import datetime
import random
import re
from collections import defaultdict

import utilsMOOC as utils
from QHInstance import QHInstance
from topicModelLDA import LDAtopicModel as ldat
# LOGFILE NAMES
FILENAME_HELPERLOG = utils.FILENAME_HELPERLOG
FILENAME_SELECTIONLOG = utils.FILENAME_SELECTIONLOG
FILENAME_VOTELOG = utils.FILENAME_VOTELOG
FILENAME_USERLOG = utils.FILENAME_USERLOG
FILENAME_CLICKLOG = utils.FILENAME_CLICKLOG
EXTENSION_LOGFILE = utils.EXTENSION_LOGFILE
EXTENSION_PROCESSED = utils.EXTENSION_PROCESSED
# TEXT-RELATED VARIABLES
CONST_DELIMITER = utils.CONST_DELIMITER
CONST_DELIMITERVAR = "<DELIMITER>"
CONST_LINESTART = "{\"level\":\"info\",\"message\":\"<DELIMITER>"
# limiting date
CONST_FIRST_DAY = utils.CONST_FIRST_DAY
CONST_LAST_DAY = utils.CONST_LAST_DAY
# IMAGE VARIABLES
BADGE_NONE = {"http://erebor.lti.cs.cmu.edu/quickhelper/badges/blank.png", "http://i62.tinypic.com/2mgqut2.jpg"}
BADGE_ONE = {"http://erebor.lti.cs.cmu.edu/quickhelper/badges/helper1.png", "http://i61.tinypic.com/24q1jyr.jpg"}
BADGE_TWO = {"http://erebor.lti.cs.cmu.edu/quickhelper/badges/helper2.png", "http://i58.tinypic.com/29yjuww.jpg"}
BADGE_THREE = {"http://erebor.lti.cs.cmu.edu/quickhelper/badges/helper3.png", "http://i62.tinypic.com/214w7wl.jpg"}
BADGE_FOUR = {"http://erebor.lti.cs.cmu.edu/quickhelper/badges/helper4.png", "http://i58.tinypic.com/2cgymgh.jpg"}
BADGE_NONE_TXT = "0"
BADGE_ONE_TXT = "1"
BADGE_TWO_TXT = "2"
BADGE_THREE_TXT = "3"
BADGE_FOUR_TXT = "4"
# mapping from instance ID to conditions (badge, sentence, voting, userid)
count_repeat = 0
instances_by_id = {} # instance ID -> instance object QHInstance
dict_helpers = defaultdict(list) # instanceID -> a list of helper IDs shown
dict_selected_helpers = defaultdict(list) # instanceID -> a list of helper IDs that were selected
instances_by_dupkey = defaultdict(list) # dup key -> instance objects, keeping track of all items by duplicate key
dict_all_helpers = defaultdict(list) # instance_id -> list (3) helper logfile entries to remove duplicates from later
dict_num_helpers = {} # instance_id -> num helpers selected, to add to our instances
list_no_duplicates = [] # a list of instances with duplicates removed
list_sentences = [] # a list of a list of all words in post titles + post bodies, indices refer to Instances in list_no_duplicates
# TODO: command line input for column delimiters, dates, filenames, etc.
def run():
    """
    Call the functions that do the processing for each kind of file.

    user.log is processed first because it builds the instance dictionaries the
    other logs depend on; the user-level output is written at the end because
    it needs selection counts gathered from select.log.
    :return: None
    """
    # Process each logfile; ordering matters (see docstring).
    proc_user()
    proc_selection()
    proc_helper()
    proc_vote()
    proc_click()
    remove_duplicates()  # remove duplicates from user.log and helper.log
    # user.log must be written at the end, because we need info from select.log
    list_no_duplicates.sort(key=lambda r: r.timestamp)  # sort by timestamp
    # Build the shared header row for the processed helper files.
    helper_headers = CONST_DELIMITER.join([
        utils.COL_HELPERID,
        "helperName",
        utils.COL_INSTANCEID,
        utils.COL_NUMSTARS,
        utils.COL_PREVHELPREQ,
        utils.COL_NUMWEEKS,
        utils.COL_TOPICMATCH,
        utils.COL_RELSENTENCE,
        utils.COL_IRRELSENTENCE,
        utils.COL_DATE,
        utils.COL_TIME,
        utils.COL_WASSELECTED,
        utils.COL_NUMSTARS + utils.COL_SHOWN,
        utils.COL_NUMWEEKS + utils.COL_SHOWN,
        utils.COL_TOPICMATCH + utils.COL_SHOWN,
        QHInstance.get_headers(delimiter=CONST_DELIMITER),  # instance info appended to end of each helper line
    ])
    lda = ldat(utils.NUM_LDA_TOPICS, list_sentences)
    # FIX: open all three output files with context managers so they are
    # flushed and closed even if a write raises mid-run (the original leaked
    # the handles on any exception).
    with open(FILENAME_USERLOG+EXTENSION_PROCESSED, 'w') as userfile_out, \
         open(FILENAME_HELPERLOG+EXTENSION_PROCESSED, 'w') as helperfile_out, \
         open(FILENAME_HELPERLOG+utils.FILE_SAMPLE+EXTENSION_PROCESSED, 'w') as helpersample_out:
        userfile_out.write(QHInstance.get_headers(delimiter=CONST_DELIMITER)+'\n')
        helperfile_out.write(helper_headers+"\n")
        helpersample_out.write(helper_headers+"\n")
        for qh_instance in list_no_duplicates:
            # assign LDA topic from the (already-cleaned) title + body
            topic_name = lda.predict_topic(" ".join([getattr(qh_instance, 'question_title', ''), getattr(qh_instance, 'question_body', '')]))
            setattr(qh_instance, 'lda_topic', topic_name)
            # write user instance to file
            userfile_out.write(qh_instance.to_string(delimiter=CONST_DELIMITER)+'\n')
            # print associated helpers to the helper.log
            instance_helpers = dict_all_helpers[getattr(qh_instance, 'instance_id')]
            # FIX: guard against instances with no stored helper lines —
            # random.choice([]) would raise IndexError.
            if instance_helpers:
                # write one randomly selected helper from this instance
                helpersample_out.write(random.choice(instance_helpers)+CONST_DELIMITER + qh_instance.to_string(delimiter=CONST_DELIMITER)+'\n')
            for helper_line in instance_helpers:
                helper_line += CONST_DELIMITER + qh_instance.to_string(delimiter=CONST_DELIMITER)  # adding all instance info to end of helper lines
                helperfile_out.write(helper_line+'\n')
    print("Done writing " + FILENAME_USERLOG+EXTENSION_LOGFILE+ " and " + FILENAME_HELPERLOG+EXTENSION_LOGFILE + " and " + FILENAME_HELPERLOG+utils.FILE_SAMPLE+EXTENSION_PROCESSED)
    print("\tNumber of repeats in "+FILENAME_USERLOG+EXTENSION_LOGFILE+": "+str(count_repeat)+"\n")
def proc_user():
    """
    Process the user log.
    A line in the Userfile Log represents what user-level variables the user saw (specific information about individual
    helpers shown is stored in the Helperfile Log).
    ex: {"level":"info","message":"<DELIMITER>100<DELIMITER>1413061797181100<DELIMITER>1<DELIMITER>0<DELIMITER>1<DELIMITER>1<DELIMITER>0<DELIMITER>1833503<DELIMITER>2512601<DELIMITER>1657199<DELIMITER>title1<DELIMITER>body1<DELIMITER>","timestamp":"2014-10-11T21:09:57.182Z"}
    Help Seeker User ID, Instance ID, Badge Shown?, Irrelevant Sentence Shown?, Voting Shown?, Anonymized Image Shown?, User ID Shown?, helper0, helper1, helper2, Question title, Question body

    Side effects: populates instances_by_dupkey, instances_by_id and
    dict_helpers with the instances that survive the date/researcher/body-length
    filters (helper.log, select.log and click.log processing all rely on these).
    :return: None
    """
    print("Processing "+FILENAME_USERLOG+EXTENSION_LOGFILE)
    with open(FILENAME_USERLOG+EXTENSION_LOGFILE,'r') as f:
        for line in f:
            line = line[len(CONST_LINESTART): len(line)] # Cut off the extra chars from beginning
            line = line.replace(CONST_DELIMITER, ' ') # Replace all occurrences of delimiters with empty space
            line = line.replace(CONST_DELIMITERVAR, CONST_DELIMITER) # Replace delimiter stand-in with actual delimiters
            array_line = line.split(CONST_DELIMITER)
            #print(str(len(array_line))+" " +line)
            col_user_id = array_line[0]
            col_instance_id = array_line[1]
            col_badge_shown = array_line[2]
            col_irrelevant_sentence = array_line[3]
            col_voting = array_line[4]
            col_anon_img = array_line[5]
            col_userid_shown = array_line[6]
            col_helper0 = array_line[7]
            col_helper1 = array_line[8]
            col_helper2 = array_line[9]
            col_ques_title = array_line[10]
            col_ques_body = array_line[11]
            # processing the timestamp
            col_timestamp = get_timestamp(array_line[len(array_line) - 1]) # We know the last column is always a timestamp
            # determining if this is a help request (naive keyword scan of title + body)
            col_help_req = utils.VAL_ISNOT
            if is_help_topic(col_ques_title + " " + col_ques_body):
                col_help_req = utils.VAL_IS
            # processing extra column with a URL (only present when the line has
            # 14+ columns: 12 data columns + URL + trailing timestamp)
            col_url = ""
            if len(array_line) > 13:
                col_url = array_line[12]
            # if any shown helper ID exceeds the researcher/TA ID ceiling, this is
            # a student-version instance; otherwise default to the TA version.
            # NOTE(review): the original comment here said "likely a TA version
            # instance", which contradicts the STUDENT assignment below — verify.
            col_version = utils.CONST_TA
            if (int(col_helper0) > utils.CONST_MIN_USERID or int(col_helper1) > utils.CONST_MIN_USERID or int(col_helper2) > utils.CONST_MIN_USERID):
                col_version = utils.CONST_STUDENT
            # turns 0 and 1 conditional variables into categorical variables
            if int(col_badge_shown):
                col_badge_shown = utils.VAL_IS
            else:
                col_badge_shown = utils.VAL_ISNOT
            if int(col_irrelevant_sentence):
                col_irrelevant_sentence = utils.VAL_IS
            else:
                col_irrelevant_sentence = utils.VAL_ISNOT
            if int(col_voting):
                col_voting = utils.VAL_IS
            else:
                col_voting = utils.VAL_ISNOT
            if int(col_anon_img):
                col_anon_img = utils.VAL_IS
            else:
                col_anon_img = utils.VAL_ISNOT
            if int(col_userid_shown):
                col_userid_shown = utils.VAL_IS
            else:
                col_userid_shown = utils.VAL_ISNOT
            # Create QHInstance
            user_instance = QHInstance(col_user_id, col_instance_id, col_version, col_badge_shown, col_irrelevant_sentence, col_voting, col_anon_img, col_userid_shown,col_helper0, col_helper1, col_helper2, col_ques_title, col_ques_body, col_url, col_timestamp, col_help_req)
            # keeping track of instances for printing late (if in correct date range)
            # store instance if it's during the right time period
            # AND if it's not one of the researchers' actions
            # AND only if the message body is longer than 10 characters.
            if is_during_course(col_timestamp.date()) and not is_researcher(col_user_id) and len(col_ques_body) > 10:
                instances_by_dupkey[user_instance.get_duplicate_key()].append(user_instance)
                # all duplicates get added to our dictionary of instances
                # since helper.log needs it (i.e., the first entry isn't
                # always the one that was shown!
                instances_by_id[col_instance_id] = user_instance # need this for the click.log
                # Add helper IDs to dictionary of instances to helpers
                dict_helpers[col_instance_id].append(col_helper0)
                dict_helpers[col_instance_id].append(col_helper1)
                dict_helpers[col_instance_id].append(col_helper2)
            #print(user_instance.to_string(delimiter=CONST_DELIMITER))
    print("Done processing "+FILENAME_USERLOG+EXTENSION_LOGFILE+"\n")
def proc_helper():
    """
    Process the helper log file.
    A line in the Helperfile Log represents all the information specific to the helper that the user saw.
    ex: {"level":"info","message":"<DELIMITER>1<DELIMITER>1413061797181100<DELIMITER>8<DELIMITER>http://i58.tinypic.com/2cgymgh.jpg<DELIMITER>3<DELIMITER>This student has been participating in the course for 1 weeks and the matching of his/her interest and the topic of your query is 100.0 .<DELIMITER>","timestamp":"2014-10-11T21:09:57.182Z"}

    Side effects: appends formatted helper lines to dict_all_helpers, which run()
    later writes out joined with the owning instance's columns.
    :return: None
    """
    print("Processing "+FILENAME_HELPERLOG+EXTENSION_LOGFILE)
    with open(FILENAME_HELPERLOG+EXTENSION_LOGFILE, 'r') as f:
        for line in f:
            line = line[len(CONST_LINESTART): len(line)]  # Cut off the extra chars from beginning
            line = line.replace(CONST_DELIMITER, ' ')  # Replace all occurrences of delimiters with empty space
            line = line.replace(CONST_DELIMITERVAR, CONST_DELIMITER)  # Replace delimiter stand-in with actual delimiters
            array_line = line.split(CONST_DELIMITER)
            col_helper_id = array_line[0]
            col_instance_id = array_line[1]
            col_helper_name = array_line[2]
            col_badge_stars = get_badge_stars(array_line[3])
            num_prev_inst = array_line[4]
            col_rec_sentence = array_line[5]
            col_irrel_sentence = "unknown"  # This was missing in the logs!
            col_num_weeks = get_num_weeks(col_rec_sentence)
            col_topic_match = get_topic_match(col_rec_sentence)
            # Processing the timestamp into a datetime object
            col_timestamp = get_timestamp(array_line[len(array_line) - 1])  # We know the last column is always a timestamp
            col_date = col_timestamp.date()
            col_time = col_timestamp.time()
            # FIX: the original indexed instances_by_id BEFORE the membership
            # check further down, raising KeyError for any instance missing
            # from user.log. Check first, warn, and skip the line instead.
            if col_instance_id not in instances_by_id:
                print("WARNING: "+FILENAME_HELPERLOG+EXTENSION_LOGFILE+" instance does not exist in" + FILENAME_USERLOG+EXTENSION_LOGFILE+": "+col_instance_id)
                continue
            instance = instances_by_id[col_instance_id]
            # handling num previous helper instances as none if in TA version
            # it's always '100' in TA version, and this is meaningless
            if str(getattr(instance, "version", "")) == utils.CONST_TA:
                num_prev_inst = ""
            # Constructing the new helper logfile line
            line = col_helper_id + CONST_DELIMITER
            line += col_helper_name + CONST_DELIMITER
            line += col_instance_id + CONST_DELIMITER
            line += col_badge_stars + CONST_DELIMITER + num_prev_inst + CONST_DELIMITER + col_num_weeks + CONST_DELIMITER
            line += col_topic_match + CONST_DELIMITER + col_rec_sentence + CONST_DELIMITER + col_irrel_sentence + CONST_DELIMITER
            line += str(col_date) + CONST_DELIMITER + str(col_time)
            # determine if this helper was selected
            was_selected = utils.VAL_ISNOT
            if col_instance_id not in dict_helpers:  # that instance didn't occur in user.log
                print("WARNING: instanceID in "+FILENAME_HELPERLOG+EXTENSION_LOGFILE+" not found in " + FILENAME_USERLOG+EXTENSION_LOGFILE+", not writing to file: " + col_instance_id)
            elif col_helper_id in dict_selected_helpers[col_instance_id]:  # this ID was selected in selection.log
                was_selected = utils.VAL_IS
            line += CONST_DELIMITER + str(was_selected)
            # badges - add another column that has num stars only if in isBadgeShown condition
            condition_shown = ""
            if str(getattr(instance, "cond_badge", "")) == utils.VAL_IS:
                condition_shown = col_badge_stars
            line += CONST_DELIMITER + condition_shown
            # sentences - add 2 columns that has topic match + weeks if in isRelevantSentence condition
            condition_shown = CONST_DELIMITER
            if str(getattr(instance, "cond_irrelevant_sentence", "")) == utils.VAL_ISNOT:
                condition_shown = col_num_weeks + CONST_DELIMITER + col_topic_match  # could add the relevant sentence here, if desired
            line += CONST_DELIMITER + condition_shown
            # only store line if it's in our date range and it appeared in user.log
            if is_during_course(col_date) and col_instance_id in dict_helpers:
                dict_all_helpers[col_instance_id].append(line)
            #print(line)
    print("Done processing "+FILENAME_HELPERLOG+EXTENSION_LOGFILE+"\n")
def proc_selection():
    """
    Process the selection log file.
    A line in the Selection Log represents one (of three maximum) of the helpers selected by the user.
    ex: {"level":"info","message":"<DELIMITER>11<DELIMITER>0<DELIMITER>","timestamp":"2014-10-11T21:09:57.211Z"}

    Side effects: records selections in dict_selected_helpers and selection
    counts in dict_num_helpers (used by proc_helper and remove_duplicates).
    :return: None
    """
    print("Processing "+FILENAME_SELECTIONLOG+EXTENSION_LOGFILE)
    # FIX: the original opened the output file without a context manager, so it
    # leaked (and could lose buffered writes) if processing raised mid-file.
    with open(FILENAME_SELECTIONLOG+EXTENSION_PROCESSED, 'w') as file_out:
        file_out.write(utils.COL_INSTANCEID+CONST_DELIMITER+utils.COL_HELPERSELECTED+CONST_DELIMITER+utils.COL_SELECTEDHELPER_ID+CONST_DELIMITER+utils.COL_DATE+CONST_DELIMITER+utils.COL_TIME+"\n")
        with open(FILENAME_SELECTIONLOG+EXTENSION_LOGFILE, 'r') as f:
            for line in f:
                line = line[len(CONST_LINESTART): len(line)]  # Cut off the extra chars from beginning
                line = line.replace(CONST_DELIMITER, ' ')  # Replace all occurrences of delimiters with empty space
                line = line.replace(CONST_DELIMITERVAR, CONST_DELIMITER)  # Replace delimiter stand-in with actual delimiters
                array_line = line.split(CONST_DELIMITER)
                col_instance_id = array_line[0]
                col_helper_selected = array_line[1]
                # Processing the timestamp into a datetime object
                col_timestamp = get_timestamp(array_line[len(array_line) - 1])  # We know the last column is always a timestamp
                col_date = col_timestamp.date()
                col_time = col_timestamp.time()
                # Constructing the new selection logfile line
                line = col_instance_id + CONST_DELIMITER + col_helper_selected + CONST_DELIMITER
                # retrieve helper user ID
                if len(col_helper_selected) > 1:  # i.e., it's not 0, 1, or 2 ("NONE")
                    line += ""
                else:
                    array_helpers = dict_helpers[col_instance_id]  # all helpers shown for this instance
                    helper_id = ""
                    if len(array_helpers) < int(col_helper_selected)+1:  # fewer helpers than the index of this helper
                        print("WARNING: Helper index ("+col_helper_selected+") referenced in " + FILENAME_SELECTIONLOG+EXTENSION_LOGFILE+" is outside possible number of helpers (i.e., "+str(len(array_helpers))+") for instance #: " + col_instance_id)
                    elif 0 <= int(col_helper_selected) <= 2:  # valid index is 0, 1, or 2
                        helper_id = array_helpers[int(col_helper_selected)]
                    else:  # we have an invalid helper index
                        print("WARNING: Helper index ("+col_helper_selected+") referenced in " + FILENAME_SELECTIONLOG+EXTENSION_LOGFILE+" does not exist in instance: " + col_instance_id)
                    line += helper_id
                    # record this as a selected helper for helper.log
                    # NOTE(review): helper_id may still be "" here (warning cases above)
                    # yet is recorded and counted — preserved from the original logic.
                    dict_selected_helpers[col_instance_id].append(helper_id)
                    dict_num_helpers[col_instance_id] = int(dict_num_helpers.get(col_instance_id, 0)) + 1  # add one to our number of helpers selected
                # continue...
                line += CONST_DELIMITER + str(col_date) + CONST_DELIMITER + str(col_time)
                # only write line if it's in our date range
                if is_during_course(col_date):
                    file_out.write(line+'\n')
    print("Done processing "+FILENAME_SELECTIONLOG+EXTENSION_LOGFILE+"\n")
def proc_vote():
    """
    Process the vote log file.
    A line in the Upvote Log represents each instance a Helper up or downvotes a QuickHelp request.
    ex: {"level":"info","message":"<DELIMITER>2231948<DELIMITER>1<DELIMITER>1<DELIMITER>","timestamp":"2014-10-11T06:05:13.668Z"}
    :return: None
    """
    print("Processing "+FILENAME_VOTELOG+EXTENSION_LOGFILE)
    # FIX: the original opened the output file without a context manager, so it
    # leaked (and could lose buffered writes) if processing raised mid-file.
    with open(FILENAME_VOTELOG+EXTENSION_PROCESSED, 'w') as file_out:
        file_out.write(utils.COL_HELPERID+CONST_DELIMITER+utils.COL_INSTANCEID+CONST_DELIMITER+utils.COL_VOTE+CONST_DELIMITER+utils.COL_DATE+CONST_DELIMITER+utils.COL_TIME+"\n")
        with open(FILENAME_VOTELOG+EXTENSION_LOGFILE, 'r') as f:
            for line in f:
                line = line[len(CONST_LINESTART): len(line)]  # Cut off the extra chars from beginning
                line = line.replace(CONST_DELIMITER, ' ')  # Replace all occurrences of delimiters with empty space
                line = line.replace("<i>", '')  # Vote.log has some HTML to remove
                line = line.replace("</i>", '')
                line = line.replace(CONST_DELIMITERVAR, CONST_DELIMITER)  # Replace delimiter stand-in with actual delimiters
                array_line = line.split(CONST_DELIMITER)
                col_helper_id = array_line[0]
                col_instance_id = array_line[1]
                col_vote = array_line[2]
                # Processing the timestamp into a datetime object
                col_timestamp = get_timestamp(array_line[len(array_line) - 1])  # We know the last column is always a timestamp
                col_date = col_timestamp.date()
                col_time = col_timestamp.time()
                # Constructing the new vote logfile line
                line = col_helper_id + CONST_DELIMITER + col_instance_id + CONST_DELIMITER + col_vote + CONST_DELIMITER
                line += str(col_date) + CONST_DELIMITER + str(col_time)
                # only write line if it's in our date range
                if is_during_course(col_date):
                    file_out.write(line+'\n')
    print("Done processing "+FILENAME_VOTELOG+EXTENSION_LOGFILE+"\n")
def proc_click():
    """
    Process the click log file.
    A line in the Click Log represents each time a helper clicked the emailed discussion-thread URL.
    (The original docstring said "up or downvotes", copied from the vote log by mistake.)
    ex: {"level":"info","message":"<DELIMITER><i>helper_id</i><DELIMITER><i>instance_id</i><DELIMITER>https://www.edx.org//courses/UTArlingtonX/LINK5.10x/3T2014/discussion/forum/8d9482b366ae4999b706b2d7372d8393/threads/54808da7a2a525e05300156b<DELIMITER>","timestamp":"2014-12-04T16:35:45.863Z"}
    :return: None
    """
    print("Processing "+FILENAME_CLICKLOG+EXTENSION_LOGFILE)
    # FIX: the original opened the output file without a context manager, so it
    # leaked (and could lose buffered writes) if processing raised mid-file.
    with open(FILENAME_CLICKLOG+EXTENSION_PROCESSED, 'w') as file_out:
        file_out.write(utils.COL_HELPERID+CONST_DELIMITER+utils.COL_INSTANCEID+CONST_DELIMITER+utils.COL_DATE_SENT+CONST_DELIMITER+utils.COL_TIME_SENT+CONST_DELIMITER+utils.COL_DATE_CLICKED+CONST_DELIMITER+utils.COL_TIME_CLICKED+CONST_DELIMITER+utils.COL_QTITLE+CONST_DELIMITER+utils.COL_QBODY+CONST_DELIMITER+utils.COL_URL+"\n")
        with open(FILENAME_CLICKLOG+EXTENSION_LOGFILE, 'r') as f:
            for line in f:
                line = line[len(CONST_LINESTART): len(line)]  # Cut off the extra chars from beginning
                line = line.replace(CONST_DELIMITER, ' ')  # Replace all occurrences of delimiters with empty space
                line = line.replace("<i>", '')  # Click.log has some HTML to remove
                line = line.replace("</i>", '')
                line = line.replace(CONST_DELIMITERVAR, CONST_DELIMITER)  # Replace delimiter stand-in with actual delimiters
                array_line = line.split(CONST_DELIMITER)
                col_helper_id = array_line[0]
                col_instance_id = array_line[1]
                col_url = array_line[2]
                # Processing the timestamp into a datetime object
                col_timestamp = get_timestamp(array_line[len(array_line) - 1])  # We know the last column is always a timestamp
                col_date_clicked = col_timestamp.date()
                col_time_clicked = col_timestamp.time()
                # Retrieving timestamp (and question text) from when the URL was sent
                col_date_sent = ""
                col_time_sent = ""
                qtitle = ""
                qbody = ""
                if col_instance_id in instances_by_id:  # instance id might be the literal 'instance_id'
                    sent_instance = instances_by_id[col_instance_id]
                    col_date_sent = getattr(sent_instance, 'timestamp').date()
                    col_time_sent = getattr(sent_instance, 'timestamp').time()
                    qtitle = getattr(sent_instance, 'question_title')
                    qbody = getattr(sent_instance, 'question_body')
                else:
                    print("WARNING: Encountered instance ID in " + FILENAME_CLICKLOG+EXTENSION_LOGFILE+" that is not in " + FILENAME_USERLOG+EXTENSION_LOGFILE+": " + col_instance_id)
                # Constructing the new click logfile line
                line = col_helper_id + CONST_DELIMITER + col_instance_id + CONST_DELIMITER + str(col_date_sent) + CONST_DELIMITER
                line += str(col_time_sent) + CONST_DELIMITER + str(col_date_clicked) + CONST_DELIMITER
                line += str(col_time_clicked) + CONST_DELIMITER + qtitle + CONST_DELIMITER + qbody + CONST_DELIMITER + col_url
                # only write line if the email with URL was sent in our date range
                if col_instance_id in instances_by_id and is_during_course(col_date_sent):
                    file_out.write(line+'\n')
                # TODO: there's duplicates in the click logs from clicking several times that need to be removed
    print("Done processing "+FILENAME_CLICKLOG+EXTENSION_LOGFILE+"\n")
def remove_duplicates():
    """
    Collapse duplicate instances, based on whatever key get_duplicate_key() produced.

    For each group of duplicates, prefer the instance that actually had helpers
    selected (the one that was shown); otherwise build a merged instance whose
    non-matching condition columns are blanked. Cleans the chosen instance's
    title/body, records it in list_no_duplicates, stores its bag-of-words in
    list_sentences, and tallies repeats in count_repeat.
    :return: A list of QHInstances with all duplicates removed
    """
    global count_repeat
    for dup_key, dups in instances_by_dupkey.items():
        chosen = None  # instance with a selection (the one shown)
        for candidate in dups:
            selected_count = dict_num_helpers.get(getattr(candidate, 'instance_id'), 0)
            if selected_count > 0:  # it had helpers selected, so it's the one
                chosen = candidate
                setattr(candidate, 'num_helpers_selected', selected_count)
        if chosen is None:
            # No selection anywhere: merge, clearing non-matching condition variables
            chosen = create_new_duplicate(dups)
        # Clean the chosen duplicate's message text
        title = ldat.clean_string(getattr(chosen, 'question_title', ''))
        body = ldat.clean_string(getattr(chosen, 'question_body', ''))
        setattr(chosen, 'question_title', title)
        setattr(chosen, 'question_body', body)
        list_no_duplicates.append(chosen)  # record as the canonical instance
        # Store the message for topic modeling
        list_sentences.append(ldat.to_bow(title + ' ' + body))
        if len(dups) > 1:  # counting our duplicates
            count_repeat += len(dups) - 1
    return list_no_duplicates
def create_new_duplicate(duplicates):
    """
    Given a list of duplicates, return a QHInstance that only has entries for columns
    that have identical values for all items in the list (i.e. "isBadgeShown" is 0 for all instances, etc).

    Refactor: the original repeated the same compare-and-blank logic by hand for
    all seven attributes; this collapses it into one data-driven loop.
    :param duplicates: a list of items that are pre-determined to be duplicates
    :return: a new QHInstance containing only info the duplicates share
    """
    # Creating our default duplicate object to return (shallow copy of the first)
    new_duplicate = copy.copy(duplicates[0])
    if len(duplicates) > 1:  # we have [several] duplicates
        # Condition/helper attributes that must agree across all duplicates.
        # (cond_anon_img omitted: the image condition was never implemented.)
        attrs = ('cond_badge', 'cond_irrelevant_sentence', 'cond_voting',
                 'cond_user_id', 'id_helper0', 'id_helper1', 'id_helper2')
        for attr in attrs:
            reference = getattr(new_duplicate, attr)
            # blank the column if ANY duplicate disagrees with the first
            if any(getattr(dup, attr) != reference for dup in duplicates):
                setattr(new_duplicate, attr, "")
    return new_duplicate
def is_during_course(instance_date):
    """
    Determine if given date is within the range of dates the course took place.
    :param instance_date: date to check if it's in range (a datetime.date, or None)
    :return: True if given date is in our restricted time range
    """
    if instance_date is None:
        # FIX: the original concatenated None into the message string, which
        # raised TypeError instead of printing the intended error.
        print("ERROR:: logfileMOOC.is_during_course(): Cannot process date column: " + str(instance_date))
        return False
    return CONST_FIRST_DAY <= instance_date <= CONST_LAST_DAY
def is_researcher(userID):
    """
    Check whether the given user ID belongs to one of the researchers/TAs.
    TAs and researchers were assigned user IDs below the minimum real-user ID,
    and a few extra accounts are listed explicitly in utils.exclude_ids.
    :param userID: the user ID to check
    :return: True if userID is that of a researcher/TA
    """
    uid = int(userID)
    return uid < utils.CONST_MIN_USERID or uid in utils.exclude_ids
def get_timestamp(tstamp):
    """
    Extract a datetime from the messy trailing timestamp field of a log line.
    :param tstamp: raw string such as '","timestamp":"2014-10-11T21:09:57.182Z"}'
    :return: the parsed datetime object, or the cleaned string if parsing failed
    """
    # Keep everything between the first colon and the last two characters,
    # strip the quotes, then drop the trailing UTC 'Z' marker before parsing.
    cleaned = tstamp[tstamp.index(':') + 1: len(tstamp) - 2].replace("\"", '')
    cleaned = cleaned.replace('Z', '')  # TODO: handle the UTC designator properly
    try:
        return datetime.datetime.strptime(cleaned, '%Y-%m-%dT%H:%M:%S.%f')
    except ValueError as err:
        # Best-effort: report the problem and fall back to the unparsed string.
        print(cleaned, err)
        return cleaned
def get_badge_stars(url):
    """
    Determine how many stars were on the badge, depending on badge URL.
    :param url: the url of the image of the badge shown
    :return: a string containing the number of stars in that badge,
             or "" if the URL is not a recognized badge image
    """
    badge_map = ((BADGE_NONE, BADGE_NONE_TXT),
                 (BADGE_ONE, BADGE_ONE_TXT),
                 (BADGE_TWO, BADGE_TWO_TXT),
                 (BADGE_THREE, BADGE_THREE_TXT),
                 (BADGE_FOUR, BADGE_FOUR_TXT))
    for badge_urls, stars_txt in badge_map:
        if url in badge_urls:
            return stars_txt
    # FIX: the original implicitly returned None here, which crashed callers
    # that concatenate the result into an output line; return "" instead.
    print("ERROR:: logfileMOOC.get_badge_stars(): Missing badge ID: " + url)
    return ""
def get_topic_match(sentence):
    """
    Find what the topic match percentage was, given the topic match sentence.
    :param sentence: the relevant sentence containing topic match info
    :return: topic match percentage as a string, or "" for TA sentences
    """
    if "Teaching Assistants" in sentence:
        return ""  # TA sentences carry no topic-match figure
    # The percentage sits in the last few characters of the sentence.
    raw = sentence[len(sentence) - 7:len(sentence) - 1]
    match = float(raw.strip("abcdefghijklmnopqrstuvwxyz% "))
    if match < 1:  # turning decimal fractions into percentage numbers
        match = match * 100
    return str(match)
def get_num_weeks(sentence):
    """
    Find how many weeks the helper's been around, given the topic match sentence.
    :param sentence: the relevant sentence containing the number of weeks info
    :return: number of weeks in the course, as a string ("" if unavailable)
    """
    if "Teaching Assistants" in sentence:
        return ""  # TA sentences carry no week count
    # FIX: the original sliced the two characters before "week", which truncated
    # multi-digit counts (e.g. "12 weeks" -> "2 ") and left a trailing space.
    # Extract the full number preceding "week" instead.
    match = re.search(r"(\d+)\s*week", sentence)
    return match.group(1) if match else ""
def is_help_topic(sentence):
    """
    Determine if the given string (message post) contains a question or help request.
    :param sentence: a string sentence / message post
    :return: True if the string is about help seeking
    """
    # TODO: this is a super naive way to determine this
    keywords = ("help", "question", "?", "dunno", "n't know",
                "confus", "struggl", "lost", "stuck", "know how")
    return any(keyword in sentence for keyword in keywords)
# So that logfileMOOC can act as either a reusable module, or as a standalone program.
if __name__ == '__main__':
    print("Running logfileMOOC")
    run()