In [2]:
import html
import json
import os
import time
from typing import Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import trange, tqdm


In [3]:
# Location of the posts CSV that this notebook loads, relative to the
# notebook's working directory.
# NOTE(review): file name suggests the CSC108 Fall 2021 offering — confirm.
FILEPATH_CSV: str = "./csc108_fall2021.csv"

In [5]:
# Load the exported posts; the first CSV column (post_id) becomes the index.
data = pd.read_csv(FILEPATH_CSV, index_col=0)

# Timestamps are stored as ISO-8601 strings with a trailing "Z"; e.g. the first
# row has date_question_posted = 2022-01-03T00:08:09Z and
# date_instructor_answer_posted = 2022-01-03T00:51:25Z.

# Only the last expression of a cell is rendered, so the frame is displayed
# exactly once here (pandas truncates the middle rows in the rich display).
data

Unnamed: 0_level_0,question_title,question,folders,student_poster_name,date_question_posted,student_answer,student_answer_name,date_student_answer_posted,num_student_helpful,instructor_answer,instructor_answer_name,date_instructor_answer_posted,num_instructor_helpful,is_followup
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
kxxxcewo3k2kd,Summer Courses,Which of the computer science courses are goin...,utm/life/other,ksoq5pmzwq514u,2022-01-03T00:08:09Z,,,,,"We will not know until March, when the summer ...",gzcyozk0MBl,2022-01-03T00:51:25Z,0.0,False
kxxrckc7d8827t,What should I do?,"I failed csc108, so I'm going to re-take it. B...",general,ksoq5tpdnh31kw,2022-01-02T21:20:18Z,There's a chance that csc148 can be taken in s...,ksoq6exja9d3b9,2022-01-02T21:26:51Z,1.0,,,,,False
kxxg3uqr8kc4ui,What is the yotube account that made pcrs videos?,,pcrs,ks9l55ygvlm30f,2022-01-02T16:05:36Z,https://youtube.com/channel/UCu8NnRGTGxHe96Le0...,kr2bzqy2dok7ou,2022-01-02T16:07:36Z,0.0,There isn't a single account. Many of the vide...,gzcyozk0MBl,2022-01-02T16:08:36Z,0.0,False
kxwl9h27m1h33z,UTM Exam fail question,"Hi, I'm just confused about how the fail thing...",utm/life/other,ksoq5j54ygqp0,2022-01-02T01:42:10Z,,,,,The grade you've received on Acorn is Final. S...,jc6jo3gvkr8542,2022-01-02T01:43:55Z,0.0,True
kxves5858h42re,In-person Learning For Winter Term,"Hello, will all courses (such as CSC148H5 and ...",utm/life/other,ksoq6sru6w34ak,2022-01-01T05:52:58Z,I believe so.,ksoq6sajaqn48y,2022-01-01T05:54:54Z,0.0,> Does this mean I’ll be taking online courses...,jc6jo3gvkr8542,2022-01-01T06:13:30Z,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
kspl0arizjk45a,Spatial Skills: Pretest - Shapes difficult to ...,"Hello,\n\n As I was doing the Spatial Skills P...",spatial,ksoq5p0f71h12q,2021-08-24T04:42:07Z,"Its the dark mode making the shapes look odd, ...",ksoq6px0rgz43d,2021-08-30T06:59:49Z,0.0,Could you give us an example (screenshot) of o...,gzcyozk0MBl,2021-08-24T04:45:41Z,0.0,True
kspgo42greo387,Question I make based on Week 1 material,Hello:\nThere are question made by me to check...,general,keivl0bhdc52f5,2021-08-24T02:40:40Z,Q1) 8**6//4+6+6*8\n = 262144//4+6+6*8\n =6...,ksoq5rauj41c2,2021-08-25T06:55:53Z,0.0,,,,,True
ksp2nuwazar3ao,"Spatial skills: Orthographic Views 1, #3",Hello:\nAs I am doing the week 1 spatial Skill...,spatial,keivl0bhdc52f5,2021-08-23T20:08:34Z,Just drop hints here: There will be some edges...,keivl0bhdc52f5,2021-08-23T21:17:37Z,1.0,The key to this question is the dotted lines. ...,k4ddfmb0gsb1h,2021-08-23T21:48:22Z,1.0,True
ksp12grb4ub7aj,I don't think I am in the right lecture,I am currently enrolled in CSC108H5 F LEC 9106...,lecture,ksoq61vj96i2d4,2021-08-23T19:23:56Z,,,,,"Hi Yaseen! Yes you are, all the lectures secti...",k4ddfmb0gsb1h,2021-08-24T00:56:12Z,2.0,False


In [74]:
# Sanity check: html.unescape turns HTML character references (here the
# numeric reference &#39;) back into plain characters.
# NOTE(review): presumably needed because the post text contains HTML
# entities/tags — confirm against the raw CSV.
# `html` is imported in the notebook's import cell.
html.unescape('hello I&#39;m Bob')

"hello I'm Bob"

## Textual and Non-Textual Features

### Textual - no NLP techniques
- question/answer length
  - code snippets start with "```" or "<pre> </pre>" tags
  - omit q&a with images
  - ignore images, hyperlinks and other html tags -> raw q/a length
- level of answer detail
  - links, imgs, code-snippets

- duplicate post (rename this feature to "reference to other post")
  - check for annotation in response to determine if duplicate?

### Textual - NLP techniques
- sentiment analysis **vs. politeness**
- grammar/spelling 
- answer relevance to question
- look at individual words and how they relate to is_helpful

- discretize these features


### Non-Textual
- *asker id* 
- response time: difference in time between when the question was posted and when the response was posted (uses the responder date)
- *question creation date* 
- category/folder
  - use general categories that are common across classes
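    - general/logistics/administrative/misc, lecture, assignment/lab/pcrs/spatial, test/exam
    - encoded as 0, 1, 2, 3, 4 (TODO: only four groups are listed above but five codes are given; confirm the fifth group)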
- is_followup
- **reputation** rename to **authority**
- public vs. private
  - affects helpful bins since cannot be more than 1 helpful
- **close to deadline?**
  - need to manually augment data for this
  - make continuous and add boundaries for use with decision tree 



* Features in *italics* cannot be used for predicting future helpful answers, since they are unique to each question-answer pair




## Mutual Information

- idea that helpful = "positive feedback" instead of actually "helpful"
- students click "helpful" as "positive feedback" instead of because answer is actually "helpful"

### Classification problem
- binary classification, or a 3-class variant.
  - (question, answer) -> helpful/not-helpful (binary), or very helpful / mildly helpful / unhelpful (break into 3 discrete categories)
    - 0, 1, > 1
    - most responses only get 0,1 or > 1 helpful


(question, s_answer, i_answer)
- split into 2?
  - (question, s_answer, reputation)
  - (question, i_answer, reputation)  
  - reputation = student|instructor|instr_endorsed_answerer
- take helpful answer. If none, choose arbitrarily
  - cannot use reputation as feature



## Feedback

- ensure metrics are justified or come from literature
- make a table of features with columns: whether the feature exists in the literature; if it doesn't, a brief justification; mutual information score; other scores




