In [79]:
import json
import time
from tqdm import trange, tqdm
import os
from typing import Dict, List, Tuple, Union

import csv
import unicodedata
import html
from io import StringIO
from html.parser import HTMLParser
from piazza_api import Piazza

import pandas as pd
from pandas import DataFrame, Series
import numpy as np
from numpy import ndarray 
from nptyping import NDArray, Int, Shape
import matplotlib.pyplot as plt
import seaborn as sns

from copy import deepcopy

from piazza_api.network import Network

# Path of the JSON credentials file that login() reads.
CRED_FILE = "creds.json"


# --- Custom types ---
# Answers of a post, keyed by answer kind, each mapping field name -> value.
Answer = Dict[str, Dict[str, Union[str, int]]]
# A raw Piazza post as loaded from JSON.
Post = Dict[str, Union[str, Union[str, int, List]]]

# --- Macros ---
# Codes for who the answer is coming from.
STUDENT = 0
INSTRUCTOR = 1
STUDENT_ENDORSED_ANSWERER = 2


In [11]:
class MyHTMLParser(HTMLParser):
    """HTML parser that accumulates only the text content it is fed.

    Adapted from reference [1] (see the References section of this notebook).
    """

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        # Character references (e.g. &amp;) are decoded before handle_data() sees them.
        self.convert_charrefs = True
        # Buffer collecting every text chunk reported by the parser.
        self.text = StringIO()

    def handle_data(self, d):
        """Append one chunk of text content to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Strip HTML tags from ``html`` and substitute HTML entities.

    :param html: HTML fragment to clean. NOTE: the name shadows the stdlib
        ``html`` module inside this function; it is kept so keyword callers
        keep working.
    :returns: the text content of ``html``
    """
    s = MyHTMLParser()
    s.feed(html)
    # feed() may hold back trailing text that could still be an unfinished
    # character reference (e.g. the "&T" in "AT&T"); close() forces the parser
    # to flush it. Without this call such input came back truncated or empty.
    s.close()
    return s.get_data()

In [12]:
def login() -> Tuple[dict, Network]:
    """Log the user into Piazza with the credentials stored in CRED_FILE.

    The credentials file is JSON and must provide the keys 'email',
    'password' and 'courseid'.

    :returns: ``(user_profile, course)`` -- the profile returned by
        ``Piazza.get_user_profile()`` and the network object returned by
        ``Piazza.network(courseid)``
    """
    with open(CRED_FILE) as cred_handle:
        credentials = json.load(cred_handle)

    # Read every key up front so a malformed credentials file fails before any login attempt.
    email, password, courseid = (
        credentials['email'],
        credentials['password'],
        credentials['courseid'],
    )

    client: Piazza = Piazza()
    client.user_login(email, password)
    profile: dict = client.get_user_profile()
    network: Network = client.network(courseid)
    return profile, network

In [13]:
def get_post_creator(post):
    """Return the uid stored on the post's first 'create' change-log entry.

    :param post: post dict with a 'change_log' list of entries
    :returns: the 'uid' of the first entry whose 'type' is 'create', or None
        when the change log has no such entry
    """
    creator_ids = (
        event['uid'] for event in post['change_log'] if event['type'] == 'create'
    )
    return next(creator_ids, None)


def get_post_created(post):
    """Return the time the post was created.

    :param post: post dict with a 'change_log' list of entries
    :returns: the 'when' value of the first entry whose 'type' is 'create',
        or None when the change log has no such entry
    """
    creation_times = (
        event['when'] for event in post['change_log'] if event['type'] == 'create'
    )
    return next(creation_times, None)


def get_posts_by_student(filename:str, student_id:str) -> List[Post]:
    """Return the posts in a JSON export that were created by one user.

    :param filename: path of a JSON file containing a list of posts
    :param student_id: uid to match against each post's creator
    :returns: the posts whose creator (per get_post_creator) equals student_id,
        in file order
    """
    with open(filename, 'r') as handle:
        return [
            candidate
            for candidate in json.load(handle)
            if get_post_creator(candidate) == student_id
        ]


def get_endorsed_students(course: Network) -> Tuple[Dict, Dict]:
    """Split the course's users by whether their 'endorser' field is truthy.

    :param course: object providing ``get_all_users()``
    :returns: ``(endorsed_users, non_endorsed_users)``, each mapping user id
        to user name
    """
    endorsed, not_endorsed = {}, {}
    for member in course.get_all_users():
        bucket = endorsed if member['endorser'] else not_endorsed
        bucket[member['id']] = member['name']
    return endorsed, not_endorsed



def get_answers(post:Post, endorsed_students: Dict) -> Answer:
    """Extract the student answer and the instructor answer of a post.

    For each of the kinds 's_answer' and 'i_answer', the first child of the
    post with that 'type' is used; a kind with no matching child maps to an
    empty dict.

    :param post: post dict with 'children' (and a 'change_log', read through
        get_post_creator)
    :param endorsed_students: container of endorsed user ids, used for the
        'is_endorser' field of the student answer
    :returns: ``{'s_answer': {...}, 'i_answer': {...}}`` where a filled entry
        has 'text', 'poster', 'date', 'num_helpful', 'is_helpful' and, for the
        student answer only, 'is_endorser'
    """
    answers = {'s_answer': {}, 'i_answer': {}}

    for kind, fields in answers.items():
        first_match = next(
            (child for child in post['children'] if child['type'] == kind),
            None,
        )
        if first_match is None:
            continue

        # history[0] is the revision this notebook treats as the current one.
        fields['text'] = strip_tags(first_match['history'][0]['content'])
        fields['poster'] = first_match['history'][0]['uid']
        fields['date'] = first_match['history'][0]['created']
        fields['num_helpful'] = len(first_match['tag_endorse_arr'])
        # "Helpful" means the author of the question is among the endorsers.
        fields['is_helpful'] = get_post_creator(post) in first_match['tag_endorse_arr']

        if kind == "s_answer":
            answerer_id = first_match['history'][0]['uid']
            fields['is_endorser'] = answerer_id in endorsed_students

    return answers

In [254]:
def export_posts_json(filename:str, course:Network) -> None:
    """Dump every post of the course into a JSON file.

    Does nothing except print a notice when ``filename`` already exists.
    Whatever has been collected is written even if iterating the posts raises,
    so an interrupted export still leaves a (partial) file behind.

    :param filename: path of the JSON file to create
    :param course: object providing ``iter_all_posts()``
    """
    if os.path.exists(filename):
        print(f"{filename} already exists!")
        return

    post_stream = course.iter_all_posts()
    collected = []
    try:
        for fetched in tqdm(post_stream):
            collected.append(fetched)
    finally:
        # Runs on success and on failure alike: keep what was fetched so far.
        print('------------------------------------')
        with open(filename, 'w') as out_file:
            json.dump(collected, out_file)

In [102]:

def json_to_csv(json_file_path: str, csv_filename: str, course: Network, is_overwrite_csv: bool=False):
    """Convert an exported list of Piazza posts (JSON) into a CSV of questions.

    One row is written per post whose 'type' is 'question'; other posts are
    skipped. Missing student/instructor answers are written as empty cells.

    :param json_file_path: path of the JSON file holding a list of posts
    :param csv_filename: path of the CSV file to write; an existing file is
        overwritten
    :param course: used to look up which users are endorsed (via
        get_endorsed_students) so the student answerer can be flagged
    :param is_overwrite_csv: NOTE(review): accepted but never consulted -- the
        CSV is overwritten regardless of its value. Kept for interface
        compatibility; decide whether it should guard the overwrite.
    """
    header = [
        "post_id", "question_title", "question", "folders",
        "student_poster_name", "date_question_posted",
        "student_answer", "student_answer_name", "date_student_answer_posted",
        "is_student_endorsed", "is_student_helpful",
        "instructor_answer", "instructor_answer_name",
        "date_instructor_answer_posted", "is_instructor_helpful",
        "is_followup",
    ]

    endorsed_students = get_endorsed_students(course)[0]

    # Parse the input first so a bad JSON file does not leave a truncated CSV behind.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        posts = json.load(json_file)

    # newline='' lets csv.writer control line endings itself; without it (and
    # with the extra '\n' that used to follow each row) the file contained
    # blank lines between records.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        post_writer = csv.writer(csv_file)
        post_writer.writerow(header)

        for post in tqdm(posts):
            if post['type'] != 'question':
                continue

            # history[0] is used as the newest revision of the question
            # (index -1 would be the oldest).
            question = post['history'][0]
            question_title = strip_tags(question['subject'])
            question_content = strip_tags(question['content'])
            folders = ','.join(post['folders'])
            date_created = get_post_created(post)

            answers = get_answers(post, endorsed_students)
            student_answer = answers['s_answer']
            instructor_answer = answers['i_answer']

            row = [post['nr'], question_title, question_content, folders,
                   get_post_creator(post), date_created]

            if student_answer:
                row += [student_answer['text'], student_answer['poster'],
                        student_answer['date'],
                        str(student_answer['is_endorser']),
                        str(student_answer['is_helpful'])]
            else:
                row += [None] * 5

            if instructor_answer:
                row += [instructor_answer['text'], instructor_answer['poster'],
                        instructor_answer['date'],
                        str(instructor_answer['is_helpful'])]
            else:
                row += [None] * 4

            has_followup = any(child['type'] == 'followup' for child in post['children'])
            row.append(str(has_followup))

            post_writer.writerow(row)

In [103]:
# Log in once; `course` is the handle passed to the export/convert helpers below.
user_profile,course = login()

# Scratch check of the endorsed-user lookup (kept for reference):
# e = get_endorsed_students(course)
# n = get_endorsed_students(course)

# e

# Raw export of all posts to JSON. export_posts_json() returns early when the
# file already exists; uncomment to (re)create it if the file is missing.
#export_posts_json("csc108_fall2021.json", course)
# Build the question-level CSV from the exported JSON.
json_to_csv("./csc108_fall2021.json", "csc108_fall2021.csv", course)


100%|██████████| 3337/3337 [00:00<00:00, 10244.09it/s]


In [101]:
# Exploration cell: fetch one user and one post and pretty-print the post's raw JSON.
user_profile,course = login()

user = course.get_users(['krz7jwkviui2p3'])
post = course.get_post('kspl0arizjk45a')

# NOTE(review): `p` is assigned but not used anywhere in this cell.
p = course.get_feed()

text = json.dumps(post, sort_keys=True, indent=4)


# NOTE(review): `users` is assigned but not used anywhere in this cell.
users = course.get_all_users()

# NOTE(review): a bare expression is only displayed when it is the last line
# of a cell, so this line shows nothing.
user

print(text)





{
    "anon_icons": true,
    "anon_map": {
        "ksoq5p0f71h12q": "a_0"
    },
    "bookmarked": 5,
    "bucket_name": "Today",
    "bucket_order": 3,
    "change_log": [
        {
            "anon": "no",
            "data": "kspl0arkosj45b",
            "type": "create",
            "uid": "ksoq5p0f71h12q",
            "v": "all",
            "when": "2021-08-24T04:42:07Z"
        },
        {
            "anon": "no",
            "data": "kspl3iukzq81v2",
            "type": "update",
            "uid": "ksoq5p0f71h12q",
            "when": "2021-08-24T04:44:38Z"
        },
        {
            "anon": "no",
            "data": "kspl4w0ow5173u",
            "to": "kspl0arizjk45a",
            "type": "i_answer",
            "uid": "gzcyozk0MBl",
            "when": "2021-08-24T04:45:41Z"
        },
        {
            "anon": "no",
            "cid": "kspl5pjxm5sr0",
            "to": "kspl0arizjk45a",
            "type": "followup",
            "uid": "jm5gjpvvd4o1qe",
    

How should we handle posts with images? Do we want the `<img>` tags stripped? Think about how that will affect textual features
such as response length and sentiment.

What elements do questions and answers contain?
LaTeX, code snippets, images/screenshots, links, lists, and references to previous posts (e.g. @356).

fields that can be added: num_answer_imgs, ...

We can either remove posts with images or include a special field called "num_imgs" so that we can distinguish posts that contain images.

## Feature Engineering

In [15]:
# Location of the question-level CSV (the file written by the json_to_csv call above).
FILEPATH_CSV = "./csc108_fall2021.csv"

In [106]:
# Load the question-level CSV; index_col=0 makes the first column (post_id) the index.
data = pd.read_csv(FILEPATH_CSV, index_col=0)
data.tail()

# Scratch: list the ids of endorsed student answerers.
# NOTE(review): the column name below has a leading space that the CSV header
# written by json_to_csv does not have -- fix before re-enabling.
# data.keys()
# students = data[data[' is_student_endorsed'] == True]['student_answer_name']
# for s in students:
#     print(s)


Unnamed: 0_level_0,question_title,question,folders,student_poster_name,date_question_posted,student_answer,student_answer_name,date_student_answer_posted,is_student_endorsed,is_student_helpful,instructor_answer,instructor_answer_name,date_instructor_answer_posted,is_instructor_helpful,is_followup
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
16,Spatial Skills: Pretest - Shapes difficult to ...,"Hello,\n\n As I was doing the Spatial Skills P...",spatial,ksoq5p0f71h12q,2021-08-24T04:42:07Z,"Its the dark mode making the shapes look odd, ...",ksoq6px0rgz43d,2021-08-30T06:59:49Z,False,False,Could you give us an example (screenshot) of o...,gzcyozk0MBl,2021-08-24T04:45:41Z,False,True
15,Question I make based on Week 1 material,Hello:\nThere are question made by me to check...,general,keivl0bhdc52f5,2021-08-24T02:40:40Z,Q1) 8**6//4+6+6*8\n = 262144//4+6+6*8\n =6...,ksoq5rauj41c2,2021-08-25T06:55:53Z,False,False,,,,,True
13,"Spatial skills: Orthographic Views 1, #3",Hello:\nAs I am doing the week 1 spatial Skill...,spatial,keivl0bhdc52f5,2021-08-23T20:08:34Z,Just drop hints here: There will be some edges...,keivl0bhdc52f5,2021-08-23T21:17:37Z,True,False,The key to this question is the dotted lines. ...,k4ddfmb0gsb1h,2021-08-23T21:48:22Z,False,True
12,I don't think I am in the right lecture,I am currently enrolled in CSC108H5 F LEC 9106...,lecture,ksoq61vj96i2d4,2021-08-23T19:23:56Z,,,,,,"Hi Yaseen! Yes you are, all the lectures secti...",k4ddfmb0gsb1h,2021-08-24T00:56:12Z,True,False
10,Ask Hints on Grading activities,Hello:\nIf when we are stuck on some question ...,general,keivl0bhdc52f5,2021-08-23T16:53:37Z,,,,,,General questions -- ones that you can ask wit...,gzcyozk0MBl,2021-08-23T17:01:32Z,True,True


New schema: ("post_id, student_poster_id, close_to_deadline, is_followup,"
    
    "answerer_id, date_answer_posted, reputation, is_helpful"
    
    )


    date_question_posted
    - close to deadline: yes/no

In [74]:
import nltk
import ssl
from nltk.tokenize import word_tokenize
from collections import namedtuple

# Workaround so the nltk download below can proceed when HTTPS certificate
# verification fails on this machine (presumably the reason it was added).
# NOTE(review): this swaps ssl's default HTTPS context factory for the
# unverified one for the rest of the session, not just for nltk -- confirm it
# is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; leave the default in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the 'punkt' package (presumably the tokenizer data word_tokenize needs).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Brandon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [119]:
def get_length(text:str) -> int:
    """Return the number of word tokens in ``text``.

    :param text: the text to measure, or NaN for a missing value (presumably
        how pandas represents an empty CSV cell -- confirm against the caller)
    :returns: ``len(word_tokenize(text))`` for a string, 0 for NaN
    :raises TypeError: if ``text`` is neither a string nor NaN
    """
    # Imported locally so the function does not rely on an import made in
    # another cell of the notebook.
    from math import isnan

    if isinstance(text, str):
        return len(word_tokenize(text))

    try:
        is_missing = isnan(text)
    except TypeError:
        # isnan() rejects non-numeric values such as None or a list.
        is_missing = False

    if not is_missing:
        # Explicit error instead of `assert`, which is stripped under `python -O`
        # and would let unexpected values silently count as length 0.
        raise TypeError(
            f"get_length expected a str or NaN, got {type(text).__name__}: {text!r}"
        )

    return 0


def add_answer(augmented_data:List[List], append_row:List, post_row:tuple, poster_dict: Dict[str, int], num_instances:int, answer_type:int) -> int:
    """
    Append the student or instructor answer fields to augmented_data.

    Nothing is appended when the post has no answer of the requested kind,
    i.e. when its ``<poster>_answer`` attribute is not a string.

    :param augmented_data: table (list of rows) the completed row is appended to
    :param append_row: partially filled row; it is extended in place with the
        answerer's integer id, ``answer_type`` and the helpful flag (1/0)
    :param post_row: namedtuple containing information about the current Piazza post
    :param poster_dict: maps poster id -> integer id; updated in place when the
        answerer has not been seen before
    :param num_instances: integer id to assign to a previously unseen answerer
    :param answer_type: STUDENT selects the ``student_*`` fields; any other
        value selects the ``instructor_*`` fields
    :returns: 1 if a new entry was added to poster_dict (so the caller should
        advance its counter), otherwise 0
    """
    poster = 'student' if answer_type == STUDENT else 'instructor'

    if not isinstance(getattr(post_row, f"{poster}_answer"), str):
        # No answer of this kind on the post.
        return 0

    poster_id = getattr(post_row, f'{poster}_answer_name')
    is_new_poster = poster_id not in poster_dict
    if is_new_poster:
        poster_dict[poster_id] = num_instances

    is_helpful = 1 if getattr(post_row, f"is_{poster}_helpful") else 0

    append_row.extend([poster_dict[poster_id], answer_type, is_helpful])
    augmented_data.append(append_row)

    return int(is_new_poster)

    
 

# Point = namedtuple("Point", "x y")
# point = Point(2, 4)
# t = 'x'
# getattr(point, t)
    

In [127]:
from math import isnan

def augment_data(data: DataFrame) -> DataFrame:
    """Expand the question-level table into one row per (question, answer).

    Each question contributes a row for its student answer and a row for its
    instructor answer, whichever exist. Poster ids are replaced by small
    integers assigned in order of first appearance; students (question posters
    and student answerers) and instructors are numbered independently.

    :param data: question-level frame indexed by post id, with the columns
        student_poster_name, question, is_followup and the
        student_*/instructor_* answer columns read by add_answer
    :returns: frame with columns post_id, student_poster_id, question_length,
        is_followup, answerer_id, reputation, is_helpful. NOTE(review): the
        'reputation' column holds the answer-type code (STUDENT/INSTRUCTOR)
        passed to add_answer, not a reputation score -- consider renaming.
    """
    columns = ['post_id', 'student_poster_id', 'question_length',
               'is_followup', 'answerer_id', 'reputation', 'is_helpful']

    augmented_rows = []
    studentid_to_int = {}
    instructorid_to_int = {}
    num_students, num_instructors = 0, 0

    for r in data.itertuples():
        if r.student_poster_name not in studentid_to_int:
            studentid_to_int[r.student_poster_name] = num_students
            num_students += 1

        # Fields shared by every answer row of this question.
        base_row = [
            r.Index,
            studentid_to_int[r.student_poster_name],
            get_length(r.question),
            1 if r.is_followup else 0,
        ]

        # Add separate rows for the student and the instructor answer; each
        # call gets its own copy because add_answer extends the row in place.
        num_students += add_answer(augmented_rows, list(base_row), r, studentid_to_int, num_students, STUDENT)
        num_instructors += add_answer(augmented_rows, list(base_row), r, instructorid_to_int, num_instructors, INSTRUCTOR)

    # Build the frame straight from the rows: going through np.array() first
    # made an empty result fail with a shape-mismatch ValueError.
    return pd.DataFrame(augmented_rows, columns=columns)



# Build the per-answer feature table from the question-level frame loaded above.
augmented_df = augment_data(data)
#augmented_data[0].shape




In [131]:
# Preview the first rows of the per-answer table.
augmented_df.head()
# TODO: use post number instead of manually assigning number -> easier to search

Unnamed: 0,post_id,student_poster_id,question_length,is_followup,answerer_id,reputation,is_helpful
0,3809,0,23,0,0,1,0
1,3808,1,53,0,2,0,0
2,3807,3,0,0,4,0,0
3,3807,3,0,0,0,1,0
4,3806,5,93,1,1,1,0


In [89]:
# Scratch check: adding a bool to an int works, and False counts as 0.
a = 0
a += False
a
# Scratch: sample sentences for trying out word_tokenize.
# sentence = "How the Meta skill be graded, I only got 22 of the marks, however I have finished all the meta skills with answering all the questions,how that happened, thank you so much!"
# s2 = "num_col = longest_chain(matrix[row][col:]) num_rows = 1  temp_last_col = 0 largest_matrix = num_col * num_rows"
# tokenized = word_tokenize(s2)
# print(tokenized)


0

## References

[1] https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python