In [193]:
import json
import time
from tqdm import trange, tqdm
import os
from typing import Dict, List, Tuple, Union

import csv
import unicodedata
import html
from io import StringIO
from html.parser import HTMLParser
from piazza_api import Piazza

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from copy import deepcopy

from piazza_api.network import Network

CRED_FILE = "creds.json"


"""Custom Types"""
Answer = Dict[str,Dict[str,Union[str,int]]]
Post = Dict[str,Union[str, Union[str,int,List]]]

"""Macros"""
# who the answer is coming from
STUDENT, INSTRUCTOR, STUDENT_ENDORSED_ANSWERER = 0, 1, 2


In [194]:
class MyHTMLParser(HTMLParser):
    """taken from: [1]"""
    
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    """strips html tags and substitutes html entities """
    #html = html.unescape(html)
    s =  MyHTMLParser()
    s.feed(html)
    return s.get_data()

In [169]:
def login() -> Tuple[dict, Network]:
    """logs user into Piazza"""

    email:str 
    password:str 
    courseid:str 

    with open(CRED_FILE) as f:
        creds = json.load(f)
        email, password, courseid = creds['email'], creds['password'], creds['courseid']


    #print(f"email: {email} \npassword: {password} \ncourseid: {courseid}")


    p: Piazza = Piazza()
    p.user_login(email, password)
    user_profile: dict = p.get_user_profile()
    course: Network = p.network(courseid)
    return user_profile, course

In [257]:
def get_post_creator(post):
    for entry in post['change_log']:
        if entry['type'] == 'create':
            return entry['uid']


def get_post_created(post):
    """get time post was created"""
    for entry in post['change_log']:
        if entry['type'] == 'create':
            return entry['when']


def get_posts_by_student(filename:str, student_id:str) -> List[Post]:
    student_posts = []
    with open(filename, 'r') as f:
        all_posts = json.load(f)
        for p in all_posts:
            if get_post_creator(p) == student_id:
                student_posts.append(p)
    return student_posts


def get_endorsed_students(course: Network) -> Tuple[Dict, Dict]:
    endorsed_users = {}
    non_endorsed_users = {}
    users = course.get_all_users()
    for u in users:
        if u['endorser']:
            endorsed_users[u['id']] = u['name']
        else:
            non_endorsed_users[u['id']] = u['name']


    return endorsed_users, non_endorsed_users



def get_answers(post:Post, endorsed_students: Dict) -> Answer:
    """ Get student and instructor answers
    """

    answers = {}
    answers['s_answer'] = {}
    answers['i_answer'] = {}

    for t in answers.keys():
        for ans in post['children']:
            if ans['type'] == t:      
                vals = answers[t]
                text = ans['history'][0]['content']
                text = strip_tags(text)
                vals['text'] = text
                vals['poster'] = ans['history'][0]['uid']
                vals['date'] = ans['history'][0]['created']
                vals['num_helpful'] = len(ans['tag_endorse_arr'])
                if get_post_creator(post) in ans['tag_endorse_arr']:
                    vals['is_helpful'] = True 
                else:
                    vals['is_helpful'] = False

                if ans['type'] == "s_answer":
                    
                    student_poster_id = ans['history'][0]['uid'] # id of the most recent student answer editor
                     # check if student is endorsed
                    vals['is_endorser'] = False
                    if student_poster_id in endorsed_students:
                        vals['is_endorser'] = True
                   
                break
    
    return answers

In [254]:
def export_posts_json(filename:str, course:Network) -> None:
    """Create json of all posts saved in current directory"""

    if os.path.exists(filename):
        print(f"{filename} already exists!")
        return 
    posts = course.iter_all_posts()
    all_posts = []
    #text = json.dumps(post['children'][1], sort_keys=True, indent=4)
    try:
        for p in tqdm(posts):
            all_posts.append(p)
    
    finally:
        print('------------------------------------')
        with open(filename, 'w') as f:
            json.dump(all_posts, f)

In [255]:

def json_to_csv(json_file_path: str, csv_filename: str, course: Network, is_overwrite_csv: bool=False):
    """Use param::course to extract student answerer to determine whether they are endorsed"""

    schema = ("post_id,question_title,question,folders,student_poster_name,date_question_posted," 
    "student_answer,student_answer_name,date_student_answer_posted, is_student_endorsed, is_student_helpful,"
    "instructor_answer,instructor_answer_name,date_instructor_answer_posted,is_instructor_helpful," 
    "is_followup\n")

    # schema = ("post_id,question_title,question,folders,student_poster_name,date_question_posted," 
    # "answer,answer_poster,date_answer_posted,num_helpful,"
    # "instructor_answer,instructor_answer_name,date_instructor_answer_posted,num_instructor_helpful," 
    # "is_followup\n")
 

    parser = MyHTMLParser()

    endorsed_students = get_endorsed_students(course)[0]

    with open(json_file_path, 'r') as json_file:
        with open(csv_filename, 'w') as csv_file:
            csv_file.write(schema)
            posts = json.load(json_file)
            for post in tqdm(posts):   
                row = [] 
                if post['type'] == 'question':
                    
                    question = post['history'][0] # newest update of question. Change index to -1 for oldest
                    #question_title =  html.unescape(question['subject'])
                    question_title = strip_tags(question['subject'])
                    question_content = strip_tags(question['content'])
                    folders = ','.join(post['folders'])
                    date_created = get_post_created(post)
                    answers = get_answers(post, endorsed_students)
                    student_answer = answers['s_answer']
                    instructor_answer = answers['i_answer']
                    #print(instructor_answer)


                    row = [post['id'], question_title, question_content, folders, get_post_creator(post), date_created]
                    s_row, i_row = [], []
                    if student_answer:
                        s_row = [student_answer['text'], student_answer['poster'], student_answer['date'], str(student_answer['is_endorser']), str(student_answer['is_helpful'])] 
                    else:
                        s_row = [None, None, None, None, None]

                    if instructor_answer:
                        i_row = [instructor_answer['text'], instructor_answer['poster'], instructor_answer['date'], str(instructor_answer['is_helpful'])] 
                    else:
                        i_row = [None, None, None, None]
                    
                    row = row + s_row + i_row

                    is_followup = 'False'

                    for c in post['children']:
                        if c['type'] == 'followup':
                            is_followup = 'True'
                    
                    row += [is_followup]
                    #print(row)


                   
                    post_writer = csv.writer(csv_file)
                    post_writer.writerow(row)
                    
                    csv_file.write('\n')

In [258]:
user_profile,course = login()

# e = get_endorsed_students(course)
# n = get_endorsed_students(course)

# e

#export_posts_json("csc108_fall2021.json", course)
json_to_csv("./csc108_fall2021.json", "csc108_fall2021.csv", course)


({'kr2c06c5de9ys': 'Abbas Peer Mohammed',
  'jlvii0jzogr570': 'Akshit Goyal',
  'k076o83kby22zd': 'Alexander Feng',
  'ksoq6iqigk43l9': 'Andrey Jimiev',
  'k076nwvvxkw2gt': 'Andriy Drozd',
  'kr2f2ypgo5823': 'Anh Thu Doan',
  'ksoq5y6vr7g246': 'Arian Sadeghi',
  'ktc8bhhayfh2cz': 'Armaan Randhawa',
  'k076nxw6wrv2in': 'Axel Visan',
  'jlzdsl24rjx1oy': 'Bhavya Kasera',
  'ksoq6hhohot3i1': 'Cindy Qiao',
  'kr2c09v56917i': 'Dev Patel',
  'ksoq6pjc9kq42q': 'Eric Shi',
  'ksoq64i18j52j0': 'Eric So',
  'kr2c03lcia8si': 'Fares Alkorani',
  'jlsuxpe85qa2tw': 'Georges Hanna',
  'ksoq6bksq5y33e': 'Giang Bui',
  'jm5gjpvvd4o1qe': 'Haocheng Hu',
  'k06tufhdmjt5tc': 'Harry Ye',
  'kr2bzx1u7wucx': 'Hassan El-Sheikha',
  'ksoq3ogq9sx27i': 'Henrik Zimmermann',
  'ksoq6sajaqn48y': 'Inaam Azeezur-Rahman',
  'ksoq6qj95yc45b': 'Ishan Singh',
  'kqwk2jhcdau31c': 'Japleen Jaria',
  'k4rvcgssqp22bd': 'Jarrod Servilla',
  'ksoq65knjke2nd': 'Jasleen Binning',
  'ksoq6exja9d3b9': 'Jasmine Guruparan',
  'ksoq6go

In [263]:
user_profile,course = login()

user = course.get_users(['krz7jwkviui2p3'])
post = course.get_post('3799')

p = course.get_feed()

text = json.dumps(post, sort_keys=True, indent=4)


users = course.get_all_users()

user

p





{'unseen_items': 0,
 'more': True,
 'last_networks': ['kxhl6o1cccn2oo',
  'ktd8x3i99m165',
  'ky4l8kjvu2fv6',
  'l2pzk13y85t33m',
  'jw2uhydb1dljb',
  'krz7jwkviui2p3'],
 'drafts': {},
 'sort': 'updated',
 'avg_cnt': None,
 'users': 1,
 'tags': {'instructor': ['general',
   'lecture',
   'lab',
   'lab∕lab1',
   'lab∕lab2',
   'lab∕lab3',
   'lab∕lab4',
   'lab∕lab5',
   'lab∕lab6',
   'lab∕lab7',
   'lab∕lab8',
   'lab∕lab9',
   'lab∕lab10',
   'tests/exam',
   'tests/exam∕test1',
   'tests/exam∕test2',
   'tests/exam∕exam',
   'utm/life/other',
   'spatial',
   'pcrs'],
  'instructor_upd': {'pcrs': 1641139536483,
   'lab∕lab4': 1635113988492,
   'lab∕lab3': 1636874840714,
   'lab∕lab2': 1636501098582,
   'utm/life/other': 1641168489336,
   'lab∕lab1': 1633899830524,
   'lab∕lab10': 1640144521020,
   'tests/exam': 1640273653732,
   'lab': 1640144521020,
   'tests/exam∕test2': 1639980631303,
   'general': 1641158418679,
   'tests/exam∕exam': 1640273653732,
   'lecture': 1640493805776,


How to handle posts with imgs? Do we want the img tags stripped? Think about how it will affect textual features
response length, sentiment, 

what elements do q&a contain?
latex, code snippets, imgs/screenshots, links, lists, annotations to prev posts (i.e. @356)

fields that can be added: num_answer_imgs, ...

can remove posts with imgs or include a special field called "num_imgs" so can distinguish b/w posts that have imgs

## Feature Engineering

In [85]:
FILEPATH_CSV = "./csc108_fall2021.csv"

In [265]:
data = pd.read_csv(FILEPATH_CSV, index_col=0)
data.tail()
data.keys()
students = data[data[' is_student_endorsed'] == True]['student_answer_name']
for s in students:
    print(s)


ksoq6exja9d3b9
ksoq6sajaqn48y
kr1fzpj7x1ya1
kr2c09v56917i
kr2c09v56917i
kr1fzpj7x1ya1
ksoq6pjc9kq42q
kr2c0bexfm11al
kqwk2jhcdau31c
ksoq3ogq9sx27i
kr2c03lcia8si
ktc8bhhayfh2cz
kr1fzpj7x1ya1
ksoq3ogq9sx27i
ksoq3ogq9sx27i
ksoq6exja9d3b9
kr2bzuamy3q6v
kr2c0bexfm11al
ksoq5wipt9620g
ksoq6uzdhan4fe
kr2bzuamy3q6v
kr1fzpj7x1ya1
ksoq616byuu2bk
ksoq5fic49oh1
ksoq5wipt9620g
ksoq616byuu2bk
ksoq6mqmqik3v3
ksoq616byuu2bk
ksoq616byuu2bk
ksoq6sajaqn48y
kr2bzx1u7wucx
ktc8bhhayfh2cz
kr2c09v56917i
kr2bzx1u7wucx
ksoq5wipt9620g
ksoq6exja9d3b9
kr2bzz3d7yohk
ksoq6cg2rie35i
ksoq6pjc9kq42q
ksoq6cg2rie35i
kr2bzuamy3q6v
ksoq6cg2rie35i
ktc8bhhayfh2cz
kr2c03lcia8si
ksoq6o0rhux3xy
kr2c09v56917i
kr2c0bexfm11al
ksoq6cg2rie35i
kr2c0bexfm11al
ktc8bhhayfh2cz
kr2c09v56917i
kr2c09v56917i
ksoq6pjc9kq42q
ksoq6pjc9kq42q
ksoq6cg2rie35i
ktc8bhhayfh2cz
ksoq6pjc9kq42q
ksoq6pjc9kq42q
ksoq5wipt9620g
ksoq6qj95yc45b
ksoq6exja9d3b9
ksoq6j73y9b3m3
kr2f2ypgo5823
kr2bzz3d7yohk
kr2bzz3d7yohk
kr2c0bexfm11al
ksoq6mqmqik3v3
ksoq6mqmqik3v3
ks

New schema: (post_id,student_poster_id,close_to_deadline, "is_followup"" 
    
    "answerer_id, date_answer_posted, reputation, is_helpful"
    
    )


    date_question_posted
    - close to deadline: yes/no

In [266]:
augmented_data = []

studentid_to_int = {}
instructorid_to_int = {}
num_posts, num_students, num_instructors = 0, 0, 0

for r in data.itertuples():
    new_row = [num_posts]
    if r.student_poster_name not in studentid_to_int:
        studentid_to_int[r.student_poster_name] = num_students 
        num_students += 1
    new_row.append(studentid_to_int[r.student_poster_name])

    is_followup = 1 if r.is_followup else 0
    new_row.append(is_followup)

    if r.student_answer:
        student_answer_row = deepcopy(new_row)
        if r.student_answer_name not in studentid_to_int:
            studentid_to_int[r.student_answer_name] = num_students 
            num_students += 1

        is_helpful = 1 if r.is_student_helpful else 0

        student_answer_row.append(studentid_to_int[r.student_answer_name])
        student_answer_row.append(STUDENT)
        student_answer_row.append(is_helpful)
        augmented_data.append(student_answer_row)

    
    if r.instructor_answer:
        instructor_answer_row = deepcopy(new_row)
        if r.instructor_answer_name not in instructorid_to_int:
            instructorid_to_int[r.instructor_answer_name] = num_instructors
            num_instructors += 1
        is_helpful = 1 if r.is_instructor_helpful else 0
        instructor_answer_row.append(instructorid_to_int[r.instructor_answer_name])
        instructor_answer_row.append(INSTRUCTOR)
        instructor_answer_row.append(is_helpful)
        
        augmented_data.append(instructor_answer_row)
        
    num_posts += 1

print(num_posts)
print(len(augmented_data[0]))

augmented_data = np.array(augmented_data)
augmented_data.shape


AttributeError: 'Pandas' object has no attribute 'is_student_helpful'

In [267]:
augmented_df = pd.DataFrame(augmented_data, columns=['post_id', 'student_poster_id', 'is_followup', 'answerer_id', 'reputation', 'is_helpful'])
augmented_df

Unnamed: 0,post_id,student_poster_id,is_followup,answerer_id,reputation,is_helpful


In [190]:
a = np.array([[1, 2, 3], [4, 5, 6]])
a.shape

a = {}

if not a:
    print('NOT')

# df2 = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
#                    columns=['a', 'b', 'c'])

NOT


## References

[1] https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python