In [22]:
import json
import time
import math
from datetime import datetime
import os
from tqdm import trange, tqdm
import re
from copy import deepcopy


import csv
import unicodedata
import html
from io import StringIO
from html.parser import HTMLParser


import pandas as pd
from pandas import DataFrame, Series
import numpy as np
from numpy import ndarray 
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2




from nptyping import NDArray, Int, Shape
from typing import Dict, List, Tuple, Union


from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import nltk
import ssl
from nltk.tokenize import word_tokenize
from collections import namedtuple


from piazza_api import Piazza
from piazza_api.network import Network

# --- Configuration ---
# JSON file from which login() reads the 'email', 'password' and 'courseid' keys.
CRED_FILE = "creds.json"
# Upper bound on the number of question rows json_to_csv() writes.
NUM_POSTS_TO_SCRAPE = 1000


# --- Custom types ---
Answer = Dict[str, Dict[str, Union[str, int]]]
Post = Dict[str, Union[str, Union[str, int, List]]]

# --- Macros ---
# Who the answer is coming from.
STUDENT = 0
INSTRUCTOR = 1
STUDENT_ENDORSED_ANSWERER = 2
# Stand-in value used by answer_response_time() when the computed time is 0.
EPSILON = 1e-05

# Folder categories.
GENERAL = 0
LECTURES = 1
ASSIGNMENTS = 2
TESTS = 3

# Labels for sentiment.
NEGATIVE = 0
NEUTRAL = 1
POSITIVE = 2

In [2]:
class MyHTMLParser(HTMLParser):
    """HTML parser that discards all markup and keeps only the text content.

    Adapted from reference [1].
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset() is
        # needed. convert_charrefs=True makes the parser decode character
        # references (e.g. ``&lt;``) before the text reaches handle_data().
        super().__init__(convert_charrefs=True)
        self.strict = False  # kept from the original recipe
        self.text = StringIO()

    def handle_data(self, d):
        """Accumulate a chunk of text found outside of any tag."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Strip html tags and substitute html entities.

    Note: the parameter name shadows the imported ``html`` module inside this
    function.

    :param html: markup string to clean
    :returns: the text content of ``html`` with tags removed and character
        references decoded
    """
    s = MyHTMLParser()
    s.feed(html)
    # Without close() the parser keeps trailing text buffered when it ends in
    # an unterminated '&' (e.g. "R&D"), because it is still waiting for the
    # rest of a possible character reference. close() flushes that text.
    s.close()
    return s.get_data()

In [3]:
def login() -> Tuple[dict, Network]:
    """Log the user into Piazza using the credentials stored in CRED_FILE.

    CRED_FILE must be a JSON object with the keys 'email', 'password' and
    'courseid'.

    :returns: (user profile, course network) for the logged-in user
    """
    with open(CRED_FILE) as cred_handle:
        credentials = json.load(cred_handle)

    email: str = credentials['email']
    password: str = credentials['password']
    courseid: str = credentials['courseid']

    session: Piazza = Piazza()
    session.user_login(email, password)
    user_profile: dict = session.get_user_profile()
    course: Network = session.network(courseid)
    return user_profile, course

## Some Utilities

In [4]:
def get_post_creator(post: Post):
    """Return the uid stored on the first 'create' entry of the post's change log.

    Returns None when the change log has no 'create' entry.
    """
    creation_entries = (entry for entry in post['change_log'] if entry['type'] == 'create')
    first_creation = next(creation_entries, None)
    return None if first_creation is None else first_creation['uid']


def get_post_created(post: Post):
    """Return the 'when' timestamp of the first 'create' entry of the post's change log.

    Returns None when the change log has no 'create' entry.
    """
    creation_entries = (entry for entry in post['change_log'] if entry['type'] == 'create')
    first_creation = next(creation_entries, None)
    return None if first_creation is None else first_creation['when']


def get_posts_by_student(filename:str, student_id:str) -> List[Post]:
    """Load the posts stored in the JSON file and keep those created by student_id.

    :param filename: path to a JSON file containing a list of posts
    :param student_id: uid compared against each post's creator
    :returns: the matching posts, in file order
    """
    with open(filename, 'r') as posts_file:
        return [post for post in json.load(posts_file) if get_post_creator(post) == student_id]


def get_endorsed_students(course: Network) -> Tuple[Dict, Dict]:
    """Split the course's users by whether their 'endorser' field is truthy.

    :param course: object whose get_all_users() yields user dicts with the
        keys 'endorser', 'id' and 'name'
    :returns: (endorsed, non_endorsed), each mapping user id -> user name
    """
    endorsed_users, non_endorsed_users = {}, {}
    for user in course.get_all_users():
        bucket = endorsed_users if user['endorser'] else non_endorsed_users
        bucket[user['id']] = user['name']
    return endorsed_users, non_endorsed_users


def is_private(post: Post) -> bool:
    """Return True if the post's first 'create' change-log entry has visibility 'private'.

    Returns None when the change log has no 'create' entry.
    """
    creation = next((entry for entry in post['change_log'] if entry['type'] == 'create'), None)
    if creation is None:
        return None
    return creation['v'] == 'private'



def get_answers(post:Post, endorsed_students: Dict) -> Answer:
    """Get the student answer and the instructor answer of a post.

    For each of the types 's_answer' and 'i_answer', the first child of the
    post with that type is used.

    :param post: raw Piazza post; its 'children' list is scanned
    :param endorsed_students: container keyed by user id; only membership is
        tested. **Actually not a valid way of checking endorsement**
    :returns: dict with the keys 's_answer' and 'i_answer'. A value is an empty
        dict when the post has no answer of that type; otherwise it holds
        'text', 'poster', 'date', 'num_helpful' and 'is_helpful', plus
        'is_endorser' for the student answer.
    """
    answers = {'s_answer': {}, 'i_answer': {}}

    for answer_type, vals in answers.items():
        ans = next((child for child in post['children'] if child['type'] == answer_type), None)
        if ans is None:
            continue

        # history[0] is treated as the most recent revision of the answer
        # (same convention as for questions in json_to_csv).
        latest = ans['history'][0]
        endorsers = ans['tag_endorse_arr']

        vals['text'] = latest['content']
        vals['poster'] = latest['uid']
        vals['date'] = latest['created']
        vals['num_helpful'] = len(endorsers)
        # Helpful means: the student who created the post endorsed this answer.
        vals['is_helpful'] = get_post_creator(post) in endorsers

        if answer_type == 's_answer':
            vals['is_endorser'] = latest['uid'] in endorsed_students

    return answers

## Step 1: Convert raw Piazza data to csv file 

In [27]:
def export_posts_json(filename:str, course:Network) -> None:
    """Dump every post of the course into a JSON file.

    Does nothing (beyond printing a notice) if ``filename`` already exists.
    Whatever has been collected is written even if iteration is interrupted.

    :param filename: path of the JSON file to create
    :param course: object whose iter_all_posts() yields the posts to save
    """
    if os.path.exists(filename):
        print(f"{filename} already exists!")
        return

    post_iter = course.iter_all_posts()
    collected = []
    try:
        for post in tqdm(post_iter):
            # Pause one second per post (presumably to avoid hammering the Piazza API).
            time.sleep(1)
            collected.append(post)
    finally:
        print('------------------------------------')
        with open(filename, 'w') as out_file:
            json.dump(collected, out_file)

In [25]:

def json_to_csv(json_file_path: str, csv_filename: str, course: Network, is_overwrite_csv: bool=False) -> None:
    """
    Convert exported Piazza posts (json) into a csv with one row per question.

    Only posts whose 'type' is 'question' are written, and at most
    NUM_POSTS_TO_SCRAPE of them. The csv file is always (re)written.

    :param json_file_path: Path to json file to convert to csv
    :param csv_filename: Name of csv file to save to cur directory
    :param course: Used to extract student profile to determine whether they are endorsed. **Actually not a valid way of checking**
    :param is_overwrite_csv: currently unused; kept so existing callers keep working
    """
    header = [
        'post_id', 'is_private', 'question_title', 'question', 'folders', 'student_poster_name', 'date_question_posted',
        'student_answer', 'student_answer_name', 'date_student_answer_posted', 'is_student_endorsed', 'is_student_helpful',
        'instructor_answer', 'instructor_answer_name', 'date_instructor_answer_posted', 'is_instructor_helpful',
        'is_followup',
    ]

    endorsed_students = get_endorsed_students(course)[0]

    with open(json_file_path, 'r') as json_file:
        posts = json.load(json_file)

    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise rows can end up separated by blank
    # lines. The writer terminates every row itself, so no extra '\n' is
    # written after a row.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        post_writer = csv.writer(csv_file)
        post_writer.writerow(header)

        num_posts = 0
        for post in tqdm(posts):
            if post['type'] != 'question':
                continue

            question = post['history'][0]  # newest update of question. Change index to -1 for oldest
            answers = get_answers(post, endorsed_students)
            student_answer = answers['s_answer']
            instructor_answer = answers['i_answer']

            row = [
                post['nr'],
                is_private(post),
                question['subject'],
                question['content'],
                ','.join(post['folders']),
                get_post_creator(post),
                get_post_created(post),
            ]

            # A missing answer is written as empty cells so that every row has
            # the same number of columns as the header.
            if student_answer:
                row += [student_answer['text'], student_answer['poster'], student_answer['date'],
                        str(student_answer['is_endorser']), str(student_answer['is_helpful'])]
            else:
                row += [None] * 5

            if instructor_answer:
                row += [instructor_answer['text'], instructor_answer['poster'], instructor_answer['date'],
                        str(instructor_answer['is_helpful'])]
            else:
                row += [None] * 4

            has_followup = any(child['type'] == 'followup' for child in post['children'])
            row.append(str(has_followup))

            post_writer.writerow(row)

            num_posts += 1
            if num_posts == NUM_POSTS_TO_SCRAPE:
                break






In [29]:
# Log into Piazza, then convert the previously exported posts to csv.
user_profile,course = login()

# One-off export of the raw posts to json; commented out here, so the json
# file read below must already exist.
#export_posts_json("csc108_fall2020.json", course)
json_to_csv("./csc108_fall2020.json", "csc108_fall2020.csv", course)


100%|██████████| 871/871 [00:00<00:00, 59546.52it/s]


## Manually viewing users and posts

In [8]:
# user_profile,course = login()

# user = course.get_users(['krz7jwkviui2p3'])
# post = course.get_post('3809')

# p = course.get_feed()

# text = json.dumps(post, sort_keys=True, indent=4)


# users = course.get_all_users()

# user

# print(text)


How should posts with images be handled? Do we want the img tags stripped? Think about how that will affect textual features
such as response length and sentiment.

What elements do questions and answers contain?
LaTeX, code snippets, images/screenshots, links, lists, references to previous posts (e.g. @356)

fields that can be added: num_answer_imgs, ...

We can either remove posts with images or add a special field called "num_imgs" so that posts containing images can be distinguished from those that do not.

## Step 2: Feature Engineering - Transform csv into another csv with relevant features

In [9]:
# Input csv read into `data` for feature engineering.
FILEPATH_CSV = "./csc108_fall2021.csv"
# Output csv written by augment_data() when save_to_csv is true.
AUGMENTED_FILEPATH_CSV = "./csc108_fall2021_sentiment_aug.csv"

In [10]:
# Load the csv; its first column (presumably post_id — TODO confirm) becomes the index.
data = pd.read_csv(FILEPATH_CSV, index_col=0)
# NOTE: a cell only displays its last expression, so this tail() and the
# first len() below produce no visible output.
data.tail()

# Number of private posts (not displayed).
len(data[data['is_private'] == True])
# Number of public posts (this is the value the cell displays).
len(data[data['is_private'] == False])

# data.keys()
# students = data[data[' is_student_endorsed'] == True]['student_answer_name']
# for s in students:
#     print(s)


1101

New schema: (post_id, student_poster_id, close_to_deadline, is_followup,

    answerer_id, date_answer_posted, reputation, is_helpful

    )


    date_question_posted
    - close to deadline: yes/no

In [11]:
# If this Python build exposes ssl._create_unverified_context, install it as
# the factory for default HTTPS contexts before downloading the nltk data.
# NOTE(review): this looks like it disables certificate verification for all
# default HTTPS contexts created later in this process, not only for the nltk
# download — confirm this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The attribute does not exist on this build; keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the 'punkt' data (presumably needed by word_tokenize — TODO confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jaipers5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Utilities

In [12]:
def get_length(text:str) -> int:
    """Return the number of word tokens in ``text``.

    :param text: the text to measure, or NaN for a missing value
    :returns: number of tokens produced by word_tokenize; 0 when text is NaN
    :raises TypeError: if text is neither a string nor NaN
    """
    if isinstance(text, str):
        return len(word_tokenize(text))

    # Use math.isnan (math is imported at the top of the file) instead of a
    # bare isnan, which only exists after a later cell has imported it.
    if not math.isnan(text):
        raise TypeError(f"expected a string or NaN, got {text!r}")
    return 0


def is_references(text: str) -> bool:
    """Tell whether the text points to another post (e.g. @256) or contains a hyperlink.

    :param text: question|answer with html stripping
    :returns: True if an '@' followed by a digit occurs, or if 'http' occurs
    """
    has_post_mention = re.search(r'@+\d', text) is not None
    return has_post_mention or 'http' in text

def level_of_detail(text: str) -> bool:
    """Tell whether the text embeds an image or a code snippet.

    :param text: raw question|answer without html stripping so can detect imgs/code-snippets
    :returns: True if '<img' or '<pre' occurs in the text
    """
    return any(marker in text for marker in ('<img', '<pre'))


def answer_response_time(t1:str, t2:str) -> Union[int, float]:
    """Time between question (t1) and answer (t2).

    :param t1: ISO-format timestamp of the question, optionally ending in 'Z'
    :param t2: ISO-format timestamp of the answer, optionally ending in 'Z'
    :returns: Answer response time in mins, rounded up. A result of 0 is
        replaced by EPSILON (presumably so the value stays non-zero for a
        log scale — TODO confirm). Add option to use log scale?
    """
    # Drop only a trailing 'Z' so the timestamps parse as naive datetimes on
    # every Python version; timestamps without it are left untouched.
    d1, d2 = (datetime.fromisoformat(ts[:-1] if ts.endswith('Z') else ts) for ts in (t1, t2))
    delta = d2 - d1
    # True division is required here: floor division would round down before
    # math.ceil ever sees the fraction.
    response_time = math.ceil(delta.total_seconds() / 60)
    if response_time == 0:
        response_time = EPSILON
    return response_time
    
def get_category(folder: str, folder_set: dict[str, int]) -> int:
    """ Map a post's folders to a category id, using only the 1st folder
        when several are given. Varies b/w classes.

        :param folder: folder1,folder2, ...
        :param folder_set: mapping of every known folder name to its category id

        Precondition: the 1st folder should be a key of folder_set

    """
    primary_folder, _, _ = folder.partition(',')
    assert primary_folder in folder_set
    return folder_set[primary_folder]


def get_sentiment(text: str, sentiment_model) -> int:
    """Classify the text with sentiment_model and map its label to a sentiment constant.

    :param text: text to classify
    :param sentiment_model: callable whose result is indexed as ``[0]['label']``
    :returns: NEGATIVE for 'LABEL_0', NEUTRAL for 'LABEL_1', POSITIVE for any other label
    """
    label = sentiment_model(text)[0]['label']
    return {'LABEL_0': NEGATIVE, 'LABEL_1': NEUTRAL}.get(label, POSITIVE)
 

    

def add_answer(augmented_data:List[List], append_row:List, post_row:tuple, poster_dict: Dict[str, str], num_instances:int, answer_type:int,
              add_nlp_features=True, sentiment_model=None) -> int:
    """
    Append student or instructor answer fields to augmented_data.

    Nothing is appended when the post has no answer of the requested type
    (the answer cell is NaN rather than a string).

    :param augmented_data: table to add append_row to
    :param append_row: partially filled row to be completed; mutated in place
    :param post_row: namedtuple containing information about the current Piazza post
    :param poster_dict: maps answerer id -> integer id; an unseen answerer is
        registered under the value num_instances
    :param num_instances: integer id to hand out if the answerer is not yet in poster_dict
    :param answer_type: INSTRUCTOR|STUDENT
    :param add_nlp_features: when true, the answer's sentiment label is added to the row
    :param sentiment_model: model passed to get_sentiment; required when
        add_nlp_features is true
    :returns: True if a new answerer was added to poster_dict, otherwise False
        (the caller adds this to its running user counter)
    :raises ValueError: if an answer exists, add_nlp_features is true and
        sentiment_model is None
    """
    poster = 'student' if answer_type == STUDENT else 'instructor'

    answer = getattr(post_row, f"{poster}_answer")
    if not isinstance(answer, str):  # answer is nan: don't add an entry
        return False

    if add_nlp_features and sentiment_model is None:
        raise ValueError("sentiment_model must be provided when add_nlp_features is True")

    stripped_answer = strip_tags(answer)

    poster_id = getattr(post_row, f'{poster}_answer_name')
    increment_num_instances = poster_id not in poster_dict
    if increment_num_instances:
        poster_dict[poster_id] = num_instances

    is_helpful = 1 if getattr(post_row, f"is_{poster}_helpful") else 0

    append_row.append(poster_dict[poster_id])
    append_row.append(get_length(stripped_answer))
    append_row.append(is_references(stripped_answer))
    # Images / code snippets can only be detected on the raw (unstripped) html.
    append_row.append(level_of_detail(answer))

    if add_nlp_features:
        append_row.append(get_sentiment(stripped_answer, sentiment_model))

    question_time = getattr(post_row, 'date_question_posted')
    answer_time = getattr(post_row, f'date_{poster}_answer_posted')
    append_row.append(answer_response_time(question_time, answer_time))

    append_row.append(answer_type)
    append_row.append(is_helpful)
    augmented_data.append(append_row)

    return increment_num_instances

    
#'general,lecture'.split(',')
    

In [13]:
#get_sentiment()
# a = "Cats"
# #get_sentiment(a, sentiment_pipeline)

# sentiment_pipeline(a)[0]['label']

# tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# text = "Good night 😊"
# encoded_input = tokenizer(a, return_tensors='pt', truncation=True, max_length=512)
# output = model(**encoded_input)
# scores = output[0][0].detach().numpy()
#scores = softmax(scores)
#scores

In [14]:
# Model identifier passed to transformers' pipeline() to build the sentiment model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# truncation=True with max_length=512 caps the length of the tokenized input.
sentiment_pipeline = pipeline(model=MODEL, max_length=512, truncation=True)

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

### Add non-textual and textual features

Note that the NLP features take a while to compute

In [15]:
from math import isnan

def augment_data(data: DataFrame, folder_set: dict[str, int], add_nlp_features=True, sentiment_model=None, save_to_csv=True) -> DataFrame:
    """Turn the per-post table into a feature table with one row per answer.

    Every post contributes one row for its student answer and one row for its
    instructor answer, whenever the respective answer exists. Each row holds
    the question features followed by the answer features.

    :param data: per-post table indexed by post id, with the columns read
        below (is_private, folders, student_poster_name, question,
        is_followup and the answer columns used by add_answer)
    :param folder_set: maps folder name -> category id (see get_category)
    :param add_nlp_features: when true, question and answer sentiment columns are added
    :param sentiment_model: model passed to get_sentiment; required when
        add_nlp_features is true
    :param save_to_csv: when true, the result is also written to AUGMENTED_FILEPATH_CSV
    :returns: the feature table; its 'reputation' column holds the answer
        type (STUDENT|INSTRUCTOR) passed to add_answer
    :raises ValueError: if add_nlp_features is true but sentiment_model is None
    """
    # Fail early with a clear message instead of deep inside the loop.
    if add_nlp_features and sentiment_model is None:
        raise ValueError("sentiment_model must be provided when add_nlp_features is True")

    augmented_data = []

    # Separate id tables for students and instructors; both draw their
    # integer ids from the same counter, so ids never collide.
    studentid_to_int = {}
    instructorid_to_int = {}
    num_users = 0

    for r in tqdm(data.itertuples(), total=len(data.index), colour='green', desc="Augment csv data"):
        new_row = [r.Index, r.is_private, get_category(r.folders, folder_set)]

        if r.student_poster_name not in studentid_to_int:
            studentid_to_int[r.student_poster_name] = num_users
            num_users += 1
        new_row.append(studentid_to_int[r.student_poster_name])

        # A missing question (NaN) is treated as empty text with neutral sentiment.
        has_question = isinstance(r.question, str)
        raw_question = r.question if has_question else ""
        stripped_q = strip_tags(raw_question) if has_question else ""

        new_row.append(get_length(stripped_q))
        new_row.append(is_references(stripped_q))
        # Images / code snippets can only be detected on the raw html.
        new_row.append(level_of_detail(raw_question))

        if add_nlp_features:
            if has_question:
                new_row.append(get_sentiment(stripped_q, sentiment_model))
            else:
                new_row.append(NEUTRAL)

        new_row.append(1 if r.is_followup else 0)

        # add separate rows for student and instructor answer; add_answer
        # reports whether it registered a new answerer.
        num_users += add_answer(augmented_data, deepcopy(new_row), r, studentid_to_int, num_users, STUDENT, add_nlp_features, sentiment_model)
        num_users += add_answer(augmented_data, deepcopy(new_row), r, instructorid_to_int, num_users, INSTRUCTOR, add_nlp_features, sentiment_model)

    augmented_data = np.array(augmented_data)
    print(augmented_data.shape)

    # The sentiment columns exist only when NLP features were computed.
    question_sentiment_cols = ['question_sentiment'] if add_nlp_features else []
    answer_sentiment_cols = ['answer_sentiment'] if add_nlp_features else []
    columns = (
        ['post_id', 'is_private', 'category', 'student_poster_id',
         'question_length', 'is_question_references', 'question_lod']
        + question_sentiment_cols
        + ['is_followup',
           'answerer_id', 'answer_length', 'is_answer_references', 'answer_lod']
        + answer_sentiment_cols
        + ['response_time', 'reputation', 'is_helpful']
    )
    augmented_df = pd.DataFrame(augmented_data, columns=columns)

    if save_to_csv:
        augmented_df.index.name = 'ID'
        augmented_df.to_csv(AUGMENTED_FILEPATH_CSV)

    return augmented_df


# Folder name -> category id for the fall 2021 offering (usable as the
# folder_set argument of augment_data / get_category).
csc108_fall2021_categories = {'general': GENERAL, 'lecture': LECTURES, 'lab': ASSIGNMENTS , 'tests/exam': TESTS, 'utm/life/other': GENERAL}

# Folder name -> category id for the fall 2020 offering.
csc108_fall2020_categories = {'general': GENERAL, 'administrative': GENERAL, 'post/uni-life': GENERAL, 'random': GENERAL, 'lecture': LECTURES, 'minor_labs': ASSIGNMENTS, 'major_lab1': ASSIGNMENTS, 'major_lab2': ASSIGNMENTS, 'major_lab3': ASSIGNMENTS, 'major_lab4': ASSIGNMENTS, 'major_lab5': ASSIGNMENTS, 'a1': ASSIGNMENTS, 'a2': ASSIGNMENTS, 'a3': ASSIGNMENTS, 'tests/exam': TESTS}

# Folder name -> category id for the fall 2019 offering.
csc108_fall2019_categories = {'general': GENERAL, 'utm/life/other': GENERAL, 'lecture': LECTURES, 'lab': ASSIGNMENTS, 'tests/exam': TESTS}

# Folder names with no category mapping above.
# NOTE(review): not referenced by any of the visible code — confirm whether
# posts in these folders are meant to be filtered out before augment_data.
ignore_categories = {'spatial', 'pcrs', 'pcrs/spatial_skills'}


#augmented_df = augment_data(data, csc108_fall2021_categories, add_nlp_features=True, sentiment_model=sentiment_pipeline)
#augmented_data[0].shape






### Split into train, val and test sets

### Methods and Sources of Noise in Data

<table>
  <tr>
    <th>Feature</th>
    <th>Extraction Method</th>
    <th>Noise</th>
    <th>in literature?</th>
    <th>justification of choice</th>
  </tr>
    <tr>
    <td>post id</td>
    <td>0.52</td>
    <td>0.13</td>
    <td>0.13</td>
    <td>0.13</td>
  </tr>
    <tr>
    <td>is_private</td>
    <td>0.52</td>
    <td>0.13</td>
    <td>0.13</td>
    <td>0.13</td>
  </tr>
  <tr>
    <td>category</td>
    <td>0.52</td>
    <td>0.13</td>
    <td>0.13</td>
    <td>0.13</td>
  </tr>
  <tr>
    <td>is_question_references</td>
    <td>0.95</td>
    <td>0.78</td>
    <td>0.13</td>
    <td>0.13</td>
  </tr>
 
</table>






#### Question length

##### Method
- strip html tags and take token length
##### Noise

- Images are not factored into the question length, so a question with images looks shorter than it really is.
- Question length of 0 or 1:
  - The question is contained in the subject. Concatenate subject + question?
  - The question body is only an image/screenshot and the actual question is in the subject.
- Questions can include code snippets, which make the question very long.


#### Is question references
- Checks if the question contains a reference to another post (e.g. @256) or a hyperlink.

## References

[1] https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python