In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import re
import json
import random

import numpy as np
import pandas as pd
import pickle
import json
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from pprint import pprint
import collections
from collections import OrderedDict, Counter

### Utils

In [None]:
import nltk 

# Fetch the NLTK resources needed by the stopword list and the WordNet lemmatizer below.
nltk.download('stopwords')
nltk.download('wordnet')

import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
# Shared lemmatizer instance (not referenced by the cells visible in this part of the notebook).
lmtzr = WordNetLemmatizer()


# English stopword list from the NLTK corpus.
STOPWORDS = stopwords.words('english')
# Punctuation characters that context_clean() strips when rm_punct=True.
PUNCTUATIONS = string.punctuation


import spacy
# spaCy pipeline loaded from the "en_core_web_sm" package
# (not referenced by the cells visible in this part of the notebook).
nlp = spacy.load("en_core_web_sm")

### Dependency for Extractive Summarization ###
# !pip install pytorch_transformers tensorboardX multiprocess pyrouge


In [5]:
from typing import List
def save_csv(trg_DF, save_path):
  """ Write a DataFrame to disk as a .csv file.

  params:
    trg_DF: object exposing a ``to_csv`` method (e.g. a pandas DataFrame);
    save_path: destination handed to ``to_csv`` unchanged.
  """
  write_csv = trg_DF.to_csv
  write_csv(save_path)

def get_paragraphs(txt_path:str = "/content/drive/Shareddrives/NLP4Education/htmlContent/ArgueTutor.txt") -> list:
  """ Read a text file and return its paragraphs.

  params:
    txt_path: path to a plain-text file holding one paragraph per line.
  returns:
    list of paragraphs in file order; each entry keeps its trailing newline
    because ``readlines()`` does not strip it.
  """
  with open(txt_path, 'r') as trg:
    paras = trg.readlines() # Paragraphs separated by \n
  # The lines were previously read but never returned, so callers always got None.
  return paras

def context_clean(text, rm_punct=False):
  """ Strip bracketed references and, optionally, punctuation.

  Every span enclosed in (), [] or <> is replaced, together with at most one
  adjacent space on each side, by a single space. When rm_punct is True, every
  character listed in PUNCTUATIONS is deleted afterwards.
  """
  bracket_patterns = (
      r'[ ]?\(.*?\)[ ]?',
      r'[ ]?\[.*?\][ ]?',
      r'[ ]?<.*?>[ ]?',
  )
  for bracket_pattern in bracket_patterns:
    text = re.sub(bracket_pattern, ' ', text)
  if not rm_punct:
    return text
  # One translation pass deletes the same characters as replacing them one by one.
  return text.translate(str.maketrans('', '', PUNCTUATIONS))

def get_sents(text:str) -> list:
  """
    Effect: flatten whitespace in a paragraph, then split it into sentences.
    Input: paragraph (plain text);
    Output: list of sentences, keeping only those longer than 3 characters.
  """
  # Newlines, tabs and carriage returns become spaces; runs of spaces collapse to one.
  flattened = re.sub('[\n\t\r]+', ' ', text.strip())
  squeezed = re.sub('[ ]+', ' ', flattened)

  kept = []
  for sentence in PunktSentenceTokenizer().tokenize(squeezed):
    if len(sentence) > 3:
      kept.append(sentence)
  return kept

def get_sents_split(para_list: list) -> list:
  """ Split every paragraph into sentences.

  params:
    para_list: list of paragraphs (plain text).
  returns:
    list with one sentence list per paragraph, in the same order.
  """
  return [get_sents(paragraph) for paragraph in para_list]

def make_presumm_input(
    para_list:list,
    input_write_into:str = None,
    trg_write_into:str = None,
    trg_summ_list:list = None,
    paragraph_split_sents:bool = False,
    return_input_text:bool = False ):
  """ Create input to BertSum extractive summarization model.

  Each paragraph becomes one line whose sentences are joined by ' [CLS] [SEP] '.

  params:
    para_list: list of paragraphs (plain text), or list of sentence lists when
      paragraph_split_sents is True;
    input_write_into: file path to save processed input;
    trg_write_into: file path to ground truth summary (used only for evaluation);
    trg_summ_list: ground truth summary, one string per paragraph (used only for evaluation);
    paragraph_split_sents: whether the paragraphs are already split into sentences;
    return_input_text: whether to return the processed input text (to check the results).
  returns:
    list of processed input lines (each ending in a newline) when
    return_input_text is True, otherwise None.
  """
  # Pre-split input is used as-is; previously this case left para_sents_list
  # undefined and the function failed with NameError.
  if paragraph_split_sents:
    para_sents_list = para_list
  else:
    para_sents_list = get_sents_split(para_list)

  # Build the lines once so the written files and the returned text always agree.
  input_lines = [' [CLS] [SEP] '.join(psg_sents)+'\n' for psg_sents in para_sents_list]

  if trg_summ_list is not None:
    # Evaluation mode: write inputs and reference summaries side by side.
    # The sentence-split lines are paired with the summaries; the raw paragraph
    # strings were used before, which joined their individual characters.
    with open(input_write_into, 'w') as input_trg, open(trg_write_into, 'w') as trg_trg:
      for input_line, summ_sents in tqdm(zip(input_lines, trg_summ_list)):
        input_trg.write(input_line)
        trg_trg.write(summ_sents+'\n')
  elif input_write_into is not None:
    with open(input_write_into, 'w') as input_trg:
      for input_line in tqdm(input_lines):
        input_trg.write(input_line)
  if return_input_text:
    return input_lines
    

In [3]:
# Load the ArgueTutor text; every line of the file becomes one entry of PARAS
# (trailing newlines are kept, since readlines() does not strip them).
with open("/home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/htmlContent/ArgueTutor.txt", 'r') as trg:
  PARAS = trg.readlines() # Paragraphs separated by \n

In [6]:
# Display the sentence split of the first paragraph as a quick check.
get_sents_split(PARAS)[0]

['Techniques from Natural-Language-Processing offer the opportunities to design new dialog-based forms of human-computer interaction as well as to analyze the argumentation quality of texts.',
 'This can be leveraged to provide students with adaptive tutoring when doing a persuasive writing exercise.',
 'To test if individual tutoring for students’ argumentation will help them to write more convincing texts, we developed ArgueTutor, a conversational agent that tutors students with adaptive argumentation feedback in their learning journey.',
 'We compared ArgueTutor with 55 students to a traditional writing tool.',
 'We found students using ArgueTutor wrote more convincing texts with a better quality of argumentation compared to the ones using the alternative approach.',
 'The measured level of enjoyment and ease of use provides promising results to use our tool in traditional learning settings.',
 'Our results indicate that dialog-based learning applications combined with NLP text feed

In [7]:
### Sanity Check ###
# Writes the formatted paragraphs to temp.txt and, because return_input_text=True,
# also keeps the same lines in PRESUM_INPUT for inspection.
PRESUM_INPUT = make_presumm_input(
        para_list=PARAS, 
        input_write_into='/home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/temp.txt',
        return_input_text=True )

  0%|          | 0/50 [00:00<?, ?it/s]

In [9]:
!python /home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/PreSumm/src/train.py -task ext -mode test_text -text_src /home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/temp.txt -test_batch_size 8 -log_file /home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/temp_logs.txt -test_from /home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/PreSumm/models/bertext_cnndm_transformer.pt -sep_optim true -use_interval true -visible_gpus 0 -max_pos 512 -max_length 500 -alpha 0.95 -min_length 20 -result_path /home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/temp_out.txt


/bin/python: can't find '__main__' module in '/'


In [13]:
def get_retrieve(output_path='/home/oliviaaa/NLP4ActiveReading/NLP4ActiveReading_Subtasks/SentRetrieval-20220817/temp_out.txt_step-1.candidate'):
  """ Parse a candidate file into lists of segments.

  params:
    output_path: path to a text file whose lines hold segments separated by '<q>'.
  returns:
    one list per line of the file, containing that line's '<q>'-separated
    segments with newline characters removed.
  """
  retrieved = []
  with open(output_path, 'r') as candidate_file:
    for raw_line in candidate_file:
      retrieved.append(raw_line.replace('\n', '').split('<q>'))
  return retrieved

In [14]:
# Parse the candidate file found at get_retrieve's default output_path.
demo_result = get_retrieve()

In [15]:
# len(demo_result) => 50
# First entry: the '<q>'-separated segments from the first line of the candidate file.
demo_result[0]

['We compared ArgueTutor with 55 students to a traditional writing tool.',
 'We found students using ArgueTutor wrote more convincing texts with a better quality of argumentation compared to the ones using the alternative approach.',
 'This can be leveraged to provide students with adaptive tutoring when doing a persuasive writing exercise.']