In [None]:
import re
import gensim
import pandas as pd
import numpy as np
import nltk
# Corpora needed by this notebook. nltk.download is a no-op when the resource
# is already present locally.
nltk.download('stopwords')
nltk.download('wordnet')
# sent_tokenize (imported below) relies on the Punkt sentence models; without
# them a fresh environment fails with LookupError. Newer NLTK releases load the
# 'punkt_tab' resource instead of 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
from gensim.utils import simple_preprocess
from collections import Counter, defaultdict
from transformers import pipeline

In [None]:
import os
# NOTE: the path is relative, so this cell is not idempotent — running it a
# second time moves the working directory again (or fails). A later cell
# undoes it with os.chdir("../..").
os.chdir("../src/backend")
# Imported under an alias so it does not clash with transformers' `pipeline`.
from question_generation.pipelines import pipeline as qg_pipeline

In [None]:
# Build the question-generation pipeline; `qg` is later called on one sentence
# at a time. NOTE(review): use_cuda=True presumably requests GPU execution —
# confirm how the pipeline behaves on a CPU-only machine.
qg = qg_pipeline("question-generation", use_cuda=True)

In [None]:
def get_most_coherent_qa(qa):
    """Pick the most fluent question/answer pair from a list of candidates.

    Every candidate is scored by calling ``score`` on the text
    ``"<question> <answer>"``. ``score`` returns a perplexity (``exp`` of the
    language-model loss), so a LOWER value means more fluent text and the
    candidate with the smallest score is selected.

    Args:
        qa: Sequence of dicts that each have "question" and "answer" keys.

    Returns:
        The selected dict; the only element when exactly one candidate is
        given (no scoring is done); ``None`` when ``qa`` is empty.
    """
    if len(qa) == 0:
        # Nothing was generated for this sentence.
        return None
    if len(qa) == 1:
        return qa[0]
    scores = [score(pair["question"] + " " + pair["answer"]) for pair in qa]
    # argmin, not argmax: perplexity is an "inverse" quality measure.
    return qa[int(np.argmin(scores))]

In [None]:
import math
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
# Load the pre-trained OpenAI GPT language-model head (weights); `score`
# below calls this object to obtain a language-model loss.
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
# Switch to evaluation mode: the model is only used for scoring, not training.
model.eval()
# Load the matching pre-trained tokenizer (vocabulary) used to turn text into ids.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
def score(sentence):
    """Return the language-model perplexity of ``sentence`` (lower = more fluent).

    The text is tokenized with the module-level ``tokenizer``, fed to the
    module-level ``model`` with the input ids also used as labels, and the
    returned loss is exponentiated.

    Args:
        sentence: Text to score.

    Returns:
        ``exp(loss)`` as a Python float, or ``float("inf")`` when the text
        produces no tokens (so it can never win a lowest-perplexity comparison).
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    if not token_ids:
        # An empty id list cannot be fed to the model.
        return float("inf")
    tensor_input = torch.tensor([token_ids])
    # Pure inference: skip building the autograd graph to save time and memory.
    with torch.no_grad():
        loss = model(tensor_input, lm_labels=tensor_input)
    return math.exp(float(loss))

In [None]:
# Step back up two directories, undoing the earlier relative chdir into
# src/backend. NOTE: assumes that earlier cell ran exactly once — re-running
# this cell moves the working directory again.
os.chdir("../..")
# Read the whole file as one string, without leading/trailing whitespace.
# Presumably a speaker-labelled class transcript (it is passed to
# get_teacher_text later) — confirm against the data file.
with open("data/econ3_8.txt") as f:
    doc1 = f.read().strip()

In [None]:
# Read a second file the same way. NOTE(review): doc2 is not referenced by any
# later cell visible in this notebook.
with open("data/econ3_7.txt") as f:
    doc2 = f.read().strip()

In [None]:
# TODO filter meaningless sentences
def get_teacher_text(doc):
    """Extract the text spoken by the teacher from a speaker-labelled transcript.

    A speaker label is a run of capitalised words followed by a colon
    (e.g. ``"Jane Doe:"``). The label that occurs most often is taken to be
    the teacher, and every text segment that follows one of that label's
    occurrences is concatenated (in document order, no separator added).

    Args:
        doc: Full transcript text.

    Returns:
        The concatenated teacher segments, or ``''`` when ``doc`` contains no
        speaker labels at all.
    """
    speaker_label = re.compile(r"(?:(?:[A-Z][a-z]*\s)*(?:[A-Z][a-z]*)):")
    matches = speaker_label.findall(doc)
    if not matches:
        # No labels -> no identifiable teacher.
        return ''
    # The pattern has no capturing groups, so split() returns only the text
    # between labels; the first piece is whatever precedes the first label.
    splits = speaker_label.split(doc)[1:]
    teacher = Counter(matches).most_common(1)[0][0]
    return ''.join(text for label, text in zip(matches, splits) if label == teacher)

def clean(doc):
    """Collapse each whitespace run to one space, then strip apostrophes and commas."""
    collapsed = re.sub(r"\s+", " ", doc)
    return re.sub(r"[',]", "", collapsed)

def get_sents(doc):
    """Split ``doc`` into sentences with ``sent_tokenize`` and return them as a list."""
    return list(sent_tokenize(doc))
def get_filtered_sents(doc, min_whitespace=10):
    """Split ``doc`` into sentences and keep only the longer ones.

    Sentence length is measured by the number of whitespace characters in the
    sentence, which approximates the word count.

    Args:
        doc: Text to split with ``sent_tokenize``.
        min_whitespace: A sentence is kept only when it contains strictly MORE
            than this many whitespace characters. Defaults to 10, the
            threshold this function has always used.

    Returns:
        List of the sentences that pass the length filter, in document order.
    """
    return [
        sent
        for sent in sent_tokenize(doc)
        if len(re.findall(r"\s", sent)) > min_whitespace
    ]

In [None]:
# Extract and normalise the teacher's speech once; both sentence lists are
# derived from the same cleaned text instead of recomputing it.
teacher_text = clean(get_teacher_text(doc1))
# Every sentence of the teacher's text.
sents = get_sents(teacher_text)
# Only the sentences long enough to pass get_filtered_sents' length filter.
filtered_sents = get_filtered_sents(teacher_text)

In [None]:
# English stop-word list from the NLTK stopwords corpus; read as a global by
# remove_stopwords below.
stop_words = stopwords.words('english')
# Optional corpus-specific additions (currently disabled):
#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
def sent_to_words(sentences):
    """Lazily yield one token list per input sentence.

    Each sentence is coerced to ``str`` and tokenized with gensim's
    ``simple_preprocess``. ``deacc=True`` asks gensim to strip accent marks;
    punctuation is discarded by the tokenization itself.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, so both raw strings and token
    lists are accepted (a token list is tokenized again from its string form).

    Args:
        texts: Iterable of documents (strings or token lists).

    Returns:
        One list of tokens per document, with every token found in the
        module-level ``stop_words`` removed.
    """
    # Snapshot the global list as a set: O(1) membership tests instead of a
    # linear scan of the stop-word list for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

# Tokenise every kept sentence, then drop stop words.
data_words = remove_stopwords(list(sent_to_words(filtered_sents)))

# Map every distinct token to an integer id.
id2word = corpora.Dictionary(data_words)
# Create Corpus (alias kept so both names remain available to later cells)
texts = data_words
# Term Document Frequency: one bag-of-words vector per sentence
corpus = [id2word.doc2bow(text) for text in texts]

num_topics = 5

# LDA training is stochastic; a fixed seed keeps topic assignments comparable
# between runs. With several worker processes some run-to-run variation can
# remain because of scheduling order.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)
lda_model

In [None]:
# Infer the topic distribution of every sentence exactly once, so the label
# and its confidence always come from the same inference pass.
doc_topics = [lda_model.get_document_topics(bow) for bow in corpus]
# get_document_topics returns (topic_id, probability) pairs and may omit
# low-probability topics, so the position of the best pair is NOT its topic id.
# Take the pair with the highest probability and read the id from the pair.
best_topics = [max(topics, key=lambda pair: pair[1]) for topics in doc_topics]
labels = [int(topic_id) for topic_id, _ in best_topics]
confidences = [float(prob) for _, prob in best_topics]

# Concatenate the sentences of each topic into one space-separated string.
clusters = defaultdict(str)
for label, sentence in zip(labels, filtered_sents):
    clusters[label] += sentence + " "
for topic in clusters:
    # Same normalisation as the rest of the notebook: collapse whitespace,
    # then drop apostrophes and commas.
    clusters[topic] = clean(clusters[topic])
clusters

In [None]:
# One row per kept sentence: its topic label, the label's confidence, the text.
df = pd.DataFrame().assign(
    topic=labels,
    confidence=confidences,
    sentences=filtered_sents,
)


def _five_most_confident(grp):
    """Return the rows of ``df`` holding the 5 highest confidences in ``grp``."""
    keep = grp['confidence'].nlargest(5).index
    return df.loc[keep]


top_sentences = df.groupby("topic").apply(_five_most_confident)
# Generate candidate Q/A pairs for each selected sentence and keep the one
# chosen by get_most_coherent_qa.
best_qa = top_sentences["sentences"].apply(lambda sent: get_most_coherent_qa(qg(sent)))
top_sentences = top_sentences.assign(questions=best_qa)
# Discard the group level, then renumber the rows from zero.
top_sentences.index = top_sentences.index.droplevel(0)
top_sentences = top_sentences.reset_index(drop=True)
top_sentences.head()

In [None]:
# Display the full table of selected sentences with their chosen question/answer pair.
top_sentences

In [None]:
# For every topic id, collect the selected question/answer pairs under a
# "questions" key (an empty list when the topic has no rows).
questions = {
    topic_id: {
        "questions": top_sentences.loc[
            top_sentences["topic"] == topic_id, "questions"
        ].to_list()
    }
    for topic_id in range(num_topics)
}
questions

In [None]:
# NOTE(review): wildcard import in the middle of the notebook — any public name
# exported by src.backend.nlp replaces a same-named definition from the cells
# above. Which names are pulled in cannot be seen from here; prefer explicit
# imports once the needed names are known.
from src.backend.nlp import *

In [None]:
# Summarisation pipeline; no model name is given, so the library's default for
# the "summarization" task is used (presumably fetched on first use — confirm).
summarizer = pipeline("summarization")

In [None]:
# For each topic keep the 10 sentences with the highest confidence, then
# renumber the rows (reset_index(drop=True) discards every index level, so no
# separate droplevel step is needed).
summary_sentences = (
    df.groupby("topic")
    .apply(lambda grp: df.loc[grp['confidence'].nlargest(10).index])
    .reset_index(drop=True)
)
# Build one text per topic. Join with a space: summing the strings would glue
# sentences together ("...end.Next sentence...") with no separator.
to_summarize = summary_sentences.groupby("topic")["sentences"].agg(" ".join)

In [None]:
def _summarize_topic(text):
    """Summarize one topic's text and join the returned summary pieces with spaces."""
    pieces = [result["summary_text"] for result in summarizer(text)]
    return ' '.join(pieces)


# One summary string per topic.
summaries = to_summarize.apply(_summarize_topic)
summaries

In [None]:
# Non-null value count per column for each topic, i.e. how many sentences were
# assigned to each topic.
df.groupby("topic").count()