In [1]:
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
xmltree = ET.parse("Posts.xml.txt")
root = xmltree.getroot()

# Data Cleaning

In [3]:
def getTopicTags(taglist):
    regex = re.compile(r"(<[^<>]+>)")
    list = regex.findall(taglist)
    output = [topic[1:-1] for topic in list]
    return output

In [4]:
def removeParagraphTags(body):
    body.strip()
    sentences = ""
    splitter = re.compile(r'</p>')
    paragraphs = re.split(splitter, body)
    for p in range(len(paragraphs)):
        if(len(paragraphs[p].strip()) != 0):
            paragraphs[p] += '</p>'
        else:
            del paragraphs[p]
    regex = re.compile(r'<p>(.+)</p>')
    for p in paragraphs:
        if(regex.search(p) is not None):
            output = regex.search(p)
            # print(output.group(1))
            sentences += output.group(1).strip()+" "
    # print(sentences)
    return sentences

In [5]:
def processAnchorTags(para):
    #finding and removing links
    splitter = re.compile(r'</a>')
    lines = re.split(splitter, para)
    line_count = len(lines)
    for i in range(line_count-1):
        lines[i] +='</a>'
    regex = re.compile(r'(<a href=")([^"]+)(".*>)(.*)(</a>)')
    processed_array = []
    processed_para = ''
    
    for line in lines:
        if(regex.search(line) is not None):
            output = regex.search(line)
            repl = output.group(4)
            repl = "/".join(repl.split('\\'))
            # print(repl)
            line = re.sub(regex, repl, line)
            # print("line:\t", line)
            processed_array.append(output.group(2))
        processed_para += line
    res_dict = {}
    res_dict['para'] = processed_para
    res_dict['urls'] = processed_array
    return res_dict

In [6]:
def processMathFormulae(paragraph):
    # print(paragraph)
    splitter = re.compile(r'</span>')
    lines = re.split(splitter, paragraph)
    line_count = len(lines)
    for i in range(line_count-1):
        lines[i] +='</span>'
    res = {}
    res['para'] = ''
    res['exp'] = []
    regex = re.compile(r'(<span class="math-container".*>)(.+)(</span>)')
    for line in lines:
        if regex.search(line) is not None:
            output = regex.search(line)
            line = re.sub(regex,'<math_exp>', line)
            res['exp'].append(output.group(2))
            
        res['para'] += line
    # print(res['exp'])
    return res

In [7]:
def removeAnyOtherTags(body):
    # to be used at the end of cleaning
    tag = re.compile(r'<.?\b(?!math_exp\b)\w+>')
    lines = re.split(tag, body)
    res = "".join(lines)
    return res

In [8]:
no_of_posts_loaded = 1500
nodeListReduced = {}
nodeList = []
for r in range(min(no_of_posts_loaded, len(root))):
    nodeList.append(root[r].attrib)
    
    
for node in nodeList:
    post = {}
    post['Id'] = node['Id']
    
    if (node['PostTypeId'] == '1'): #if question
        post['Type'] = 'question'
        post['Title'] = node['Title']
        post['Tags'] = getTopicTags(node['Tags'])
        if 'AcceptedAnswerId' in node:
            post['AcceptedAnswerId'] = node['AcceptedAnswerId']
            
    if (node['PostTypeId'] == '2'): #if answer
        post['Type'] = 'answer'
        post['ParentId'] = node['ParentId']
        
    body = node['Body']
    body = removeParagraphTags(body)
    
    anchor_data = processAnchorTags(body)
    post['urls'] = anchor_data['urls']
    body = anchor_data['para']
    
    math_data = processMathFormulae(body)
    post['exp'] = math_data['exp']
    body = math_data['para']
    
    body = removeAnyOtherTags(body)
    
    post['Body'] = body
    nodeListReduced[node['Id']] = post

for postId in nodeListReduced:
    print(postId, nodeListReduced[postId])

1 {'Id': '1', 'Type': 'question', 'Title': 'What Does it Really Mean to Have Different Kinds of Infinities?', 'Tags': ['elementary-set-theory', 'intuition', 'infinity', 'faq'], 'AcceptedAnswerId': '9', 'urls': ['http://en.wikipedia.org/wiki/The_Man_Who_Loved_Only_Numbers', 'http://en.wikipedia.org/wiki/Paul_Hoffman_(science_writer)'], 'exp': [], 'Body': 'Can someone explain to me how there can be different kinds of infinities? I was reading "The man who loved only numbers" by Paul Hoffman and came across the concept of countable and uncountable infinities, but they\'re only words to me. Any help would be appreciated. '}
3 {'Id': '3', 'Type': 'question', 'Title': 'List of interesting math podcasts?', 'Tags': ['soft-question', 'big-list', 'online-resources'], 'urls': ['http://mathfactor.uark.edu/'], 'exp': [], 'Body': 'mathfactor is one I listen to.  Does anyone else have a recommendation? '}
4 {'Id': '4', 'Type': 'answer', 'ParentId': '3', 'urls': ['http://www.bbc.co.uk/podcasts/series/

# Preprocessing

In [9]:
def convert_lower_case(data):
    return np.char.lower(data)

In [10]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [11]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [12]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [13]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [14]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [15]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

# Text Matching Score


In [16]:
## CREATING DATA FOR SEARCHING     SUBJECT TO CHANGE

q_data = []
a_data = []

# for node in list(nodeListReduced.values()):
#     print(node, type(node))

for node in list(nodeListReduced.values()):
    if(node['Type'] == 'question'):
        data = {}
        data['id'] = node['Id']
        para = node['Title']+" "+node['Body']+" "+". ".join(node['Tags'])
        data['para'] = para
        data['exp'] = node['exp']
        q_data.append(data)
    if(node['Type'] == 'answer'):
        data = {}
        data['id'] = node['Id']
        data['para'] = node['Body']
        data['exp'] = node['exp']
        a_data.append(data)

# for data in q_data:
#     print(data)
# for data in a_data:
#     print(data)

In [17]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

paragraphs = [data['para'] for data in q_data]+[data['para'] for data in a_data]

# paragraphs = [
#     "Three years later, the coffin was still full of Jello.",
#     "Three years later, the coffin was still full of Jello.",
#     "The person box was packed with jelly many dozens of months later.",
#     "The person box was packed with jelly many dozens of months later."
# ]

para_embeddings = model.encode(paragraphs)

idx = 0

for data in q_data:
    data['embedding'] = para_embeddings[idx]
    idx+=1
    
for data in a_data:
    data['embedding'] = para_embeddings[idx]
    idx+=1

table_size = 20
    
for question in q_data:
    score_table = []
    for answer in a_data:
        score = {}
        score['id'] = answer['id']
        value = cosine_similarity([question['embedding']], [answer['embedding']])
        score['value'] = value[0][0]
        score_table.append(score)
    score_table.sort(key=lambda i:i['id'])
    question['text_score'] = score_table # slice this table using table_size variable if needed

count = 0    
for data in q_data:
    answers = [ans['id'] for ans in data['text_score']]
    for ans_id in answers:
        parent_question = nodeListReduced[ans_id]['ParentId']
        if(data['id'] == parent_question):
            count+=1
            break

print("Total posts: ", no_of_posts_loaded)
print("Correctly answered: ", count)
print("Accuracy: ", (count*100)/no_of_posts_loaded,"%")
#     print(data['id'], "-->", data['text_score'])
    


Total posts:  1500
Correctly answered:  381
Accuracy:  25.4 %


# Math Regex Creation

In [17]:
def addEscapeSequence(math_exp):
    chars = ['.', '^', '$', '*', '+', '?','(', ')', '\\', '[', '{', '|', '-', ']']
    new_exp = ''
    for ch in math_exp:
        if ch in chars:
            new_exp += '\\' + ch
        else:
            new_exp += ch
    return new_exp

In [18]:
def replaceNumbers(math_exp):
    new_exp = ''
    regex = re.compile(r'(((\+|-)?([0-9]+)(\.[0-9]+)?)|((\+|-)?\.?[0-9]+))')
    new_exp = re.sub(regex, '(((\+|-)?([0-9]+)(\.[0-9]+)?)|((\+|-)?\.?[0-9]+))', math_exp)
    return new_exp

In [19]:
def replaceVariables(math_exp):
    new_exp = ''
    regex = re.compile(r'[a-zA-Z]+')
    new_exp = re.sub(regex, '[a-zA-Z]+', math_exp)
    return new_exp

In [20]:
def replaceNonTags(math_exp):
    return replaceVariables(replaceNumbers(addEscapeSequence(math_exp)))

In [21]:
def check_character_referecnes(math_exp): # checking for characters in the form of "&(something);" like "&amp;"
    new_exp = ''
    regex = re.compile(r'&.+?;')
    start = 0
    end = len(math_exp)
    while regex.search(math_exp, start, end) is not None:
        x = regex.search(math_exp, start, end)
        new_exp += replaceNonTags(math_exp[start:x.start()])+math_exp[x.start():x.end()]
        start = x.end()
    if len(math_exp[start:end])>0:
        new_exp += replaceNonTags(math_exp[start:end])
    return new_exp        

In [22]:
def processTagsAndVariables(math_exp):
    tag_set = set(['frac', 'tan', 'log_2', 'tanh', 'sqrt'])
    new_exp = ''
    breaks = [0]
    
    for idx in range(len(math_exp)):
        if(math_exp[idx] == '\\'):
            breaks.append(idx)
            
    breaks.append(len(math_exp))
    
    for idx in range(len(breaks)-1):
        start = breaks[idx]
        end = breaks[idx+1]
        sub_exp = math_exp[start:end]
        start = 0
        new_sub_exp = ''
        if(sub_exp == '\\'):
            new_exp += sub_exp
            continue
        elif(len(sub_exp) and sub_exp[0] == '\\'):
            new_sub_exp += '\\'
            start = 1
        found = False
        for pos in reversed(range(start, len(sub_exp))):
            if(sub_exp[start:pos+1] in tag_set):
                found = True
                new_sub_exp += sub_exp[:pos+1]+check_character_referecnes(sub_exp[pos+1:])
                break
        if(not found):
            new_sub_exp = check_character_referecnes(sub_exp)
        new_exp += new_sub_exp
    
    return new_exp

In [23]:
def create_all_regexes(input_regex, regexes):
    stack = []
    for i in range(len(input_regex)):
        if (input_regex[i:i+2] == '\\{'):
            stack.append(i+1)
        elif (input_regex[i] == '}'):
            new_regex = input_regex[0:stack[-1]+1]+'.+?'+input_regex[i:]
            if new_regex in regexes:
                return
            regexes.add(new_regex)
            create_all_regexes(new_regex, regexes)
            del stack[-1]

# Math Score Generation

In [24]:
for data in q_data[:25]:
    print(data)
    

{'id': '1', 'para': 'What Does it Really Mean to Have Different Kinds of Infinities? Can someone explain to me how there can be different kinds of infinities? I was reading "The man who loved only numbers" by Paul Hoffman and came across the concept of countable and uncountable infinities, but they\'re only words to me. Any help would be appreciated.  elementary-set-theory. intuition. infinity. faq', 'exp': []}
{'id': '3', 'para': 'List of interesting math podcasts? mathfactor is one I listen to.  Does anyone else have a recommendation?  soft-question. big-list. online-resources', 'exp': []}
{'id': '5', 'para': 'How can you prove that the square root of two is irrational? I have read a few proofs that <math_exp> is irrational. I have never, however, been able to really grasp what they were talking about. Is there a simplified proof that <math_exp> is irrational?  elementary-number-theory. proof-writing. radicals. rationality-testing', 'exp': ['\\sqrt{2}', '\\sqrt{2}']}
{'id': '6', 'par

In [25]:
def get_ans_exp_score(q_exp, a_exp):
    og = processTagsAndVariables(q_exp)
    all_regex = set()
    all_regex.add(og)
    create_all_regexes(og, all_regex)
    matches = 0
    for regex in all_regex:
        if re.search(regex, a_exp) is not None:
            matches += 1
    score = matches/len(all_regex)
    return score

math_exp = 'whole\\frac{\\tanh{45}}{\\log_2{denominator}}'
math_exp_2 = 'akkar\\frac{\\tanh{x}}{\\log_2{16}}'

print(get_ans_exp_score(math_exp, math_exp_2))

0.4444444444444444


In [30]:
def get_overall_answer_score(q_exp, a_exp_list):
    count = 0
    total = 0
    new_list = [a_exp for a_exp in a_exp_list if len(a_exp)>1]
    if len(new_list) == 0:
        return 0
    for a_exp in new_list:
        count += 1
        total += get_ans_exp_score(q_exp, a_exp)
    score = total/count
    return score

In [27]:
def get_math_matching_score(q_exp_list, a_exp_list):
    max_score = 0
    for q_exp in q_exp_list:
        if len(q_exp) == 1:
            continue
        max_score = max(max_score, get_overall_answer_score(q_exp, a_exp_list))
    return max_score

In [33]:
for question in q_data:
    score_table = []
    for answer in a_data:
        score = {}
        score['id'] = answer['id']
        value = get_math_matching_score(question['exp'], answer['exp'])
        score['value'] = value
        score_table.append(score)
    score_table.sort(key=lambda i: i['id'])
    question['math_score'] = score_table # slice this table using table_size variable if needed
    
    
# count = 0    
# for data in q_data:
#     answers = [ans['id'] for ans in data['math_score']]
#     for ans_id in answers:
#         parent_question = nodeListReduced[ans_id]['ParentId']
#         if(data['id'] == parent_question):
#             count+=1
#             break

# print("Total posts: ", no_of_posts_loaded)
# print("Correctly answered: ", count)
# print("Accuracy: ", (count*100)/no_of_posts_loaded,"%")
# #     print(data['id'], "-->", data['text_score'])

In [28]:
## Testing

## 16 {'Id': '16', 'Type': 'answer', 'ParentId': '5', 'urls': [], 
## 'exp': ['\\sqrt{2}', 'R=\\sqrt{2}=\\frac{Q}{D}', 'Q', 'D', 'R', 'R^2 = 2 = \\frac{Q^2}{D^2}', 
## 'Q', 'D', 'Q^2', '2', 'Q^2 = 2^1 x', 'x', 'Q^2', '\\sqrt{2}'],

query_exp = nodeListReduced['16']['exp'][0]
query_regex = processTagsAndVariables(query_exp)
# query_regex = re.compile(r'{}'.format(regex_str))
print(query_regex, type(query_regex))
regex_set = set()
regex_set.add(query_regex)
create_all_regexes(query_regex, regex_set)
for exp in nodeListReduced['16']['exp']:
    if (len(exp) == 1):
        continue
    match_count = 0
    for regex in regex_set:
        compiled_regex = re.compile(r'{}'.format(regex))
        if re.search(compiled_regex, exp) is not None:
            match_count+=1
    score = match_count/len(regex_set)
    print(exp,"\t\t\t-->",score)
    


\\sqrt\{(((\+|-)?([0-9]+)(\.[0-9]+)?)|((\+|-)?\.?[0-9]+))} <class 'str'>
\sqrt{2} 			--> 1.0
R=\sqrt{2}=\frac{Q}{D} 			--> 1.0
R^2 = 2 = \frac{Q^2}{D^2} 			--> 0.0
Q^2 			--> 0.0
Q^2 = 2^1 x 			--> 0.0
Q^2 			--> 0.0
\sqrt{2} 			--> 1.0
