# In [None]:
'''
BELOW ARE ALL THE FUNCTIONS OR FEATURES IMPLEMENTED
'''

import nltk, re, language_tool_python
from pycorenlp import StanfordCoreNLP
import fileinput, statistics
from nltk import Tree
import csv

'''
Simple code below to open and read the files
'''
# Input locations, named once so each file is opened a single time below.
REPORT_PATH = "/home/praise_1906/Downloads/2019-calculus-RR03-0499.txt"
RUBRIC_PATH = "/home/praise_1906/Downloads/pendulum_rubric.txt"

# Each file is read inside a context manager so its handle is always closed.
# The full text is rebuilt from the lines (readlines() keeps the line endings),
# which gives the same string as a separate read() would.
with open(REPORT_PATH) as report_file:
    report_lines = report_file.readlines()
report = "".join(report_lines)

with open(RUBRIC_PATH) as rubric_file:
    rubric_lines = rubric_file.readlines()
rubric = "".join(rubric_lines)

# pycorenlp client used by syntax_ft to request parse trees.
# NOTE(review): presumably a CoreNLP server must already be listening on this
# address -- confirm before running.
nlp = StanfordCoreNLP('http://localhost:9000')
    
'''
This function returns the length of the report
'''
def length_report(input_report):
    '''
    Return the number of tokens in the report.

    input_report: the report text as a single string.
    The text is split with nltk.word_tokenize and the token count is returned.
    '''
    return len(nltk.word_tokenize(input_report))

'''
This function counts the number of formulas in the report
'''
def count_formulas(input_report):
    '''
    Count the lines of the report that contain exactly two "#" characters.

    input_report: an iterable of strings, one per line of the report
                  (for example the result of readlines()).
    Returns the number of matching lines as an int (0 for empty input).

    NOTE(review): presumably a formula is written as #...# on a line of its
    own -- confirm the marker convention against the report format.
    '''
    # The former startswith('') / endswith('') guard was true for every string,
    # so the "#" count is the only condition that ever decided the result.
    return sum(1 for line in input_report if line.count("#") == 2)

'''
This function counts the number of grammatical and other errors in the report
'''
def count_errors(input_report):
    '''
    Return the total number of LanguageTool matches found in the report.

    input_report: the report text as a single string.
    The text is split into sentences with nltk.sent_tokenize, each sentence is
    checked with an 'en-US' LanguageTool instance, and the match counts are
    summed.
    '''
    checker = language_tool_python.LanguageTool('en-US')
    sentences = nltk.sent_tokenize(input_report)
    # For a quick manual check, replace `sentences` with a small list such as:
    # ['A sentence with a error in the Hitchhiker’s Guide tot he Galaxy', 'This is a perfectly normal sentence']
    return sum(len(checker.check(sentence)) for sentence in sentences)

'''
This function compares the report and the rubric and finds overlapping words between them
and returns the total number of overlapping words

To achieve an even higher accuracy: use the word2vec to convert the report to vectors and the rubric to vectors and then
take the average of all the words in each sentence and calculate the cosine between the
resulting embeddings
'''
def task_sim_ft(input_report, rubric):
    '''
    Return how many distinct tokens the report and the rubric have in common.

    input_report: the report text as a single string.
    rubric: the rubric text as a single string.
    Both texts are tokenized with nltk.word_tokenize; the result is the size
    of the intersection of the two token sets.
    '''
    report_vocab = set(nltk.word_tokenize(input_report))
    rubric_vocab = set(nltk.word_tokenize(rubric))
    return len(report_vocab.intersection(rubric_vocab))

'''
This function checks for grammar or spelling errors using the language tool (2.7 - Error Features)
and prints the errors with a possible solution/replacement
'''
def error_ft(input_report):
    '''
    Print the LanguageTool matches for every sentence that has at least one.

    input_report: the report text as a single string.
    The text is split into sentences with nltk.sent_tokenize and each sentence
    is checked with an 'en-US' LanguageTool instance. Nothing is returned; the
    list of matches is printed for each sentence where it is not empty.
    '''
    checker = language_tool_python.LanguageTool('en-US')
    # For a quick manual check, iterate over a small list instead, such as:
    # ['A sentence with a error in the Hitchhiker’s Guide tot he Galaxy', 'This is a perfectly normal sentence']
    for sentence in nltk.sent_tokenize(input_report):
        found = checker.check(sentence)
        if not found:
            continue
        print(found)

'''
This function sends each sentence of the report to Stanford CoreNLP using pycorenlp and nlp.annotate.
Only the 'parse' annotator is requested, which converts the sentences to parse trees or syntactic trees.
We focus on the syntactic features here which are explained in the function
'''
def syntax_ft(input_report):
    '''
    Print syntactic features of the report and return the number of 'ROOT'
    tokens found in the parse output.

    input_report: an iterable of strings, one per line of the report.

    Printed, in this order:
      1. the list of word counts of all sentences longer than 10 tokens
      2. the variance of that list (0 when it has fewer than two entries)
      3. the mean of that list (0 when it is empty)
      4. the mean SBAR count over the parse chunks that contain an SBAR
         (0 when there is none)
      5. the list of newline counts, one per parse chunk
      6. "Height of the parse tree is:" followed by the largest newline
         count plus one (0 when there are no chunks)
      7. the sum of the newline counts
      8. the total number of commas, modal verbs ('MD') and prepositions
         ('IN'), one per line

    Returns the number of 'ROOT' tokens as an int.
    Uses the module-level `nlp` client for the parse trees.
    '''
    # Raw string: '\.' inside a normal string literal is an invalid escape
    # sequence. Compiled once here instead of once per line.
    sentence_break = re.compile(r'(?<!Dr)(?<!Esq)\. +(?=[A-Z])')

    depth = 0
    num_commas, num_prep, num_modal = 0, 0, 0
    mean_var_lst = []
    parses = []

    for line in input_report:
        # Put each sentence on its own line when a full stop is followed by
        # spaces and a capital letter (but not after "Dr" or "Esq").
        list_string = sentence_break.sub('.\n', line)

        for token in nltk.sent_tokenize(list_string):
            words = nltk.word_tokenize(token)

            # Running totals of modal verbs, prepositions and commas.
            for _, tag in nltk.pos_tag(words):
                if tag == 'MD':
                    num_modal += 1
                elif tag == 'IN':
                    num_prep += 1
            num_commas += words.count(',')

            # "Statistics of sentence length" feature (Chen and He, 2013).
            # The former test `> 10 or > 18 or > 25` is exactly `> 10`.
            # NOTE(review): if three separate length buckets were intended,
            # they need separate lists -- confirm against the paper.
            if len(words) > 10:
                mean_var_lst.append(len(words))

            # Skip bare newlines; everything else is sent to CoreNLP.
            if token == "\n":
                continue
            output = nlp.annotate(token, properties={
                'annotators': 'parse',
                'outputFormat': 'json'
            })
            parses.append(output['sentences'][0]['parse'])

    # Joined once at the end rather than grown by repeated string concatenation.
    parsed_str = "".join(parses)

    # Sentence-length statistics. statistics.variance needs at least two
    # values and statistics.mean at least one, so short lists fall back to 0
    # instead of raising StatisticsError.
    print(mean_var_lst)
    print(statistics.variance(mean_var_lst) if len(mean_var_lst) > 1 else 0)
    print(statistics.mean(mean_var_lst) if mean_var_lst else 0)

    sbar_lst, node_depth = [], []

    # NOTE(review): the chunks come from running the sentence tokenizer over
    # the concatenated parse output, so a chunk is not guaranteed to be one
    # parse tree -- confirm this is the intended unit.
    for sent in nltk.sent_tokenize(parsed_str):
        node_depth.append(sent.count('\n'))
        sent_words = nltk.word_tokenize(sent)

        # One entry per chunk that contains sub-clauses. Appending inside the
        # token loop added the same count once per SBAR token, which skewed
        # the mean towards chunks with many sub-clauses.
        sbar_count = sent_words.count('SBAR')
        if sbar_count:
            sbar_lst.append(sbar_count)

        # "Sentence level" feature: count the tree roots.
        depth += sent_words.count('ROOT')

    # Mean number of sub-clauses over the chunks that have any.
    print(statistics.mean(sbar_lst) if sbar_lst else 0)

    # Newline counts per chunk, used here as the node-depth figure.
    tree_height = max(node_depth) + 1 if node_depth else 0
    print(node_depth)
    print("Height of the parse tree is:", tree_height)
    print(sum(node_depth))

    # Totals over the whole report.
    print(num_commas)
    print(num_modal)
    print(num_prep)

    return depth

'''
This function calls or gets all the above defined functions and is to be used in the SVM
'''
def get_feature(input_report, input_report_lines, input_rubric):
    '''
    Run every feature function on one report and return the results.

    input_report: the report text as a single string.
    input_report_lines: the same report as an iterable of lines.
    input_rubric: the rubric text as a single string.

    Returns a dict keyed by the name of the function that produced each value.
    The values used to be computed and thrown away, so the function always
    returned None and gave a caller nothing to work with.
    error_ft and syntax_ft also print their findings as a side effect.
    '''
    features = {
        'length_report': length_report(input_report),
        'count_formulas': count_formulas(input_report_lines),
        'count_errors': count_errors(input_report),
        'task_sim_ft': task_sim_ft(input_report, input_rubric),
    }
    # error_ft only prints and has no return value, so it is called for its
    # output and not stored. It stays in its original position in the sequence.
    error_ft(input_report)
    features['syntax_ft'] = syntax_ft(input_report_lines)
    return features

'''Below you can test the functions individually'''

#length_report(report)
#count_formulas(report_lines)
#count_errors(report)
#task_sim_ft(report, rubric)
#error_ft(report)
#syntax_ft(report_lines)