In [1]:
# import libraries, packages, modules

import string
import math
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import nltk
import syllapy as syl
from nltk.tokenize import word_tokenize
import joblib


In [2]:
# Define a function for file handling

def process_file(file):
    """Read a text file and return its contents as one string.

    Each line is stripped of leading/trailing whitespace (including the line
    ending) and appended to the result preceded by a single space, so the
    returned string starts with a space and blank lines contribute one space.

    Args:
        file: path of the .txt file to read.

    Returns:
        The concatenated text as a str.
    """
    # errors='ignore' silently drops bytes that cannot be decoded; the
    # context manager guarantees the handle is closed even if reading fails.
    with open(file, "r", errors='ignore') as input_fd:
        return "".join(" " + line_str.strip() for line_str in input_fd)

In [3]:
# Defining functions to get features from the string object
# Note: nltk tokenizer counts sentence-boundaries as tokens. The following functions do not.

def get_tokens(text):
    """Split a string into lower-case word tokens.

    The characters . ! ? , ; : are deleted first, then the text is lower-cased
    and split on whitespace.  Apostrophes are kept, so contractions such as
    "it's" stay a single token.
    """
    punctuation_table = str.maketrans(dict.fromkeys(".!?,;:"))
    return text.translate(punctuation_table).lower().split()

def get_types(tokens):
    """Takes a list of tokens as an argument, returns the set of unique tokens (the 'types')."""
    return set(tokens)

def count_tokens(text):
    """Return how many word tokens get_tokens finds in the given string."""
    return len(get_tokens(text))

# functions for counting characters and syllables

def count_char(tokens):
    """Return the combined number of characters across all words in the given list."""
    return sum(len(word) for word in tokens)

def avg_token_len(text):
    """Return the mean token length, in characters, of the given string.

    Raises ZeroDivisionError when the string contains no tokens.
    """
    words = get_tokens(text)
    return count_char(words) / len(words)

def get_syllables(text):
    """Return the total syllable count of a string.

    The string is tokenized with get_tokens and the per-token counts reported
    by syllapy (syl.count) are added up.
    """
    return sum(syl.count(word) for word in get_tokens(text))

In [4]:

def break_sentences(text):
    """Split a string into sentences with nltk.sent_tokenize and return them as a list."""
    return nltk.sent_tokenize(text)

def sentence_count(text):
    """Return how many sentences break_sentences finds in the given string."""
    return len(break_sentences(text))

def avg_sent_length(text):
    """Return the average sentence length of a string, measured in words per sentence.

    Raises ZeroDivisionError when no sentence is found in the string.
    """
    n_sentences = sentence_count(text)
    n_words = count_tokens(text)
    return n_words / n_sentences

In [5]:
# define functions to tag POS with nltk
# and to get features related to POS categories

def get_pos_dict(text):
    """Tally how often each part-of-speech tag occurs in a text.

    The text is split into sentences; each sentence is tokenized with
    get_tokens (lower-cased, basic punctuation removed) and tagged with
    nltk.pos_tag.  Returns a dictionary mapping tag -> number of occurrences.
    Only tags that actually occur appear as keys.
    """
    tag_counts = {}
    for sentence in break_sentences(text):
        for _word, pos_tag in nltk.pos_tag(get_tokens(sentence)):
            tag_counts[pos_tag] = tag_counts.get(pos_tag, 0) + 1
    return tag_counts

def pos_total_cats(my_dict):
    """Return the number of entries in a POS dictionary.

    For a dictionary produced by get_pos_dict this is the number of distinct
    POS categories present in the text.
    """
    return len(my_dict)

def tag_in_text(tag, my_dict):
    """Look up one POS tag in a tag-count dictionary.

    Takes the tag and the POS dictionary extracted from a text.
    Returns the stored count for the tag, or 0 when the tag is not a key.
    """
    return my_dict.get(tag, 0)

def get_full_dict(my_dict):
    """Expand a sparse POS-count dictionary into a 'full' one.

    The result has one entry for each of the 36 Penn Treebank tags listed
    below, in that order; tags missing from my_dict get a count of 0 and any
    other keys of my_dict are left out.

    * Important! *
    This function relies on the tag_in_text function.
    """
    penn_tags = (
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS',
        'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP',
        'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
    )
    return {tag: tag_in_text(tag, my_dict) for tag in penn_tags}

def get_cat_dicts(full_dict):
    """Split the 'full' POS dictionary into broad-category sub-dictionaries.

    Returns a 6-tuple (jj_dict, nn_dict, prp_dict, rb_dict, vb_dict, cpx_dict):
    the first five hold every entry whose tag contains "JJ", "NN", "PRP",
    "RB" or "VB" respectively; cpx_dict holds the counts of the tags
    CC, IN, MD, TO, WDT, WP, WP$ and WRB (0 when a tag is absent).
    """
    def entries_containing(fragment):
        # Substring match on the tag name, preserving full_dict's order.
        return {tag: count for tag, count in full_dict.items() if fragment in tag}

    jj_dict = entries_containing("JJ")
    nn_dict = entries_containing("NN")
    prp_dict = entries_containing("PRP")
    # NOTE(review): because this is a substring test, "RB" also matches the
    # wh-adverb tag "WRB", so WRB is counted in rb_dict as well as cpx_dict.
    # Confirm this is intended.
    rb_dict = entries_containing("RB")
    vb_dict = entries_containing("VB")

    cpx_tags = ('CC', 'IN', 'MD', 'TO', 'WDT', 'WP', 'WP$', 'WRB')
    cpx_dict = {tag: tag_in_text(tag, full_dict) for tag in cpx_tags}

    return jj_dict, nn_dict, prp_dict, rb_dict, vb_dict, cpx_dict

def get_dict_value_ct(my_dict):
    """Return how many entries of the dictionary have a value different from 0."""
    return sum(1 for value in my_dict.values() if value != 0)
    
def get_dict_value_sum(my_dict):
    """Return the sum of all values in the dictionary (0 for an empty dictionary).

    Uses the builtin sum() instead of a local accumulator named ``sum``,
    which shadowed the builtin inside this function.
    """
    return sum(my_dict.values())

In [6]:
""" This cell reads three csv files: the academic word list, the general
service list and the Longman 3000.
Each file is loaded into its own dictionary for feature extraction. """

import csv


def _load_word_list(path):
    """Return a dict built from the first two columns of the csv file at `path`.

    Column 0 of each row becomes the key and column 1 the value; values are
    kept as the strings read from the file.  If a key occurs more than once
    the last row wins.  The file is closed by the `with` block, so no
    explicit close() is needed.
    """
    mapping = {}
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            mapping[row[0]] = row[1]
    return mapping


# Academic word list.
# presumably maps each word form to its head-word (see acad_in_list) -- verify against the csv
my_acad_dict = _load_word_list('eap_asl.csv')

# General service list.
# Each word has an integer frequency value. Higher values indicate
# more frequent (common) words.
my_gsl_dict = _load_word_list('gsl_final.csv')

# Longman 3000.
# Each word has an integer frequency value. Higher values indicate
# more frequent (common) words.
longman_dict = _load_word_list('longman_final.csv')

In [7]:
# This cell defines five functions to handle academic and general service vocabulary in the text.

def acad_ct(my_list, acad_dict):
    """Count the items of my_list that are keys of the academic word dictionary.

    Every occurrence counts, so repeated words are counted repeatedly.
    """
    return sum(1 for word in my_list if word in acad_dict)

def acad_in_list(my_list, acad_dict):
    """Return the academic words that appear in the list.

    Each item found among the keys of acad_dict is replaced by the value
    stored for it; per the original note, the academic word list has both
    head-words and inflected forms and this version returns only the
    head-word.  The result is a sorted list without duplicates.
    Works best when the list argument is the types of a given text.
    """
    head_words = {acad_dict[word] for word in my_list if word in acad_dict}
    return sorted(head_words)

def gsl_simple_ct(my_list, gsl_dict):
    """Return how many items of my_list are keys of the general service list dictionary."""
    return len([word for word in my_list if word in gsl_dict])

def get_gsl_freq(my_list, gsl_dict):
    """Sum the general-service-list frequency values of the items in a list.

    Takes two arguments: a list of words and the GSL dictionary.  For every
    item that is a key of gsl_dict, its value is converted with int() and
    added to the total; other items are ignored.
    Returns the total as an integer.
    """
    return sum(int(gsl_dict[word]) for word in my_list if word in gsl_dict)

def get_longman_freq(my_list, longman_dict):
    """Sum the Longman frequency values of the items in a list.

    For every item that is a key of longman_dict, its value is converted with
    int() and added to the total; other items are ignored.
    Returns the total as an integer.
    """
    return sum(int(longman_dict[word]) for word in my_list if word in longman_dict)

In [8]:
""" This cell reads the csv file with the cognates and loads them into four dictionaries. """

en_es_dict = {}
en_fr_dict = {}
en_ht_dict = {}
en_por_dict = {}

# csv column header -> dictionary that receives that language's cognates
# (each dictionary maps the English word to its cognate)
_cognate_targets = (
    ('Spanish', en_es_dict),
    ('French', en_fr_dict),
    ('Haitian Creole', en_ht_dict),
    ('Portuguese', en_por_dict),
)

# The `with` block closes the file; no explicit close() is needed afterwards.
with open('cognates_final.csv', newline='', encoding='utf-8') as csvfile:
    # restval fills columns that are missing from a short row with 'xxx',
    # which is also the sentinel checked below for "no cognate".
    reader = csv.DictReader(csvfile, restval='xxx')
    for row in reader:
        for column, target_dict in _cognate_targets:
            cognate = row[column]
            # entries of one or two characters are skipped
            if cognate != 'xxx' and len(cognate) > 2:
                target_dict[row['English']] = cognate

In [9]:
def get_cognate_token_pct(text, cognate_dict):
    """Return the share of TOKENS in a text that are cognates with the given L1.

    The text is tokenized with get_tokens; a token counts as a cognate when it
    is a key of cognate_dict.  The result is a fraction between 0 and 1.
    Raises ZeroDivisionError when the text has no tokens.
    """
    words = get_tokens(text)
    cognate_hits = [word for word in words if word in cognate_dict]
    return len(cognate_hits) / len(words)

def get_cognate_type_pct(text, cognate_dict):
    """Return the share of TYPES (unique tokens) in a text that are cognates with the given L1.

    A type counts as a cognate when it is a key of cognate_dict.  The result
    is a fraction between 0 and 1.
    Raises ZeroDivisionError when the text has no tokens.
    """
    unique_words = get_types(get_tokens(text))
    cognate_hits = [word for word in unique_words if word in cognate_dict]
    return len(cognate_hits) / len(unique_words)

def get_cognate_pct(my_list, cognate_dict):
    """Return the fraction of items in my_list that are keys of cognate_dict.

    A more versatile variant that works on either the tokens or the types.
    Raises ZeroDivisionError when my_list is empty.
    """
    total = 0
    matches = 0
    for entry in my_list:
        total += 1
        matches += entry in cognate_dict  # a bool adds as 0 or 1
    return matches / total

def get_cognate_list(text, cognate_dict):
    """Return the cognates found in a text as a list of [english_word, cognate] pairs.

    Each unique token of the text that is a key of cognate_dict yields one
    pair.  The pairs follow the iteration order of the set of types, so the
    list is not sorted.
    """
    unique_words = get_types(get_tokens(text))
    return [[word, cognate_dict.get(word)] for word in unique_words if word in cognate_dict]

In [10]:
# legacy readability indices
# these two use sentences, word count, and syllables

def flesch_kincaid(text):
    """Return a Flesch reading-ease style score for the text, floored at 0.

    Computes 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words),
    i.e. the Flesch Reading Ease formula (higher means easier), despite the
    function's name.  Non-positive scores are reported as 0.
    Raises ZeroDivisionError when the text has no words or no sentences.
    """
    n_sentences = sentence_count(text)
    n_words = len(get_tokens(text))
    n_syllables = get_syllables(text)
    score = 206.835 - 1.015*(n_words / n_sentences) - 84.6*(n_syllables / n_words)
    return score if score > 0 else 0
 
def gunning_fog(text):
    """Return the Gunning fog index of the text, floored at 0.

    Computes 0.4 * ((words / sentences) + 100 * (complex_words / words)),
    where a word is 'complex' when syllapy counts more than two syllables.
    Raises ZeroDivisionError when the text has no words or no sentences.
    """
    n_sentences = sentence_count(text)
    words = get_tokens(text)
    n_words = len(words)
    n_complex = sum(1 for word in words if syl.count(word) > 2)
    fog = 0.4*((n_words / n_sentences) + 100*(n_complex / n_words))
    return fog if fog > 0 else 0

# these two use sentences, word count, and characters

def get_ari(text):
    """Return the automated readability index (ARI) of the text, floored at 0.

    Computes 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43.
    Raises ZeroDivisionError when the text has no words or no sentences.
    """
    n_sentences = sentence_count(text)
    words = get_tokens(text)
    n_words = len(words)
    n_chars = count_char(words)
    score = 4.71*(n_chars / n_words) + 0.5*(n_words / n_sentences) - 21.43
    return score if score > 0 else 0

def coleman_liau(text):
    """Return the Coleman-Liau index of the text, floored at 0.

    The formula is based on characters per 100 words, sentences per 100 words
    and a constant; here it is computed as
    5.88 * (characters / words) - 29.6 * (sentences / words) - 15.8.
    Raises ZeroDivisionError when the text has no words.
    """
    n_sentences = sentence_count(text)
    words = get_tokens(text)
    n_words = len(words)
    n_chars = count_char(words)
    index = 5.88*(n_chars / n_words) - 29.6*(n_sentences / n_words) - 15.8
    return index if index > 0 else 0


In [11]:
# Using the functions above, create a 'list of lists' object containing features for each document.

def get_features(text, L1_val="es"):
    """Extract the numeric feature vector for one text.

    Args:
        text: the document as a single string.
        L1_val: code of the reader's first language, used to select the
            cognate dictionary: "es", "fr", "ht" or "por" (default "es").

    Returns:
        A flat list of numbers in this fixed order:
          * 5 basic features: token count, type count, type/token ratio,
            sentence count, average sentence length;
          * the raw counts of the POS tags switched on in pos_light_switch
            (20 tags with the current settings), in the order of the full
            POS dictionary;
          * 14 aggregate POS features: number of POS categories present and
            that number per sentence, then for adjectives, nouns, pronouns,
            adverbs, verbs and 'complex' function words the number of
            sub-categories present and the density (count / token count);
          * 2 cognate features (share of tokens, share of types);
          * 4 readability indices (flesch_kincaid, gunning_fog, get_ari,
            coleman_liau);
          * 4 general-service-list features;
          * 2 academic-word-list features;
          * 2 Longman frequency features.

    Raises:
        ValueError: if L1_val is not one of the supported codes.
        ZeroDivisionError: if the text contains no tokens or no sentences.
    """
    # Resolve the cognate dictionary first so an unsupported code fails with a
    # clear message instead of an AttributeError on None further down.
    cognate_dictionaries = {"es": en_es_dict, "fr": en_fr_dict, "ht": en_ht_dict, "por": en_por_dict}
    if L1_val not in cognate_dictionaries:
        raise ValueError(
            f"Unsupported L1 code {L1_val!r}; expected one of {sorted(cognate_dictionaries)}"
        )
    cognate_dict = cognate_dictionaries[L1_val]

    tokens = get_tokens(text)
    types = get_types(tokens)

    # ---- 5 basic features ----
    token_ct = len(tokens)
    type_ct = len(types)
    ttr = type_ct / token_ct  # type-to-token ratio
    sent_ct = sentence_count(text)
    asl = avg_sent_length(text)  # average sentence length in words

    feature_list = [token_ct, type_ct, ttr, sent_ct, asl]

    # ---- POS dictionaries ----
    full_dict = get_full_dict(get_pos_dict(text))
    jj_dict, nn_dict, prp_dict, rb_dict, vb_dict, cpx_dict = get_cat_dicts(full_dict)

    # Use this dict to select features from full_dict of pos features:
    # 1 = the tag's raw count is used as a feature, 0 = it is left out.
    pos_light_switch = {'CC': 1, 'CD': 1, 'DT': 1, 'EX': 1, 'FW': 0, 'IN': 1, 'JJ': 1, 'JJR': 1, 'JJS': 1,
                        'LS': 0, 'MD': 1, 'NN': 1, 'NNS': 1, 'NNP': 0, 'NNPS': 0, 'PDT': 0, 'POS': 0, 'PRP': 1,
                        'PRP$': 0, 'RB': 1, 'RBR': 0, 'RBS': 0, 'RP': 1, 'SYM': 0, 'TO': 1, 'UH': 0,
                        'VB': 1, 'VBD': 1, 'VBG': 1, 'VBN': 1, 'VBP': 1, 'VBZ': 0, 'WDT': 0, 'WP': 0, 'WP$': 0, 'WRB': 0}

    # raw counts of the selected POS tags
    feature_list.extend(
        count for tag, count in full_dict.items() if pos_light_switch.get(tag) == 1
    )

    # ---- 14 aggregate POS features ----
    pos_total_ct = get_dict_value_ct(full_dict)  # number of syntactic categories present in the text
    pos_total_ps = pos_total_ct / sent_ct  # divided by the number of sentences

    jj_total_ct = get_dict_value_ct(jj_dict)  # number of adjective categories present
    jj_density = get_dict_value_sum(jj_dict) / token_ct
    nn_total_ct = get_dict_value_ct(nn_dict)  # number of noun categories present
    nn_density = get_dict_value_sum(nn_dict) / token_ct
    prp_total_ct = get_dict_value_ct(prp_dict)  # number of pronoun categories present
    prp_density = get_dict_value_sum(prp_dict) / token_ct
    rb_total_ct = get_dict_value_ct(rb_dict)  # number of adverb categories present
    rb_density = get_dict_value_sum(rb_dict) / token_ct
    vb_total_ct = get_dict_value_ct(vb_dict)  # number of verb categories present
    # Fixed: the verb total was previously summed from rb_dict (the adverbs),
    # which made vb_density a duplicate of rb_density.
    # NOTE(review): if the saved classifier was trained on features produced
    # with the old adverb-based value, retrain or re-validate it.
    vb_density = get_dict_value_sum(vb_dict) / token_ct
    cpx_ct = get_dict_value_ct(cpx_dict)  # number of 'complex' categories present
    cpx_density = get_dict_value_sum(cpx_dict) / token_ct

    feature_list.extend([
        pos_total_ct, pos_total_ps,
        jj_total_ct, jj_density,
        nn_total_ct, nn_density,
        prp_total_ct, prp_density,
        rb_total_ct, rb_density,
        vb_total_ct, vb_density,
        cpx_ct, cpx_density,
    ])

    # ---- 2 cognate features for the selected L1 ----
    feature_list.append(get_cognate_pct(tokens, cognate_dict))
    feature_list.append(get_cognate_pct(types, cognate_dict))

    # ---- 4 legacy readability indices ----
    feature_list.append(flesch_kincaid(text))
    feature_list.append(gunning_fog(text))
    feature_list.append(get_ari(text))
    feature_list.append(coleman_liau(text))

    # ---- 4 'general service list' word frequency features ----
    gsl_pct_tokens = gsl_simple_ct(tokens, my_gsl_dict) / token_ct
    gsl_pct_types = gsl_simple_ct(types, my_gsl_dict) / type_ct
    gsl_dfreq_tokens = get_gsl_freq(tokens, my_gsl_dict) / token_ct
    gsl_dfreq_types = get_gsl_freq(types, my_gsl_dict) / type_ct
    feature_list.extend([gsl_pct_tokens, gsl_pct_types, gsl_dfreq_tokens, gsl_dfreq_types])

    # ---- 2 academic word list features ----
    acad_pct_tokens = acad_ct(tokens, my_acad_dict) / token_ct
    acad_pct_types = acad_ct(types, my_acad_dict) / type_ct
    feature_list.extend([acad_pct_tokens, acad_pct_types])

    # ---- 2 word frequency features from the Longman 3000 ----
    longman_pct_tokens = get_longman_freq(tokens, longman_dict) / token_ct
    longman_pct_types = get_longman_freq(types, longman_dict) / type_ct
    feature_list.extend([longman_pct_tokens, longman_pct_types])

    return feature_list

At this point in the main notebook, the classifier was trained. In this notebook we instead load the already-trained classifier and use it to predict the level of a given text sample. The notebook also outputs the lists of cognates and academic vocabulary found in the text.

In [13]:
# Load the trained classifier from disk; main() passes it to eval_output()
# to predict the level of a text.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
loaded_clf = joblib.load("./classifier_v4.joblib")
# The helpers below evaluate a text and assemble the report.

def eval_output(text, loaded_clf):
    """Predict the level of a text and return it as a one-sentence string.

    The features from get_features(text) (default L1) are reshaped into a
    single-row sample and passed to loaded_clf.predict.  A prediction of 0
    is reported as "beginner", 1 as "intermediate" and anything else as
    "advanced".
    """
    sample = np.array(get_features(text)).reshape(1, -1)
    prediction = loaded_clf.predict(sample)
    output_level = "advanced"
    if prediction == 0:
        output_level = "beginner"
    elif prediction == 1:
        output_level = "intermediate"
    return f"This text is best suited for {output_level}-level ML students."

def cognate_output(text, L1_val="es"):
    """Collect the cognates of a text for one first language (L1).

    Args:
        text: the document as a single string.
        L1_val: L1 code: "es", "fr", "ht" or "por" (default "es").

    Returns:
        A tuple (header_string, sorted_cognate_list, L1_name) where
        header_string is a heading naming the language, sorted_cognate_list
        is the sorted list of [english_word, cognate] pairs found in the
        text, and L1_name is the language's full name.

    Raises:
        ValueError: if L1_val is not one of the supported codes (previously
            this surfaced as an AttributeError on None).
    """
    language_codes = {"es": "Spanish", "fr": "French", "ht": "Haitian Creole", "por": "Portuguese"}
    cognate_dictionaries = {"es": en_es_dict, "fr": en_fr_dict, "ht": en_ht_dict, "por": en_por_dict}
    if L1_val not in cognate_dictionaries:
        raise ValueError(
            f"Unsupported L1 code {L1_val!r}; expected one of {sorted(cognate_dictionaries)}"
        )
    L1_name = language_codes[L1_val]
    sorted_cognate_list = sorted(get_cognate_list(text, cognate_dictionaries[L1_val]))
    header_string = f"Cognates for {L1_name} present in this text:"
    return header_string, sorted_cognate_list, L1_name

def acad_output(text, acad_dict):
    """Return the sorted, de-duplicated academic words found in a text.

    The text is tokenized with get_tokens and the tokens are looked up with
    acad_in_list.
    """
    return acad_in_list(get_tokens(text), acad_dict)

def main(input_filepath, L1_val="es", acad_dict=my_acad_dict):
    """Evaluate one text file and write a report next to it.

    The report is written to "<input path without extension>_output.txt" and
    contains the predicted level, followed by a table of cognates for the
    chosen L1 and a list of academic vocabulary when any were found.

    Args:
        input_filepath: path of the text file to evaluate.
        L1_val: L1 code used for the cognate table ("es", "fr", "ht", "por").
        acad_dict: academic word dictionary used for the vocabulary list.
    """
    # splitext handles any extension length (or none); slicing off the last
    # four characters only worked for three-letter extensions such as ".txt".
    path_root, _extension = os.path.splitext(input_filepath)
    output_filepath = path_root + "_output.txt"

    text = process_file(input_filepath)
    # NOTE(review): the level prediction uses get_features' default L1 ("es")
    # regardless of L1_val -- confirm whether that is intended.
    eval_str = eval_output(text, loaded_clf)
    cognate_header, cognate_list, L1_name = cognate_output(text, L1_val)
    acad_list = acad_output(text, acad_dict)

    with open(output_filepath, "w", encoding="utf-8") as file:
        print(eval_str, file=file)
        # Print a section whenever it has at least one entry; the previous
        # "> 1" test silently dropped sections with exactly one entry.
        if cognate_list:
            print(" ", file=file)
            print(cognate_header, file=file)
            print(" ", file=file)
            print("{:>16s}   {:<16s}".format("English", L1_name), file=file)
            print("{:>16s}   {:<16s}".format("----------", "----------"), file=file)
            for english_word, cognate in cognate_list:
                print("{:>16s} - {:<16s}".format(english_word, cognate), file=file)
        if acad_list:
            print(" ", file=file)
            print("Academic vocabulary from the text:", file=file)
            print(" ", file=file)
            for item in acad_list:
                print(item, file=file)
            

In [14]:
# main("test.txt")