### COMP9417: Machine Learning and Data Mining 
### Group Project (Topic 2)
### CommonLit Readability Prize
#### Features File
Written by WENG XINN CHOW (z5346077) on 24.07.2021

In [55]:
# Import all required packages
import pandas as pd
import numpy as np
import textstat
import nltk
import spacy

from readability import getmeasures
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat.textstat import textstatistics
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

In [56]:
# Read csv file and print the first 5 rows
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [57]:
# Helper functions for feature engineering

def tokenization(excerpt):
    """
    Tokenize the given excerpt (corpus to list of words) using nltk
    """
    
    words = word_tokenize(excerpt)
    
    return words

    
def stopwords_count(words):
    """
    Count and return the number of stop words 
    """
    
    # Get a set of English stopwords
    stopWords = set(stopwords.words('english'))
    
    count = 0
    # Count the stopwords 
    for i in range(len(words)):
        if words[i] in stopWords:
            count += 1
    
    return count


def sentiment_count(words):
    """
    Count and return the number of positive and negative sentiments
    """
    
    sentiment_intensity = SentimentIntensityAnalyzer()
    
    positive_count = 0
    negative_count = 0
    # Count the positive adjectives 
    for i in range(len(words)):
        # If the compound scroe >= 0.05, it's considered as positive sentiment
        if (sentiment_intensity.polarity_scores(words[i])['compound']) >= 0.05:
            positive_count += 1
        # If the compound scroe <= 0.05, it's considered as negative sentiment
        elif (sentiment_intensity.polarity_scores(words[i])['compound']) <= 0.05:
            negative_count += 1
           
    return positive_count, negative_count

In [58]:
def feature_engineering(excerpt):
    """
    Generate the relevant features related to readability and return all features 
    """
    
    # Extract features using readability library
    output = getmeasures(excerpt, lang = 'en')
    # Need to tokenize the excerpt before generating features (some features only)
    words = tokenization(excerpt)
    
    # Text info 
    num_characters = len(excerpt)
    num_words = len(excerpt.split(' '))
    num_sentences = len(excerpt.split('\n'))
    num_syllables = sum(textstatistics().syllable_count(w) for w in words)
    num_unique_words = len(set(excerpt.split(' ')))
    unique_to_total = num_unique_words / num_words
    # Characters per word and its average
    characters_word = [len(w) for w in excerpt.split(' ')]
    avg_characters_word = np.mean(characters_word)
    # Character per sentence and its average
    characters_sentence = [len(s) for s in excerpt.split('\n')]
    avg_characters_sentence = np.mean(characters_sentence)
    # Words per sentence and its average
    words_sentence = [len(s.split(' ')) for s in excerpt.split('\n')]
    avg_words_sentence = np.mean(words_sentence)
    # Syllables per word and its average
    syllables_word = [textstatistics().syllable_count(w) for w in words]
    avg_syllables_word = np.mean(syllables_word)
    # Number of polysyllables (syllables >= 3)
    num_polysyllables = [textstatistics().syllable_count(w) >= 3 for w in words].count(True)
    num_stopwords = stopwords_count(words)
    num_positive_sentiment, num_negative_sentiment = sentiment_count(words)
    # Number of long words (len > 6)
    num_longwords = [len(w) > 6 for w in excerpt.split(' ')].count(True)
    
    # Readability Scores (different metrics)
    # Flesch Reading Ease
    flesch = textstat.flesch_reading_ease(excerpt)
    # Flesch Kincaid Grade
    kincaid = textstat.flesch_kincaid_grade(excerpt)
    # Gunning Fog Index
    gunning = textstat.gunning_fog(excerpt)
    # SMOG index
    smog = textstat.smog_index(excerpt)
    # Automated Readability Index 
    auto = textstat.automated_readability_index(excerpt)
    # Coleman-Liau Index
    coleman = textstat.coleman_liau_index(excerpt)
    # Linsear Write Formula
    linsear = textstat.linsear_write_formula(excerpt)
    # Dale-Chall Readability Score
    dalechall = textstat.dale_chall_readability_score(excerpt)
    # LIX
    lix = num_longwords / num_words
    # RIX
    rix = num_longwords / num_sentences
    
    # Word usage
    be_verbs = output['word usage']['tobeverb']
    aux_verbs = output['word usage']['auxverb']
    conjunctions = output['word usage']['conjunction']
    prepositions = output['word usage']['preposition']
    nominalizations = output['word usage']['nominalization']
    
    feature_scores = []
    feature_scores.extend((num_characters, num_words, num_sentences, num_syllables, 
                           num_unique_words, unique_to_total, avg_characters_word, 
                           avg_characters_sentence, avg_words_sentence, avg_syllables_word,
                           num_polysyllables, num_stopwords, num_positive_sentiment, 
                           num_negative_sentiment, num_longwords, flesch, kincaid, gunning, smog, 
                           auto, coleman, linsear, dalechall, lix, rix, be_verbs, aux_verbs, 
                           conjunctions, prepositions, nominalizations))
    
    return feature_scores

In [59]:
# Convert the excerpt column in df to numpy array
excerpt_arr = pd.DataFrame(train_df['excerpt']).to_numpy()
features_scores = []

# Create features for all excerpts
for e in range(len(excerpt_arr)):
    features_scores.append(feature_engineering(excerpt_arr.item(e)))


In [60]:
# Convert features into a pandas dataframe
features_names = ['num_characters', 'num_words', 'num_sentences', 'num_syllables', 
                  'num_unique_words', 'unique_to_total', 'avg_characters_word', 
                  'avg_characters_sentence', 'avg_words_sentence', 'avg_syllables_word', 
                  'num_polysyllables', 'num_stopwords', 'num_positive_sentiment', 
                  'num_negative_sentiment', 'num_longwords', 'flesch', 'kincaid', 'gunning', 'smog', 
                  'auto', 'coleman', 'linsear', 'dalechall', 'lix', 'rix', 'be_verbs', 'aux_verbs', 
                  'conjunctions', 'prepositions', 'nominalizations']
features_df = pd.DataFrame(features_scores, columns = features_names)
features_df.head()

Unnamed: 0,num_characters,num_words,num_sentences,num_syllables,num_unique_words,unique_to_total,avg_characters_word,avg_characters_sentence,avg_words_sentence,avg_syllables_word,...,coleman,linsear,dalechall,lix,rix,be_verbs,aux_verbs,conjunctions,prepositions,nominalizations
0,992,174,6,230,112,0.643678,4.706897,164.5,29.833333,1.127451,...,8.06,9.0,6.65,0.224138,6.5,12,1,11,23,1
1,937,164,6,228,123,0.75,4.719512,155.333333,28.166667,1.022422,...,6.78,7.285714,5.92,0.20122,5.5,5,5,7,22,0
2,908,162,5,215,124,0.765432,4.611111,180.8,33.2,1.02381,...,7.2,14.75,6.29,0.209877,6.8,7,1,11,18,0
3,909,163,2,196,117,0.717791,4.582822,454.0,82.0,1.010309,...,8.54,12.5,6.61,0.184049,15.0,1,0,15,26,0
4,723,147,1,170,51,0.346939,3.92517,723.0,147.0,0.971429,...,4.83,13.5,1.57,0.068027,10.0,4,0,10,10,0


In [61]:
def spacy_words_tokens(excerpt_arr):
    """
    Tokenize the given excerpt_arr (corpus array into tokens) and return the list of tokens
    """
    
    # Add regex to Spacy infix to preserve intra-word concatenators
    nlp = spacy.load('en_core_web_sm')
    infixes = nlp.Defaults.prefixes + [r'[./]',r"[-]~",r"(.'.)"]
    infixes_re = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer = Tokenizer(nlp.vocab, infix_finditer = infixes_re.finditer)
    
    # Tokenize the whole excerpt array 
    words_list = []
    for ex in excerpt_arr:
        doc = nlp(ex)
        tokens = [token for token in doc if not (token.is_punct or token.is_space)]
        words_list.append(tokens)
    
    return words_list


def spacy_features(excerpt_arr):
    """
    Create features (related to word contents) using spacy
    """
    
    # Tokenize the given excerpt array before creating features
    words_list = spacy_words_tokens(excerpt_arr)
    
    # Word contents (adjectives, adverbs, nouns, verbs, pronouns)
    adjectives = [sum([w.pos_ == 'ADJ' for w in wl]) for wl in words_list]
    adverbs = [sum([w.pos_ == 'ADV' for w in wl]) for wl in words_list]
    nouns = [sum([w.pos_ == 'NOUN' for w in wl]) for wl in words_list]
    verbs = [sum([w.pos_ == 'VERB' for w in wl]) for wl in words_list]
    pronouns = [sum([w.pos_ == 'PRON' for w in wl]) for wl in words_list]
    total_words = [len(wl) for wl in words_list]
    contents = [sum([w.pos_ in ['ADJ','ADV','NOUN','VERB'] for w in wl]) for wl in words_list]
    # Measure the proportion of words contents
    content_diversity = np.divide(contents, total_words)
    
    spacy_features_scores = []
    spacy_features_scores.extend((adjectives, adverbs, nouns, verbs, pronouns, content_diversity))
    
    return spacy_features_scores   

In [62]:
# Create spacy features 
spacy_features_scores = spacy_features(train_df['excerpt'])

# Convert spacy features into a pandas dataframe
spacy_features_names = ['adjectives', 'adverbs', 'nouns', 'verbs', 'pronouns', 'content_diversity']
spacy_features_df = pd.DataFrame(np.array(spacy_features_scores).T, columns = spacy_features_names)

# Concatenate both basic features and spacy features 
features_df = pd.concat([features_df, spacy_features_df], axis = 1, join = 'outer')
features_df.head()

Unnamed: 0,num_characters,num_words,num_sentences,num_syllables,num_unique_words,unique_to_total,avg_characters_word,avg_characters_sentence,avg_words_sentence,avg_syllables_word,...,aux_verbs,conjunctions,prepositions,nominalizations,adjectives,adverbs,nouns,verbs,pronouns,content_diversity
0,992,174,6,230,112,0.643678,4.706897,164.5,29.833333,1.127451,...,1,11,23,1,10.0,6.0,44.0,22.0,4.0,0.458101
1,937,164,6,228,123,0.75,4.719512,155.333333,28.166667,1.022422,...,5,7,22,0,13.0,18.0,25.0,33.0,26.0,0.514451
2,908,162,5,215,124,0.765432,4.611111,180.8,33.2,1.02381,...,1,11,18,0,8.0,14.0,19.0,27.0,23.0,0.4
3,909,163,2,196,117,0.717791,4.582822,454.0,82.0,1.010309,...,0,15,26,0,22.0,5.0,38.0,20.0,12.0,0.518293
4,723,147,1,170,51,0.346939,3.92517,723.0,147.0,0.971429,...,0,10,10,0,23.0,4.0,16.0,9.0,8.0,0.353741


In [63]:
# Concatenate both original and features dataframes (Use outer for union concatenation)
train_df = pd.concat([train_df, features_df], axis = 1, join = 'outer')
train_df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,num_characters,num_words,num_sentences,num_syllables,...,aux_verbs,conjunctions,prepositions,nominalizations,adjectives,adverbs,nouns,verbs,pronouns,content_diversity
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,992,174,6,230,...,1,11,23,1,10.0,6.0,44.0,22.0,4.0,0.458101
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,937,164,6,228,...,5,7,22,0,13.0,18.0,25.0,33.0,26.0,0.514451
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,908,162,5,215,...,1,11,18,0,8.0,14.0,19.0,27.0,23.0,0.4
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,909,163,2,196,...,0,15,26,0,22.0,5.0,38.0,20.0,12.0,0.518293
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,723,147,1,170,...,0,10,10,0,23.0,4.0,16.0,9.0,8.0,0.353741


In [64]:
# Read csv file and print the first 5 rows
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [65]:
# Convert the excerpt column in df to numpy array
excerpt_arr = pd.DataFrame(test_df['excerpt']).to_numpy()
features_scores2 = []

# Create features for all excerpts
for e in range(len(excerpt_arr)):
    features_scores2.append(feature_engineering(excerpt_arr.item(e)))

In [66]:
features_df2 = pd.DataFrame(features_scores2, columns = features_names)
features_df2.head()

Unnamed: 0,num_characters,num_words,num_sentences,num_syllables,num_unique_words,unique_to_total,avg_characters_word,avg_characters_sentence,avg_words_sentence,avg_syllables_word,...,coleman,linsear,dalechall,lix,rix,be_verbs,aux_verbs,conjunctions,prepositions,nominalizations
0,772,147,3,199,102,0.693878,4.258503,256.666667,49.666667,1.184524,...,6.62,11.0,6.24,0.204082,10.0,4,3,4,27,1
1,967,179,3,218,121,0.675978,4.407821,321.666667,60.333333,1.023474,...,6.55,6.428571,5.41,0.134078,8.0,5,6,9,21,0
2,948,168,7,226,122,0.72619,4.64881,134.571429,24.857143,1.041475,...,7.61,14.0,6.78,0.202381,4.857143,8,0,11,15,0
3,1144,179,4,312,106,0.592179,5.396648,285.25,45.5,1.536946,...,13.7,17.25,9.55,0.363128,16.25,11,0,4,28,2
4,1094,167,2,288,124,0.742515,5.556886,546.5,84.0,1.432836,...,13.93,15.2,10.01,0.401198,33.5,8,1,7,22,5


In [67]:
# Create spacy features 
spacy_features_scores2 = spacy_features(test_df['excerpt'])

# Convert spacy features into a pandas dataframe
spacy_features_df2 = pd.DataFrame(np.array(spacy_features_scores2).T, columns = spacy_features_names)

# Concatenate both basic features and spacy features 
features_df2 = pd.concat([features_df2, spacy_features_df2], axis = 1, join = 'outer')
features_df2.head()

Unnamed: 0,num_characters,num_words,num_sentences,num_syllables,num_unique_words,unique_to_total,avg_characters_word,avg_characters_sentence,avg_words_sentence,avg_syllables_word,...,aux_verbs,conjunctions,prepositions,nominalizations,adjectives,adverbs,nouns,verbs,pronouns,content_diversity
0,772,147,3,199,102,0.693878,4.258503,256.666667,49.666667,1.184524,...,3,4,27,1,13.0,6.0,28.0,20.0,15.0,0.446667
1,967,179,3,218,121,0.675978,4.407821,321.666667,60.333333,1.023474,...,6,9,21,0,6.0,14.0,29.0,31.0,20.0,0.434783
2,948,168,7,226,122,0.72619,4.64881,134.571429,24.857143,1.041475,...,0,11,15,0,11.0,14.0,28.0,18.0,27.0,0.40113
3,1144,179,4,312,106,0.592179,5.396648,285.25,45.5,1.536946,...,0,4,28,2,19.0,9.0,63.0,17.0,3.0,0.6
4,1094,167,2,288,124,0.742515,5.556886,546.5,84.0,1.432836,...,1,7,22,5,9.0,8.0,44.0,27.0,5.0,0.52381


In [68]:
# Concatenate both original and features dataframes (Use outer for union concatenation)
test_df = pd.concat([test_df, features_df2], axis = 1, join = 'outer')
test_df.head()

Unnamed: 0,id,url_legal,license,excerpt,num_characters,num_words,num_sentences,num_syllables,num_unique_words,unique_to_total,...,aux_verbs,conjunctions,prepositions,nominalizations,adjectives,adverbs,nouns,verbs,pronouns,content_diversity
0,c0f722661,,,My hope lay in Jack's promise that he would ke...,772,147,3,199,102,0.693878,...,3,4,27,1,13.0,6.0,28.0,20.0,15.0,0.446667
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...,967,179,3,218,121,0.675978,...,6,9,21,0,6.0,14.0,29.0,31.0,20.0,0.434783
2,0df072751,,,It was a bright and cheerful scene that greete...,948,168,7,226,122,0.72619,...,0,11,15,0,11.0,14.0,28.0,18.0,27.0,0.40113
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...,1144,179,4,312,106,0.592179,...,0,4,28,2,19.0,9.0,63.0,17.0,3.0,0.6
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...,1094,167,2,288,124,0.742515,...,1,7,22,5,9.0,8.0,44.0,27.0,5.0,0.52381
