# Pre-flight checks

Import common modules, ensure MatplotLib plots figures inline, and prepare a function to save the figures.

In [3]:
import numpy as np
import os

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
KAGGLE_COMP_ID = "common_lit_readability"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", KAGGLE_COMP_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Load the data

Load the data from disk (as already downloaded from kaggle website: https://www.kaggle.com/c/commonlitreadabilityprize/data)

In [4]:
import os
import pandas as pd

DATA_PATH = os.path.join("datasets", KAGGLE_COMP_ID)
def load_data(data_path=DATA_PATH):
    csv_path = os.path.join(data_path, "train.csv")
    return pd.read_csv(csv_path)
os.path

<module 'ntpath' from 'C:\\Anaconda3\\envs\\mlenv1\\lib\\ntpath.py'>

In [5]:
train = load_data()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


<font color=blue size=4> NB: has separate test.csv, so no need to create test set</font><br>
<font color=red size=4> NB: contains labels </font><br>
<font color=red size=4> NB: contains standard_error for target, but how is this calculated? </font><br>
(from website: standard_error - measure of spread of scores among multiple raters for each excerpt. Not included for test data) <br>
<font color=red size=4> NB: url_legal & license should be irrelevant </font>

In [6]:
train.describe()

Unnamed: 0,target,standard_error
count,2834.0,2834.0
mean,-0.959319,0.491435
std,1.033579,0.034818
min,-3.676268,0.0
25%,-1.69032,0.468543
50%,-0.91219,0.484721
75%,-0.20254,0.506268
max,1.71139,0.649671


In [7]:
y = train["target"].copy()
y

0      -0.340259
1      -0.315372
2      -0.580118
3      -1.054013
4       0.247197
          ...   
2829    1.711390
2830    0.189476
2831    0.255209
2832   -0.215279
2833    0.300779
Name: target, Length: 2834, dtype: float64

In [8]:
y_df = pd.DataFrame(y)
y_df.describe()

Unnamed: 0,target
count,2834.0
mean,-0.959319
std,1.033579
min,-3.676268
25%,-1.69032
50%,-0.91219
75%,-0.20254
max,1.71139


In [9]:
class DataframeMultipleColumnFunctionTransformer():
    def __init__(self, func, columns=[]):
        self.func = func
        self.columns = columns

    def transform(self, input_df, **transform_params):
        return self.func(input_df, self.columns)

    def fit(self, X, y=None, **fit_params):
        return self

def features_drop(df, columns):
    msg = f'Dropping columns: {columns}\n'; print(msg)
    return df.drop(columns, axis=1)

In [10]:
from sklearn.pipeline import Pipeline

nonfeature_columns =  ["url_legal", "license", "target", "standard_error"]

preprocess_pipeline = Pipeline([
    ("prune_features", DataframeMultipleColumnFunctionTransformer(features_drop, nonfeature_columns)),
])

train_preprocessed = preprocess_pipeline.fit_transform(train.copy())
train_preprocessed.info()

Dropping columns: ['url_legal', 'license', 'target', 'standard_error']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2834 non-null   object
 1   excerpt  2834 non-null   object
dtypes: object(2)
memory usage: 44.4+ KB


In [11]:
train_text = train_preprocessed["excerpt"].copy()
train_text

0       When the young people returned to the ballroom...
1       All through dinner time, Mrs. Fayre was somewh...
2       As Roger had predicted, the snow departed as q...
3       And outside before the palace a great garden w...
4       Once upon a time there were Three Bears who li...
                              ...                        
2829    When you think of dinosaurs and where they liv...
2830    So what is a solid? Solids are usually hard be...
2831    The second state of matter we will discuss is ...
2832    Solids are shapes that you can actually touch....
2833    Animals are made of many cells. They eat thing...
Name: excerpt, Length: 2834, dtype: object

### Get some idea of what readability looks like, by looking at the passages partitioned at regular intervals

In [12]:
argp = np.argmin(y)
print(f"partition value: {y[argp]}\n")
train_text[argp]

partition value: -3.676267773



'The commutator is peculiar, consisting of only three segments of a copper ring, while in the simplest of other continuous current generators several times that number exist, and frequently 120! segments are to be found. These three segments are made so as to be removable in a moment for cleaning or replacement. They are mounted upon a metal support, and are surrounded on all sides by a free air space, and cannot, therefore, lose their insulated condition. This feature of air insulation is peculiar to this system, and is very important as a factor in the durability of the commutator. Besides this, the commutator is sustained by supports carried in flanges upon the shaft, which flanges, as an additional safeguard, are coated all over with hard rubber, one of the finest known insulators. It may be stated, without fear of contradiction, that no other commutator made is so thoroughly insulated and protected. The three commutator segments virtually constitute a single copper ring, mounted i

In [13]:
def argpart(x, partition=0):
  return np.argpartition(x, partition)[partition]

In [14]:
argp = argpart(y, partition=400)
print(f"partition value: {y[argp]}\n")
train_text[argpart(y, partition=argp)]

partition value: -2.149327999



"Blondin, the celebrated tight-rope walker, has just died in London, at the age of seventy-three.\nThe performance which made him famous was the crossing of Niagara Falls on the tight-rope.\nBlondin was a Frenchman, his father having been one of Napoleon's soldiers.\nA story is told of him that when he was five years old he saw an acrobat performing on a tight-rope.\nHe was so pleased with what he saw, that when he got home he stretched a rope between two posts, and, as soon as his mother was out of the way, took his father's fishing-rod, and, using it as a balancing pole, made his first appearance as a tight-rope walker.\nHe was trained for an acrobat and tight-rope walking, and came to this country with a troup of pantomimists.\nWhile here he visited Niagara Falls, and the idea at once struck him that, if he dared to cross those terrible waters on a rope, his fortune would be made. He made up his mind to try it, and stayed in the village of Niagara for weeks, until he had learned jus

In [15]:
argp = argpart(y, partition=800)
print(f"partition value: {y[argp]}\n")
train_text[argpart(y, partition=argp)]

partition value: -1.580279588



'Looking both sides of the road, not daring to think what she would say if she really did see Clem, Polly sped on. But not a glimpse of the tall girl\'s figure met her eyes, and at last she turned in at a gateway and ran up the little path to the door. Mrs. Forsythe saw her through the window that opened on the piazza.\n"Why, Polly Pepper," she cried, "what a pity that Clem didn\'t find you! She went over to your house."\n"Oh, I know, I know," panted Polly, with scarlet cheeks.\n"Don\'t try to talk," said Mrs. Forsythe, "you are all out of breath. Come in, Polly."\n"Oh, I can\'t. I mean I would like to see Clem," mumbled Polly, with an awful dread, now that she was on the point of finding her, of what she should say. It was all she could do to keep from running down the piazza steps and fleeing home as fast as she had come.'

In [16]:
argp = argpart(y, partition= 1200)
print(f"partition value: {y[argp]}\n")
train_text[argpart(y, partition=argp)]

partition value: -1.136051502



'After a time the polished rocky sides of the shaft grew to be of a solemn sameness. Clewe ceased to take notes. He tried to imagine what he would come to when he reached the bottom; it would be some sort of a cave, he thought, in which his shell had made an opening. He began to imagine what sort of a cave it would be, and how high the roof was from the floor. Clewe then suddenly wondered whether his gardener had remembered what he had told him about the flower-beds in front of the house; he wished certain changes made which Margaret had suggested. He tried to keep his mind on the flower-beds, but it drifted away to the cave below. He thought of the danger of coming into some underground body of water, where he would be drowned; but he knew that was a silly idea. If the shell had gone through subterranean reservoirs, the water of these would have run out, and before it reached the bottom of the shaft would have dissipated into mist.'

In [17]:
argp = argpart(y, partition=1600)
print(f"partition value: {y[argp]}\n")
train_text[argpart(y, partition=argp)]

partition value: -0.73102331



"As a statesman, it was the good fortune of Mr. Gladstone that his career was not associated with war. The reforms which he effected, the triumphs which he achieved, were not won by the supreme arbitrament of the sword. The reforms which he effected and the triumphs which he achieved were the result of his power of persuasion over his fellow-men. The reforms which he achieved in many ways amounted to a revolution. They changed, in many particulars, the face of the realm. After Sir Robert Peel had adopted the great principle which eventually carried England from protection to free trade, it was Mr. Gladstone who created the financial system which has been admitted ever since by all students of finance, as the secret of Great Britain's commercial success. He enforced the extension of the suffrage to the masses of the nation, and practically thereby made the government of monarchical England as democratic as that of any republic."

In [18]:
argp = argpart(y, partition=2000)
print(f"partition value: {y[argp]}\n")
train_text[argpart(y, partition=argp)]

partition value: -0.349618621



'A board was floating along on the swollen waters of Black Creek. On it sat Master Meadow Mouse. He was very happy. He was having his first ride, of any sort.\n"This raft—" he said to himself proudly—"this raft belongs to me. I\'ll be a traveler. I\'ll see the world—at least as far as the big willow at the lower end of the meadow!"\nHe scarcely cared to go beyond the big willow. Beyond it lay another farm. And Master Meadow Mouse had never been off Farmer Green\'s place in his whole life. He feared that he might not be able to find his way back, if he ventured too far from home.\nSoon he spied a friend on the bank of the creek. Master Meadow Mouse cried, "Goodbye!" and waved a paw at him.\nThe person on the bank was one of his many cousins. And when he caught sight of Master Meadow Mouse he stared hard for a few moments. Then he shouted, "Don\'t jump! I\'ll rescue you." He was already running to the water\'s edge when Master Meadow Mouse stopped him.'

In [19]:
argp = argpart(y, partition=2400)
print(f"partition value: {y[argp]}\n")
train_text[argpart(y, partition=argp)]

partition value: 0.143048465



'The Monday after the walking expedition, Grace Harlowe set out for school full of an idea that had been revolving in her busy brain for weeks. The time had come for herself and for her three chums to bind themselves together as a sorority. As charter members, they would initiate four other girls, as soon as proper rites could be thought of. It should be a Greek letter society. Grace thought "Phi Sigma Tau" would sound well. Aside from the social part, their chief object would be to keep a watchful eye open for girls in school who needed assistance of any sort.\nMrs. Gray\'s anxiety over Eleanor Savell had set the bee in Grace\'s bonnet buzzing, and now her plans were practically perfected. All that remained to be done was to tell her three friends, and consult them as to what other four girls would be eligible to membership.'

In [20]:
argp = np.argmax(y)
print(f"partition value: {y[argp]}\n")
train_text[argp]

partition value: 1.7113898269999999



'When you think of dinosaurs and where they lived, what do you picture? Do you see hot, steamy swamps, thick jungles, or sunny plains? Dinosaurs lived in those places, yes. But did you know that some dinosaurs lived in the cold and the darkness near the North and South Poles?\nThis surprised scientists, too. Paleontologists used to believe that dinosaurs lived only in the warmest parts of the world. They thought that dinosaurs could only have lived in places where turtles, crocodiles, and snakes live today. Later, these dinosaur scientists began finding bones in surprising places.\nOne of those surprising fossil beds is a place called Dinosaur Cove, Australia. One hundred million years ago, Australia was connected to Antarctica. Both continents were located near the South Pole. Today, paleontologists dig dinosaur fossils out of the ground. They think about what those ancient bones must mean.'

### Some initial thoughts

For lower scores, consider: 
- Longer sentences
- Higher average word length
- passive clauses
- subordinate clauses (number of occurences of 'which'??)
- 

### First, see where a basic word count vector regressor gets us

In [21]:
import re
import html
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

# Clean up the passage, making it ready for word counting
class PassageTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, remove_escapes=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_escapes = remove_escapes
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for text in X:
            text = html.unescape(text)
            if self.lower_case:
                text = text.lower()
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            if self.remove_escapes:
                text = re.sub(r"\n"," ", text)
                text = re.sub("\\\'", '`', text)
            X_transformed.append(text)
        return np.array(X_transformed)

In [22]:
X_clean = PassageTransformer(remove_punctuation=False).fit_transform(train_text)
# check passage with escapes
argp = argpart(y, partition=2400)
X_clean[argpart(y, partition=argp)]

'the monday after the walking expedition, grace harlowe set out for school full of an idea that had been revolving in her busy brain for weeks. the time had come for herself and for her three chums to bind themselves together as a sorority. as charter members, they would initiate four other girls, as soon as proper rites could be thought of. it should be a greek letter society. grace thought "phi sigma tau" would sound well. aside from the social part, their chief object would be to keep a watchful eye open for girls in school who needed assistance of any sort. mrs. gray`s anxiety over eleanor savell had set the bee in grace`s bonnet buzzing, and now her plans were practically perfected. all that remained to be done was to tell her three friends, and consult them as to what other four girls would be eligible to membership.'

In [23]:
# Set required word class transformers
import nltk

stemmer = nltk.PorterStemmer()
# and test
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Computator"):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Computator => comput


In [24]:
import re
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

class WordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stemming=True):
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for text in X:
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [25]:
X_few = X_clean[:3]
X_few

array(['when the young people returned to the ballroom, it presented a decidedly changed appearance. instead of an interior scene, it was a winter landscape. the floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. the numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches. at each end of the room, on the wall, hung a beautiful bear-skin rug. these rugs were for prizes, one for the girls and one for the boys. and this was the game. the girls were gathered at one end of the room and the boys at the other, and one end was called the north pole, and the other the south pole. each player was given a small flag which they were to plant on reaching the pole. this would have been an easy matter, but each traveller was obliged to wear snowshoes.

In [26]:
X_few_wordcounts = WordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'the': 19, 'and': 9, 'wa': 6, 'a': 5, 'of': 4, 'on': 4, 'were': 4, 'one': 4, 'to': 3, 'with': 3, 'at': 3, 'each': 3, 'end': 3, 'for': 3, 'it': 2, 'an': 2, 'but': 2, 'like': 2, 'had': 2, 'room,': 2, 'been': 2, 'hung': 2, 'girl': 2, 'thi': 2, 'pole.': 2, 'when': 1, 'young': 1, 'peopl': 1, 'return': 1, 'ballroom,': 1, 'present': 1, 'decidedli': 1, 'chang': 1, 'appearance.': 1, 'instead': 1, 'interior': 1, 'scene,': 1, 'winter': 1, 'landscape.': 1, 'floor': 1, 'cover': 1, 'snow-whit': 1, 'canvas,': 1, 'not': 1, 'laid': 1, 'smoothly,': 1, 'rumpl': 1, 'over': 1, 'bump': 1, 'hillocks,': 1, 'real': 1, 'snow': 1, 'field.': 1, 'numer': 1, 'palm': 1, 'evergreen': 1, 'that': 1, 'decor': 1, 'powder': 1, 'flour': 1, 'strewn': 1, 'tuft': 1, 'cotton,': 1, 'snow.': 1, 'also': 1, 'diamond': 1, 'dust': 1, 'lightli': 1, 'sprinkl': 1, 'them,': 1, 'glitter': 1, 'crystal': 1, 'icicl': 1, 'from': 1, 'branches.': 1, 'wall,': 1, 'beauti': 1, 'bear-skin': 1, 'rug.': 1, 'these': 1, 'rug': 1, 'priz

In [27]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [28]:
to_vector_trans = WordCounterToVectorTransformer(vocabulary_size=300)
X_few_word_vectors = to_vector_trans.fit_transform(X_few_wordcounts)
X_few_word_vectors.toarray()

array([[ 0, 19,  9,  3,  6,  0,  5,  4,  3,  4,  0,  2,  3,  4,  0,  0,
         2,  2,  4,  0,  2,  2,  3,  3,  3,  1,  0,  0,  0,  0,  1,  1,
         2,  1,  1,  1,  2,  1,  2,  2,  2,  2,  1,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [29]:
preprocess_pipeline = Pipeline([
    ("word_counter", WordCounterTransformer()),
    ("to_vectors", to_vector_trans)
])

X_vector = preprocess_pipeline.fit_transform(X_clean)
X_vector

<2834x301 sparse matrix of type '<class 'numpy.intc'>'
	with 143063 stored elements in Compressed Sparse Row format>

In [30]:
X_vector.shape

(2834, 301)

In [31]:
to_vector_trans.vocabulary_

{'the': 1,
 'and': 2,
 'of': 3,
 'to': 4,
 'a': 5,
 'in': 6,
 'wa': 7,
 'is': 8,
 'it': 9,
 'that': 10,
 'he': 11,
 'as': 12,
 'for': 13,
 'with': 14,
 'hi': 15,
 'on': 16,
 'they': 17,
 'be': 18,
 'had': 19,
 'are': 20,
 'at': 21,
 'not': 22,
 'by': 23,
 'i': 24,
 'but': 25,
 'from': 26,
 'thi': 27,
 'she': 28,
 'have': 29,
 'were': 30,
 'or': 31,
 'which': 32,
 'her': 33,
 'one': 34,
 'you': 35,
 'all': 36,
 'when': 37,
 'their': 38,
 'so': 39,
 'an': 40,
 'we': 41,
 'there': 42,
 'can': 43,
 'veri': 44,
 'would': 45,
 'littl': 46,
 'into': 47,
 'number': 48,
 'been': 49,
 'up': 50,
 'out': 51,
 'use': 52,
 'some': 53,
 'other': 54,
 'ha': 55,
 'will': 56,
 'about': 57,
 'who': 58,
 'like': 59,
 'if': 60,
 'my': 61,
 'could': 62,
 'no': 63,
 'him': 64,
 'what': 65,
 'more': 66,
 'then': 67,
 'these': 68,
 'them': 69,
 'call': 70,
 'do': 71,
 'make': 72,
 'than': 73,
 'two': 74,
 'said': 75,
 'made': 76,
 'time': 77,
 'mani': 78,
 'great': 79,
 'our': 80,
 'look': 81,
 'over': 82,
 'o

## TODO
- process further with word relevance with tf-idf etc.

# Try some simple regression training models

In [32]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
def rough_score(model):
    scores = cross_val_score(model, X_vector, y, scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    display_scores(rmse_scores)

### Linear Regression

In [33]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

model_lin_reg = Ridge()
rough_score(model_lin_reg)

Scores: [0.72694971 0.8302842  0.86237999 0.92107784 0.70845068 0.79678526
 0.82555318 0.84382759 0.70902002 0.75689229]
Mean: 0.7981220749545953
Standard deviation: 0.06772767796173992


### RandomForestRegressor

In [34]:
from sklearn.ensemble import RandomForestRegressor

model_forest = RandomForestRegressor(random_state=42)
rough_score(model_forest)

Scores: [0.7175718  0.8253037  0.90773836 0.92764329 0.7888376  0.87455935
 0.85325812 0.88207377 0.7113172  0.83213016]
Mean: 0.8320433344786782
Standard deviation: 0.07015732836642817


### Conclusion 1
For such a simple approach, better than expected... (less than 20% of the range, so better than random at least)

## Try a different approach, engineering whole-passage features and combining them with the word vector prediction

A paper on readability features: https://www.aclweb.org/anthology/2020.bea-1.1.pdf

### TODO
First features to capture:
- Longer sentences
- passive clauses
- subordinate clauses (number of occurences of 'which'??)

In [55]:
# Add average word size feature

def average_word_size_feature(X, y=None):
    word_size_count = 0
    word_count = 0
    word_count_array = []
    for text in X:
        for word in text.split():
            word_size_count = word_size_count + len(word)
            word_count = word_count + 1
        word_count_array.append(word_size_count / word_count)
    return np.array(word_count_array)

In [56]:
df = pd.DataFrame()

In [57]:
df["avg_word_size"] = average_word_size_feature(X_clean)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   avg_word_size  2834 non-null   float64
dtypes: float64(1)
memory usage: 22.3 KB


In [59]:
df.describe()

Unnamed: 0,avg_word_size
count,2834.0
mean,4.70205
std,0.113658
min,4.368753
25%,4.655808
50%,4.724962
75%,4.779325
max,4.857056


#### Note: range ~ 0.5, perhaps not so relevant?