![CommonLit Readability Prize competition header](https://storage.googleapis.com/kaggle-competitions/kaggle/25914/logos/header.png)

In this competition, you’ll build algorithms to rate the complexity of reading passages for grade 3-12 classroom use. To accomplish this, you'll pair your machine learning skills with a dataset that includes readers from a wide variety of age groups and a large collection of texts taken from various domains. Winning models will be sure to incorporate text cohesion and semantics.

### Data Description

#### Files
* **train.csv** - the training set
* **test.csv** - the test set
* **sample_submission.csv** - a sample submission file in the correct format

#### Columns
* `id` - unique ID for excerpt
* `url_legal` - URL of source - this is blank in the test set.
* `license` - license of source material - this is blank in the test set.
* `excerpt` - text to predict reading ease of
* `target` - reading ease
* `standard_error` - measure of spread of scores among multiple raters for each excerpt. Not included for test data.

# EDA

In [None]:
import re
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk import tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Activate seaborn's default plot styling for the figures drawn in this notebook.
sns.set()

In [None]:
# Training split of the competition data (columns are listed under "Data Description").
TRAIN_CSV_PATH = '../input/commonlitreadabilityprize/train.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

# Target and Standard Error

In [None]:
# Histogram of the `target` column; bar heights are normalised with stat='probability'.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=train, x='target', stat='probability', ax=ax)
ax.set_title("Target's distribution")

plt.show()

In [None]:
# One point per training excerpt: its target on x against its standard_error on y.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=train, x='target', y='standard_error', ax=ax)
ax.set_title('How standard error changes with target values')

plt.show()

- In general, the standard error is lowest when the readability is around -1. It tends to get higher for more extreme values of readability.
- There is one data point with 0 readability and 0 standard error.

# License and Target

In [None]:
# For every row, the number of training excerpts that share its license.
# Selecting `id` before transforming avoids counting every other column
# only to throw the results away.
train['license_type_cnt'] = train.groupby('license')['id'].transform('count')
# Keep only licenses that occur at least 10 times.
license_data = train[train['license_type_cnt'] >= 10]

fig, ax = plt.subplots(figsize=(12, 8))
# Mean target per license with standard-deviation error bars and no
# connecting line; drawn explicitly on `ax`, like the other plotting cells.
sns.pointplot(
    data=license_data,
    x='license',
    y='target',
    ci='sd',
    join=False,
    ax=ax,
)

plt.show()

Different licenses have different average readability, but the standard deviation is quite high.

Note that in test data, the license field is always blank.

# Clean and standardize texts

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the WordNet POS constant used for lemmatization.

    The tag is matched by its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.  Any tag that matches
    none of these prefixes falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
# Built once instead of calling stopwords.words('english') for every word;
# a frozenset also makes each membership test a hash lookup rather than a
# scan over the whole stop-word list.
_STOP_WORDS = frozenset(stopwords.words('english'))
# Any run of characters other than ASCII letters and digits.
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9]+')

def clean_text(text):
    """Normalise a piece of text into a list of lemmatized content words.

    Steps, in order:
      1. lower-case the text and replace every run of non-alphanumeric
         characters with a single space;
      2. tokenize and POS-tag the result with NLTK;
      3. lemmatize each token using its POS tag (see ``get_wordnet_pos``);
      4. drop lemmas that are English stop words.

    Args:
        text: the raw string to clean.

    Returns:
        The remaining lemmas, in their original order.
    """
    text = _NON_ALNUM_RE.sub(' ', text.lower())
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    lemmas = (lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged)
    # Stop words are filtered after lemmatization, so the lemma is what gets checked.
    return [lemma for lemma in lemmas if lemma not in _STOP_WORDS]

def get_ngrams(words, n):
    """Return every run of ``n`` consecutive items of ``words`` as a tuple.

    The tuples are listed in order of their starting position.  When
    ``words`` has fewer than ``n`` items the result is an empty list.
    """
    window_count = len(words) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(tuple(words[start:start + n]))
    return grams

In [None]:
# Build one record per training excerpt: its target, raw text, cleaned
# sentences, and the n-grams of *all* of its sentences.  N-grams are generated
# sentence by sentence, so none of them spans a sentence boundary.
corpus = []
for text, target in train[['excerpt', 'target']].itertuples(index=False):
    sentences = []
    # Accumulate over the whole excerpt.  Rebinding these names inside the
    # sentence loop would keep only the last sentence's n-grams, and would
    # leave stale (or undefined) values for an excerpt with no sentences.
    unigrams = []
    bigrams = []
    trigrams = []
    for sentence in tokenize.sent_tokenize(text):
        words = clean_text(sentence)
        sentences.append(words)
        unigrams.extend(get_ngrams(words, n=1))
        bigrams.extend(get_ngrams(words, n=2))
        trigrams.extend(get_ngrams(words, n=3))
    corpus.append({
        'target' : target,
        'text' : text,
        'sentences' : sentences,
        'unigrams' : unigrams,
        'bigrams' : bigrams,
        'trigrams' : trigrams,
    })

# Order records by ascending target so that slices taken from either end of
# `corpus` select the lowest- and highest-target excerpts.
corpus = sorted(corpus, key=lambda x: x['target'])

# N-grams

In [None]:
def _draw_gram_bars(axis, ranked_grams, gram_type, extreme):
    """Draw a bar chart of ``(n-gram, average target)`` pairs on ``axis``.

    ``extreme`` is the word ('lowest' or 'highest') inserted into the title.
    An empty ``ranked_grams`` simply yields an empty chart.
    """
    positions = range(len(ranked_grams))
    axis.bar(positions, [avg for _, avg in ranked_grams])

    axis.set_title(f'{gram_type} with {extreme} readability')
    axis.set_xlabel(gram_type)
    axis.set_ylabel('Average readability')
    axis.set_xticks(positions)
    # Each n-gram is a tuple of words; show it as a space-separated phrase.
    axis.set_xticklabels([' '.join(gram) for gram, _ in ranked_grams], rotation='vertical')


def plot_grams_target(gram_type, min_count=5, top_k=10):
    """Plot the n-grams with the lowest and highest average target.

    For every n-gram stored under ``gram_type`` in the global ``corpus``, the
    target of the containing excerpt is averaged over all of the n-gram's
    occurrences (an n-gram repeated inside one excerpt counts each time).
    Two bar charts are shown side by side: the ``top_k`` n-grams with the
    lowest average on the left and the ``top_k`` with the highest on the right.

    Args:
        gram_type: key of the corpus records to read, e.g. 'unigrams',
            'bigrams' or 'trigrams'.
        min_count: n-grams with fewer occurrences than this are ignored.
        top_k: non-negative number of n-grams shown in each chart.
    """
    gram_cnt = defaultdict(int)
    gram_sum = defaultdict(float)

    for datapoint in corpus:
        for gram in datapoint[gram_type]:
            gram_cnt[gram] += 1
            gram_sum[gram] += datapoint['target']

    gram_avg = {
        gram: gram_sum[gram] / count
        for gram, count in gram_cnt.items()
        if count >= min_count
    }

    # Sort once (ascending average) and take both ends of the ranking.
    ranked = sorted(gram_avg.items(), key=lambda item: item[1])
    top_lowest = ranked[:top_k]
    # Explicit start index so that top_k == 0 gives an empty slice, not the full list.
    top_highest = ranked[max(len(ranked) - top_k, 0):]

    fig, ax = plt.subplots(1, 2, figsize=(12, 8))
    _draw_gram_bars(ax[0], top_lowest, gram_type, 'lowest')
    _draw_gram_bars(ax[1], top_highest, gram_type, 'highest')

    plt.show()

In [None]:
# Single words with the lowest and highest average target.
plot_grams_target('unigrams')

In [None]:
# Two-word sequences with the lowest and highest average target.
plot_grams_target('bigrams')

Note that we only examine n-grams with at least 5 occurrences.

# Target and sentence length

In [None]:
def _mean_sentence_length(datapoint):
    """Average number of cleaned words per sentence of one corpus record."""
    word_counts = [len(sentence) for sentence in datapoint['sentences']]
    return np.mean(word_counts)


# `corpus` is sorted by ascending target, so its two ends hold the 500
# lowest-target and the 500 highest-target excerpts.
top_lowest = corpus[:500]
top_highest = corpus[-500:]

lowest_target_sentence_lengths = [_mean_sentence_length(record) for record in top_lowest]
highest_target_sentence_lengths = [_mean_sentence_length(record) for record in top_highest]

# Mean and spread of the per-excerpt averages within each group.
top_lowest_mean = np.mean(lowest_target_sentence_lengths)
top_highest_mean = np.mean(highest_target_sentence_lengths)
top_lowest_std = np.std(lowest_target_sentence_lengths)
top_highest_std = np.std(highest_target_sentence_lengths)

fig, ax = plt.subplots(figsize=(6, 8))
group_positions = [0, 1]
ax.errorbar(
    x=group_positions,
    y=[top_lowest_mean, top_highest_mean],
    yerr=[top_lowest_std, top_highest_std],
    fmt='o',
)

ax.set_title('Average sentence length and Readability')
ax.set_ylabel('Sentence length')
ax.set_xticks(group_positions)
ax.set_xticklabels(['Top lowest readability', 'Top highest readability'])

plt.show()

Texts with shorter sentences tend to be easier to read. (Sentence length here is the number of words left after stop-word removal.)

In [None]:
!pip install wordfreq

from wordfreq import word_frequency
def _unigram_frequencies(datapoint):
    """Look up the English word frequency of every unigram in one corpus record."""
    # A unigram is stored as a 1-tuple, so the word itself is element 0.
    return [word_frequency(gram[0], 'en') for gram in datapoint['unigrams']]


# One list of word frequencies per excerpt, for each of the two groups.
lowest_target_word_freq = [_unigram_frequencies(record) for record in top_lowest]
highest_target_word_freq = [_unigram_frequencies(record) for record in top_highest]

In [None]:
# Frequency of the rarest word of each excerpt.  Excerpts without any unigram
# are skipped: np.min raises ValueError on an empty sequence, which would
# abort the whole cell.
lowest_min_freq = [np.min(freqs) for freqs in lowest_target_word_freq if freqs]
highest_min_freq = [np.min(freqs) for freqs in highest_target_word_freq if freqs]

# Compare the two groups by the mean (+/- standard deviation) of that minimum.
fig, ax = plt.subplots(figsize=(6, 8))
ax.errorbar(
    x=[0, 1],
    y=[np.mean(lowest_min_freq), np.mean(highest_min_freq)],
    yerr=[np.std(lowest_min_freq), np.std(highest_min_freq)],
    fmt='o'
)

ax.set_ylabel('Min word frequency in general English')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Top lowest readability', 'Top highest readability'])

plt.show()

Hard-to-read texts tend to contain rarer words: on average, their least frequent word is less common in general English than that of easy-to-read texts.