# Introduction 📝
🎯 **Goal:** To build algorithms to rate the complexity of reading passages for grade 3-12 classroom use. 

📖 **Data:** 
> **train.csv / test.csv** - the training and testing set
> - ```id``` - unique ID for excerpt
> - ```url_legal``` - URL of source 
> - ```license``` - license of source material 
> - ```excerpt``` - text to predict reading ease of
> - ```target``` - reading ease
> - ```standard_error``` - measure of spread of scores among multiple raters for each excerpt

📌 **Note:** ```url_legal```, ```license``` and ```standard_error``` are blank in the test set.

### Copied and adapted from https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline

# Import libraries 📚

In [None]:

#!pip install textstat
#!pip install nlpaug
#!pip install torch>=1.6.0 transformers>=4.0.0
#!pip install torch>=1.6.0 fairseq>=0.9.0 sacremoses>=0.0.43 fastBPE>=0.1.0
#!pip install nltk>=3.4.5

import sys

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import re
import nltk
# Add the textstat input directory to the module search path; like the other
# inputs used in this file it lives under ../input/.
sys.path.append('../input/textstat')
#import textstat
import time
import wandb
import xgboost as xgb


from pandas import DataFrame
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
from wordcloud import WordCloud,STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse

from nltk import tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

from scipy.stats import gaussian_kde

#nltk.download('stopwords')

# Data loading

In [None]:
# Load the training and test CSV files into DataFrames.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

# Statistical feature extraction

We want to extract some features that could influence the readability of a text, such as the average sentence length, the average word length, ...

In [None]:
# English stop words as a set; tokenize() tests membership against it.
stopwords_en = set(stopwords.words('english'))
# Single WordNet lemmatizer instance used by tokenize().
lemma = nltk.WordNetLemmatizer()

def tokenize(text):
    """Return the lemmatized, non-stop-word tokens of *text*.

    The text is lower-cased and every character outside ``a-z`` is replaced
    by a space before it is split into tokens with ``nltk.word_tokenize``.
    Tokens found in ``stopwords_en`` are skipped; the rest are lemmatized
    with the module-level ``lemma`` object.
    """
    letters_only = re.sub(r'[^a-z]', ' ', text.lower())
    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stopwords_en:
            continue
        kept.append(lemma.lemmatize(token))
    return kept
def normalize(text):
    """Return the tokens produced by ``tokenize(text)`` joined by single spaces."""
    tokens = tokenize(text)
    return ' '.join(tokens)
def word_count(text):
    """Return the number of whitespace-separated words in *text*.

    Splitting on any run of whitespace (instead of on a single space) keeps
    newlines and repeated spaces from distorting the count, and makes an
    empty string count as zero words.
    """
    return len(text.split())
def long_words(text, length):
    """Count the words in *text* that have at least *length* characters.

    Words are separated by any run of whitespace, so two words joined only
    by a newline are measured separately. Punctuation attached to a word
    still counts toward its length.
    """
    return sum(1 for word in text.split() if len(word) >= length)
def max_word_len(text):
    """Return the length of the longest word in *text* (0 if there is none).

    The punctuation characters ``,.;@#?!&$`` are replaced by spaces before
    the text is split on whitespace, so trailing punctuation and newlines
    do not inflate a word's length.
    """
    words = re.sub(r"[,.;@#?!&$]+", " ", text).split()
    # default=0 avoids a ValueError on empty or punctuation-only text.
    return max((len(word) for word in words), default=0)
def avg_sen_len(text):
    """Return the mean number of words per sentence in *text*.

    Sentences are delimited by runs of periods, so an ellipsis counts as a
    single boundary. Segments without any words (for example the empty
    remainder after the final period) are ignored. Returns 0.0 when the
    text contains no words.
    """
    word_counts = (len(segment.split()) for segment in re.split(r"\.+", text))
    sen_lens = [count for count in word_counts if count > 0]
    if not sen_lens:
        return 0.0
    return sum(sen_lens) / len(sen_lens)
def avg_word_len(text):
    """Return the mean word length in *text*, or 0.0 if it has no words.

    The punctuation characters ``,.;@#?!&$`` are replaced by spaces before
    the text is split on whitespace, so they do not count toward the
    length of the word they are attached to.
    """
    words = re.sub(r"[,.;@#?!&$]+", " ", text).split()
    if not words:
        # Empty or punctuation-only text: avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)
def punct_count(text):
    """Return how many punctuation/symbol characters occur in *text*.

    The characters counted are ``[ ] , . ; : @ # ? ! & $ + * -``.
    Whitespace is not counted.
    """
    symbols = "[],.;:@#?!&$+*-"
    return sum(text.count(symbol) for symbol in symbols)
def min_sen_len(text):
    """Return the word count of the shortest sentence in *text*.

    Sentences are delimited by runs of periods, so ``..`` and ``...`` act
    as a single boundary. Segments without any words (such as the empty
    remainder after the final period) are ignored. Returns 0 when the text
    contains no words.
    """
    word_counts = (len(segment.split()) for segment in re.split(r"\.+", text))
    return min((count for count in word_counts if count > 0), default=0)
def max_sen_len(text):
    """Return the word count of the longest sentence in *text*.

    Sentences are delimited by runs of periods, so ``..`` and ``...`` act
    as a single boundary. Returns 0 when the text contains no words.
    """
    # re.split always yields at least one segment; empty segments contribute
    # a length of 0 and therefore never win the maximum.
    return max(len(segment.split()) for segment in re.split(r"\.+", text))
    

def extract_statistical_feature(dataframe):
    """Add text-statistics feature columns to *dataframe* in place.

    Every feature is computed from the ``excerpt`` column. Excerpts are read
    in row order rather than looked up by the labels 0..n-1, so the frame
    does not need a default integer index.

    Columns added: ``tokens``, ``normalized``, ``count``, ``len``,
    ``word7``, ``word10``, ``word13``, ``max_word_len``, ``punct_count``,
    ``avg_sen_len``, ``min_sen_len``, ``max_sen_len`` and ``avg_word_len``.

    Returns None; the frame passed in is modified.
    """
    excerpts = dataframe['excerpt']

    # Tokenize each excerpt once and reuse the result for both columns:
    # normalize(text) is the space-joined output of tokenize(text).
    token_lists = [tokenize(str(text)) for text in excerpts]
    dataframe['tokens'] = [np.array(tokens) for tokens in token_lists]
    dataframe['normalized'] = [' '.join(tokens) for tokens in token_lists]

    dataframe['count'] = excerpts.apply(word_count)
    dataframe['len'] = excerpts.apply(len)
    dataframe['word7'] = excerpts.apply(lambda t: long_words(t, 7))
    dataframe['word10'] = excerpts.apply(lambda t: long_words(t, 10))
    dataframe['word13'] = excerpts.apply(lambda t: long_words(t, 13))
    dataframe['max_word_len'] = excerpts.apply(max_word_len)
    dataframe['punct_count'] = excerpts.apply(punct_count)
    dataframe['avg_sen_len'] = excerpts.apply(avg_sen_len)
    dataframe['min_sen_len'] = excerpts.apply(min_sen_len)
    dataframe['max_sen_len'] = excerpts.apply(max_sen_len)
    dataframe['avg_word_len'] = excerpts.apply(avg_word_len)

    # Optional textstat-based features, disabled while `import textstat`
    # is commented out at the top of the file:
    #dataframe['sentence_count'] = excerpts.apply(textstat.sentence_count)
    #dataframe['syllabe_num'] = excerpts.apply(textstat.syllable_count)
    #dataframe['lexicon_count'] = excerpts.apply(textstat.lexicon_count)
    #dataframe['difficult_words'] = excerpts.apply(textstat.difficult_words)




In [None]:
# Add the statistical feature columns to the training frame (it is modified in place).
extract_statistical_feature(train_df)

In [None]:
def readability_label(target):
    """Bucket a ``target`` score into the class label 1, 2 or 3.

    Scores of at least -0.5 map to 3, scores of at least -1.5 map to 2 and
    everything else maps to 1. These cut-offs allow an almost even
    distribution among the classes.
    """
    for lower_bound, label in ((-0.5, 3), (-1.5, 2)):
        if target >= lower_bound:
            return label
    return 1

# Class label per excerpt, derived from its target score (useful for plotting).
# Series.apply maps row by row, so no assumption about the index labels is needed.
train_df['readability_label'] = train_df['target'].apply(readability_label)

In [None]:
# Histogram of the class labels, to see how evenly the three buckets are filled.
train_df['readability_label'].hist()

In [None]:
# Names of the extracted feature columns that are fed to the regressor.
training_vars = ['count','len','word7','word10','word13',
                 'avg_sen_len','avg_word_len','punct_count','max_sen_len','min_sen_len','max_word_len']

In [None]:
# Summary statistics of the training frame.
train_df.describe()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Show the row labelled 0: its text, its target score and its standard error.
for column in ('excerpt', 'target', 'standard_error'):
    print(train_df[column][0])

# Data visualization

In [None]:
#plt.scatter(train_df['target'],train_df['standard_error'])

# Density-coloured scatter plot of target vs. standard error.
# Plain NumPy arrays are used so that the argsort result below selects by
# position; indexing a Series with it would be a lookup by index label.
x = train_df['target'].to_numpy()
y = train_df['standard_error'].to_numpy()

# Calculate the point density
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=0.5)
ax.set_xlabel('target')
ax.set_ylabel('standard error')
plt.show()

We want to eliminate the point whose target and standard error are both equal to 0, so that the plot is easier to read.

In [None]:
# Drop the columns that are not used from here on, remove the rows whose
# standard error is exactly 0 and renumber the remaining rows from 0.
# reset_index(drop=True) discards the old index directly instead of turning
# it into an 'index' column that then has to be deleted.
train_df = (
    train_df
    .drop(columns=['url_legal', 'license', 'id'])
    .loc[lambda frame: frame['standard_error'] != 0.]
    .reset_index(drop=True)
)

In [None]:
#plt.scatter(train_df['target'],train_df['standard_error'])

# Density-coloured scatter plot of target vs. standard error, after filtering.
# Plain NumPy arrays are used so that the argsort result below selects by
# position; indexing a Series with it would be a lookup by index label.
x = train_df['target'].to_numpy()
y = train_df['standard_error'].to_numpy()

# Calculate the point density
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

fig, ax = plt.subplots()
ax.scatter(x, y, c=z, s=0.5)
ax.set_xlabel('target')
ax.set_ylabel('standard error')
plt.show()

In [None]:
def show_word_cloud(corpus):
    """Draw a word cloud of the ``normalized`` column of *corpus*.

    All entries of ``corpus['normalized']`` are joined into one string and
    rendered as a 1000x600 cloud of at most 150 words, with the wordcloud
    package's ``STOPWORDS`` excluded. Each call draws on a new figure; the
    caller is responsible for calling ``plt.show()``.
    """
    wc = WordCloud(stopwords=STOPWORDS, width=1000, height=600, max_words=150)
    wc.generate(' '.join(corpus['normalized']))
    # plt.figure() without a number always opens a fresh figure, so no
    # module-level counter is needed and an already open figure is never
    # drawn over.
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')


show_word_cloud(train_df)
plt.show()

We draw a scatter plot for each pair of variables. This allows us to see whether there is any evident correlation between the variables we extracted.

P.S. Plotting this could be a bit slow

In [None]:
# Pairwise plots of the training frame, coloured by readability class.
# corner=True and diag_kind="kde" are passed through to seaborn; alpha=0.2
# makes the individual points semi-transparent.
datasScatter = sns.pairplot(train_df, hue = 'readability_label', plot_kws={'alpha': 0.2}, 
                            corner = True, diag_kind="kde", palette = 'cividis')
#datasScatter.map_lower(sns.kdeplot, levels=4, color=".2")
#datasScatter.savefig("Datas_scatter.png", facecolor = 'white')

# XGBOOST regressor model

In [None]:
# Hold out 10% of the rows for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(train_df[training_vars], train_df['target'], test_size=0.1, random_state=42)

In [None]:
# XGBoost regressor trained on the hand-crafted statistical features only.
# NOTE(review): `alpha` is presumably XGBoost's L1 regularisation term
# (alias of reg_alpha) — confirm against the installed xgboost version.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100, verbosity = 1, random_state = 42)

# Fit on the training split, then predict the held-out split.
xg_reg.fit(x_train,y_train)

preds = xg_reg.predict(x_test)

# mse is sklearn's mean_squared_error; its square root is the RMSE.
rmse = np.sqrt(mse(y_test, preds))
print("RMSE: %f" % (rmse))         #about 0.82

In [None]:
# Refit the regressor on every training row (not just the training split).
xg_reg.fit(train_df[training_vars], train_df['target'])

# Compute the same feature columns for the test set, then predict its targets.
extract_statistical_feature(test_df)
test_pred = xg_reg.predict(test_df[training_vars])

# Submission file 📝

In [None]:
# Assemble the submission table (test ids with their predicted targets),
# write it to the working directory and display it.
predictions = pd.DataFrame({'id': test_df['id'], 'target': test_pred})
predictions.to_csv("/kaggle/working/submission.csv", index=False)
predictions