<p>Based on the following kernels:</p>
<p>I.CommonLit: Explore + XGBRF&RepeatedFold Model</p>
<p>https://www.kaggle.com/andradaolteanu/i-commonlit-explore-xgbrf-repeatedfold-model</p>
<br>
<p>CommonLit Readability Prize: EDA + Baseline</p>
<p>https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline</p>

<h1>Import libraries 📚</h1>

In [None]:
! pip install textstat

import os
import pickle
import re
import string
import time
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import textstat
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.stats import spearmanr
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRFRegressor


In [None]:
# Load the train and test CSV files of the CommonLit Readability Prize
# dataset into DataFrames (paths are relative to the working directory).
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")


<h1>Preprocessing</h1>

In [None]:
class color:
    """ANSI escape sequences used to highlight headings in printed output."""
    # Bold attribute followed by the bright-yellow foreground code.
    BOLD = '\033[1m\033[93m'
    # Resets all text attributes back to the terminal default.
    END = '\033[0m'


In [None]:
def clean_paragraph(paragraph, verbose=False):
    '''Normalize a paragraph into a space-separated string of lemmas.

    The text is tokenized with NLTK's ``word_tokenize``, lower-cased,
    stripped of punctuation characters, reduced to purely alphabetic
    tokens, filtered against NLTK's English stopword list and finally
    lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    paragraph : str
        Raw text to clean.
    verbose : bool, default False
        When True, print every token whose lemma differs from the token.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    '''

    # Tokenize & convert to lower case
    tokens = [t.lower() for t in word_tokenize(paragraph)]

    # Remove punctuation & non alphabetic characters from each word
    table = str.maketrans('', '', string.punctuation)
    tokens = [t.translate(table) for t in tokens]
    tokens = [t for t in tokens if t.isalpha()]

    # Filter out stopwords. A set gives constant-time membership tests;
    # the plain list was scanned from the start for every single token.
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]

    # Lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens_lemm = [lemmatizer.lemmatize(t) for t in tokens]

    if verbose:
        print(color.BOLD +
              "Show difference between original and lemmatized token:" +
              color.END)
        for a, b in zip(tokens, tokens_lemm):
            if a != b:
                print(a, " | ", b)

    return " ".join(tokens_lemm)


In [None]:
# Demonstrate the cleaning on a single excerpt (row label 1), printing the
# tokens changed by lemmatization along the way
cleaned_paragraph = clean_paragraph(train_df["excerpt"][1], verbose=True)

# Show the raw excerpt next to its cleaned version
print(f"\n{color.BOLD}Original Text:{color.END}",
      f"\n{train_df['excerpt'][1]}",
      f"\n\n{color.BOLD}After Cleaning:{color.END}",
      f"\n{cleaned_paragraph}")

# Clean every excerpt of both datasets into a new "text" column
train_df["text"] = train_df["excerpt"].apply(clean_paragraph)
test_df["text"] = test_df["excerpt"].apply(clean_paragraph)


<h1>XGBRFRegressor model ⚙️</h1>

In [None]:
# Model inputs: the cleaned "text" column (X) and the "target" column (y)
X = train_df["text"]
y = train_df['target']

# Cross-validation splitter reused by every evaluation below:
# 5 folds repeated 5 times, with a fixed seed so the splits are repeatable
rkf = RepeatedKFold(n_repeats=5, n_splits=5, random_state=47)


In [None]:
# Baseline regressor: XGBRFRegressor with 120 estimators
xgbrf_model = XGBRFRegressor(n_estimators=120, n_jobs=6)

# Chain a binary unigram TF-IDF vectorizer with the regressor so that both
# steps are fitted together as one estimator
model = make_pipeline(
    TfidfVectorizer(binary=True, ngram_range=(1, 1)),
    xgbrf_model,
)

# Evaluate with the repeated K-fold splitter; the scorer returns the
# *negated* RMSE, so test_score values are negative
cv_results = cross_validate(model, X, y,
                            cv=rkf,
                            scoring='neg_root_mean_squared_error')
cv_results = pd.DataFrame(cv_results)
cv_results


In [None]:
# The scores are negated RMSE values, so the absolute value of their mean
# is the average RMSE over all folds
print(f'test score mean: {np.abs(cv_results.test_score.mean())}')


In [None]:
# Refit the pipeline on the whole training set and score it on that same
# data (an in-sample RMSE, to be compared with the cross-validated one)
y_pred = model.fit(X, y).predict(X)
# RMSE computed as sqrt(MSE): the `squared` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed, whereas this form
# works with every version
print(np.sqrt(mse(y, y_pred)))


<h1>XGBRFRegressor model (with new features) ⚙️</h1>

<h3>Features creation</h3>

In [None]:
def features_extraction(df, train=True, tfidfv=None, scaler=None):
    '''Build the model matrix: 13 scaled hand-crafted features + TF-IDF.

    The hand-crafted features are word-frequency statistics (looked up in
    the unigram frequency table), two word counts and six textstat
    readability scores. They are standardized and concatenated, column-wise,
    with the TF-IDF representation of ``df["text"]``.

    Side effects: the intermediate columns (``split_text``, ``freq_text``,
    ``freq_*``, ``no_words``, ``no_words_paragraph``, ``flr``, ``flkg``,
    ``fs``, ``ar``, ``cole``, ``lins``) are added to ``df`` in place, and the
    vectorizer vocabulary is pickled to ``tfidfvectorizer.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``excerpt`` and ``text`` columns, plus ``target``
        when ``train`` is True.
    train : bool, default True
        If True, fit ``tfidfv`` on ``df["text"]`` and return the target;
        otherwise reuse the already fitted ``tfidfv`` and return ``y=None``.
    tfidfv : TfidfVectorizer
        Vectorizer to fit (train) or to reuse (test). Required.
    scaler : StandardScaler, optional
        Scaler shared between the train and test calls: it is fitted when
        ``train`` is True and only applied otherwise. When omitted, a new
        scaler is fitted on ``df`` itself (the historical behaviour), which
        means test data gets standardized with its own statistics.

    Returns
    -------
    X : pandas.DataFrame
        The 13 scaled features followed by the TF-IDF columns, which are
        named with their column position as a string ('0', '1', ...).
    y : pandas.Series or None
        ``df["target"]`` when ``train`` is True, else None.

    Raises
    ------
    ValueError
        If ``tfidfv`` is not provided.
    '''
    if tfidfv is None:
        raise ValueError("features_extraction requires a TfidfVectorizer "
                         "passed as `tfidfv`")

    # English Word Frequencies Dataset, turned into a word -> count dict
    word_freq = pd.read_csv("../input/english-word-frequency/unigram_freq.csv")
    word_freq = dict(zip(word_freq["word"], word_freq["count"]))

    # Tokenize full text on single spaces
    df["split_text"] = df["excerpt"].apply(lambda x: x.split(" "))

    # Counts of the words that exist in the frequency table
    df["freq_text"] = df["split_text"].apply(
        lambda x: [word_freq[word] for word in x if word in word_freq])

    def _guarded(stat):
        # An excerpt without any known word yields an empty list: report 0.0
        # rather than NaN (mean/std) or a ValueError (min/max), so that the
        # estimators downstream always receive finite values.
        return lambda values: stat(values) if values else 0.0

    # Get sum, mean, std etc. from the text frequencies
    for suffix, stat in (("sum", np.sum), ("mean", np.mean), ("std", np.std),
                         ("min", np.min), ("max", np.max)):
        df[f"freq_{suffix}"] = df["freq_text"].apply(_guarded(stat))

    # Word counts of the cleaned text and of the raw excerpt
    df["no_words"] = df["text"].apply(lambda x: len(x.split(" ")))
    df["no_words_paragraph"] = df["excerpt"].apply(lambda x: len(x.split(" ")))

    # Readability scores computed by textstat on the raw excerpt
    readability_scores = {
        'flr': textstat.flesch_reading_ease,
        'flkg': textstat.flesch_kincaid_grade,
        'fs': textstat.gunning_fog,
        'ar': textstat.automated_readability_index,
        'cole': textstat.coleman_liau_index,
        'lins': textstat.linsear_write_formula,
    }
    for column, score in readability_scores.items():
        df[column] = df['excerpt'].apply(lambda x, score=score: score(x))

    # Scale these features (as they are HUGE)
    X = df[['freq_sum', 'freq_mean', 'freq_std', 'freq_min',
            'freq_max', 'no_words', 'no_words_paragraph',
            'flr', 'flkg', 'fs', 'ar', 'cole', 'lins']]
    y = pd.Series(df["target"]) if train else None

    if scaler is None:
        # No shared scaler supplied: fit a fresh one on this frame
        scaler = StandardScaler()
        scaled = scaler.fit_transform(X)
    elif train:
        scaled = scaler.fit_transform(X)
    else:
        scaled = scaler.transform(X)
    X_scaled = pd.DataFrame(scaled, columns=X.columns)

    # Fit only on training data, then transform once with the vectorizer
    # that was passed in (not with a same-named global)
    if train:
        tfidfv.fit(df["text"])
    tf_sparse = tfidfv.transform(df["text"])
    # String column names: scikit-learn >= 1.2 rejects DataFrames whose
    # column names mix str and int
    tf_matrix = pd.DataFrame.sparse.from_spmatrix(
        tf_sparse, columns=[str(i) for i in range(tf_sparse.shape[1])])

    # Persist the vocabulary; the context manager closes the file handle
    with open("tfidfvectorizer.pkl", "wb") as vocab_file:
        pickle.dump(tfidfv.vocabulary_, vocab_file)

    # Create final X variable, containing all info
    X = pd.concat([X_scaled, tf_matrix], axis=1)

    return X, y


In [None]:
# Word-level TF-IDF over 1- to 3-grams; terms seen in fewer than 3 documents
# are dropped. The flags are real booleans: recent scikit-learn releases
# validate parameter types and no longer accept ints for boolean options.
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', ngram_range=(1, 3),
                      use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')

# The vectorizer is fitted by the train call and reused by the test call
X_train, y_train = features_extraction(train_df, tfidfv=tfv)
X_test, y_test = features_extraction(test_df, train=False, tfidfv=tfv)


In [None]:
# Total column count of the training matrix: the 13 hand-crafted features
# plus one column per TF-IDF term
print(f'Number of features: {X_train.shape[1]}')


<h3>Feature analysis</h3>

In [None]:
def correlation_heatmap(df):
    '''Draw the pairwise correlation matrix of ``df`` as a heatmap.

    The matrix comes from ``df.corr()`` and is rendered on a 14x12 figure
    with a diverging palette and a colour scale fixed to [-1, 1].
    '''
    _, axes = plt.subplots(figsize=(14, 12))

    heatmap_options = dict(
        cmap=sns.diverging_palette(220, 10, as_cmap=True),
        square=True,
        cbar_kws={'shrink': .9},
        annot=False,
        linewidths=0.1,
        vmax=1.0,
        vmin=-1.0,
        linecolor='white',
        annot_kws={'fontsize': 12},
    )
    sns.heatmap(df.corr(), ax=axes, **heatmap_options)

    plt.title('Pearson Correlation of Features', y=1.05, size=15)


In [None]:
# Correlation matrix restricted to the first 13 columns, i.e. the
# hand-crafted features; the TF-IDF columns that follow are left out
correlation_heatmap(X_train.iloc[:, :13])


In [None]:
# Absolute Spearman rank correlation between each of the first 13
# (hand-crafted) features and the target
spearman_corr_scores = {}

for feature in X_train.columns[:13]:
    rho, _pvalue = spearmanr(X_train[feature], y_train)
    spearman_corr_scores[feature] = abs(rho)


In [None]:
# Re-order the scores from weakest to strongest correlation, then draw
# them as horizontal bars
spearman_corr_scores = dict(
    sorted(spearman_corr_scores.items(), key=lambda pair: pair[1]))

plt.figure(figsize=(16, 9))
plt.barh(list(spearman_corr_scores), list(spearman_corr_scores.values()))
plt.title('Correlations between new features and the target')
plt.show()


<h3>Model Eval (cross validation)</h3>

In [None]:
# Fresh regressor with the same settings as the baseline, now evaluated on
# the engineered feature matrix instead of the raw-text pipeline
xgbrf_model = XGBRFRegressor(n_estimators=120, n_jobs=6)

# Warnings emitted during cross-validation are silenced inside this block
# only; the scorer returns negated RMSE values
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cv_results = cross_validate(xgbrf_model,
                                X_train,
                                y_train,
                                cv=rkf,
                                scoring='neg_root_mean_squared_error')

cv_results = pd.DataFrame(cv_results)
cv_results


In [None]:
# Mean RMSE over all folds (scores are negated RMSE, hence the abs)
mean_rmse = np.abs(np.mean(cv_results.test_score))

# One bar per fold, with a horizontal line marking the mean
plt.figure(figsize=(16, 6))
plt.bar(x=list(cv_results.index), height=list(np.abs(cv_results.test_score)))
plt.axhline(y=mean_rmse, color='k')
# Anchor the label to the mean line and to the right-hand end of the bars
# rather than to coordinates that only fit one particular run
plt.text(x=len(cv_results) - 3.5, y=mean_rmse, s=mean_rmse, va='bottom')
plt.title('Test scores (cross validation)')
plt.show()


In [None]:
# Refit on the full training matrix and report the in-sample RMSE
y_pred = xgbrf_model.fit(X_train, y_train).predict(X_train)
# sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in
# 1.4 and later removed
print(np.sqrt(mse(y_train, y_pred)))


In [None]:
# Importance scores of type 'weight' from the fitted booster, kept as the
# ten highest-scoring features
fe_dict = xgbrf_model.get_booster().get_score(importance_type='weight')
fe_dict = (
    pd.DataFrame({"feature": fe_dict.keys(), "weight": fe_dict.values()})
    .sort_values("weight", ascending=False)
    .head(10)
)


In [None]:
# Bar chart of the retained features, each bar annotated with its weight
plt.figure(figsize=(16, 9))
ax = sns.barplot(data=fe_dict, x="feature", y="weight", palette="ocean")
for position, weight in enumerate(fe_dict["weight"]):
    # Place the label slightly above the top of the bar
    plt.text(position, weight + 10, str(weight),
             color='k', fontweight='bold')
plt.title("XGBRF: Feature Importance", size=25)
plt.xlabel("Features", size=20)
plt.ylabel("Weight", size=20)
plt.yticks([])
plt.show()


<h1>Ridge regression with the same new features</h1>

In [None]:
# Ridge regression on the engineered feature matrix. `normalize=False` is
# no longer passed: the argument was removed in scikit-learn 1.2 and
# omitting it keeps the same (non-normalizing) behaviour on every version.
ridge_model = Ridge(fit_intercept=True)

# Warnings emitted during cross-validation are silenced inside this block
# only; the scorer returns negated RMSE values
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cv_results = cross_validate(ridge_model,
                                X_train,
                                y_train,
                                cv=rkf,
                                scoring='neg_root_mean_squared_error')

cv_results = pd.DataFrame(cv_results)
cv_results


In [None]:
# Mean RMSE over all folds (scores are negated RMSE, hence the abs)
mean_rmse = np.abs(np.mean(cv_results.test_score))

# One bar per fold, with a horizontal line marking the mean
plt.figure(figsize=(16, 6))
plt.bar(x=list(cv_results.index), height=list(np.abs(cv_results.test_score)))
plt.axhline(y=mean_rmse, color='k')
# Anchor the label to the mean line and to the right-hand end of the bars
# rather than to coordinates that only fit one particular run
plt.text(x=len(cv_results) - 3.5, y=mean_rmse, s=mean_rmse, va='bottom')
plt.title('Test scores (cross validation)')
plt.show()


In [None]:
# Refit on the full training matrix and report the in-sample RMSE
y_pred = ridge_model.fit(X_train, y_train).predict(X_train)
# sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in
# 1.4 and later removed
print(np.sqrt(mse(y_train, y_pred)))


In [None]:
# Predict the test set with the fitted Ridge model, warnings silenced
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    y_pred = ridge_model.predict(X_test)

# Pair every test id with its prediction and write the submission file
predictions = pd.DataFrame({'id': test_df['id'], 'target': y_pred})
predictions.to_csv("submission.csv", index=False)
predictions


<h1>PCA</h1>

In [None]:
# Scree plot helper for a fitted PCA
def display_scree_plot(pca):
    '''Plot the explained variance of each component of ``pca``.

    Bars show the percentage of variance explained per component (ranked
    from 1) and the red line shows the cumulative percentage.
    '''
    variance_pct = pca.explained_variance_ratio_ * 100
    ranks = np.arange(1, len(variance_pct) + 1)

    plt.figure(figsize=(12, 9))
    plt.bar(ranks, variance_pct)
    plt.plot(ranks, variance_pct.cumsum(), c="red", marker='o')
    plt.xlabel("explained variance rank")
    plt.ylabel("explained variance percentage")
    plt.title("Scree of eigenvalues")
    plt.show(block=False)


In [None]:
# Compute the first 8 principal components of the training matrix.
# PCA is imported with the other libraries at the top of the notebook.
# The seed keeps the fit repeatable whenever PCA's automatic solver choice
# falls on a randomized SVD; it reuses the seed of the CV splitter.
pca = PCA(n_components=8, random_state=47)
t0 = time.time()
pca.fit(X_train)
print("fit time: %.2fs" % (time.time() - t0))

# scree of eigenvalues
display_scree_plot(pca)

# Percentage of variance explained by each component
print(pca.explained_variance_ratio_ * 100)


In [None]:
# Project the training matrix onto the principal components fitted above
X_train_pca = pca.transform(X_train)

# Cross-validate the Ridge model on the PCA-reduced features
# (warnings silenced inside this block only; scores are negated RMSE)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cv_results = cross_validate(ridge_model,
                                X_train_pca,
                                y_train,
                                cv=rkf,
                                scoring='neg_root_mean_squared_error')

cv_results = pd.DataFrame(cv_results)

# Absolute value of the mean negated score = average RMSE over the folds
print(f'test score mean: {np.abs(cv_results.test_score.mean())}')


In [None]:
# Refit Ridge on the PCA-reduced training data and report the in-sample RMSE
y_pred = ridge_model.fit(X_train_pca, y_train).predict(X_train_pca)
# sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in
# 1.4 and later removed
print(np.sqrt(mse(y_train, y_pred)))


In [None]:
# Cross-validate the XGBRF model on the PCA-reduced features
# (warnings silenced inside this block only; scores are negated RMSE)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cv_results = cross_validate(xgbrf_model,
                                X_train_pca,
                                y_train,
                                cv=rkf,
                                scoring='neg_root_mean_squared_error')

cv_results = pd.DataFrame(cv_results)

# Absolute value of the mean negated score = average RMSE over the folds
print(f'test score mean: {np.abs(cv_results.test_score.mean())}')


In [None]:
# Refit XGBRF on the PCA-reduced training data and report the in-sample RMSE
y_pred = xgbrf_model.fit(X_train_pca, y_train).predict(X_train_pca)
# sqrt(MSE) instead of `squared=False`, which scikit-learn deprecated in
# 1.4 and later removed
print(np.sqrt(mse(y_train, y_pred)))