In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split 
import tensorflow as tf

In [None]:
# Fetch NLTK's stop-word corpus; text_preprocessing() below reads it through
# stopwords.words('english').
nltk.download("stopwords")

# **CommonLit Readability Prize *(A Kaggle competition)***
### Rate the complexity of literary passages for grades 3-12 classroom use 



**Goal:**  To identify the appropriate reading level of a passage of text and to improve readability rating methods

### Kaggle assignment
In this competition, you’ll build algorithms to rate the complexity of reading passages for grade 3-12 classroom use. To accomplish this, you'll pair your machine learning skills with a dataset that includes readers from a wide variety of age groups and a large collection of texts taken from various domains. If successful, you'll aid administrators, teachers, and students. Literacy curriculum developers and teachers who choose passages will be able to quickly and accurately evaluate works for their classrooms. Plus, these formulas will become more accessible for all. Perhaps most importantly, students will benefit from feedback on the complexity and readability of their work, making it far easier to improve essential reading skills.

In [None]:
# Load the competition's training file and preview its first rows.
train_csv_path = '../input/commonlitreadabilityprize/train.csv'
df = pd.read_csv(train_csv_path)
df.head()



# Text preprocessing

In [None]:
# Exclude stop words from text
def text_preprocessing(df, col):
    """Lower-case, stop-word-filter and stem every text in one column of ``df``.

    Each cell of the column at integer position ``col`` is lower-cased, split
    on whitespace, stripped of English stop words and has its remaining words
    reduced to their Porter stem. The cleaned words are re-joined with single
    spaces and written back into the same cell.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        col: Integer (positional) index of the text column.

    Returns:
        The same ``df`` object, with column ``col`` holding the cleaned text.
        An empty frame is returned unchanged.
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    def _clean(text):
        # Drop stop words first, then stem whatever is left.
        kept = (word for word in text.lower().split() if word not in stop_words)
        return ' '.join(stemmer.stem(word) for word in kept)

    # Clean and write back inside the row loop so that *every* row is
    # processed (previously only the last row was ever rewritten).
    for i in range(len(df)):
        df.iloc[i, col] = _clean(df.iloc[i, col])

    return df


# Clean the column at position 3 in place
# (presumably the passage text column - confirm against train.csv).
df = text_preprocessing(df,3)

# Exploring the Data

### Distribution of the values for `target` (histogram)

In [None]:
fig, ax = plt.subplots(tight_layout=True)

# N is the count in each bin, bins holds the bin edges and patches the drawn bars
N, bins, patches = ax.hist(df.target, bins=50)

# Colour each bar by its height: scale the per-bin counts to [0, 1].
# (Using the raw per-row target values here would colour the 50 bars by the
# first 50 samples instead of by bar height.)
fracs = N / N.max()

# Normalize the fractions to the full range of the colormap
norm = colors.Normalize(fracs.min(), fracs.max())

# Set the colour of every bar
for thisfrac, thispatch in zip(fracs, patches):
    color = plt.cm.viridis(norm(thisfrac))
    thispatch.set_facecolor(color)

ax.set_title('Distribution of target')
ax.set_xlabel('target')
ax.set_ylabel('Number of passages');

# Data Preparation for Deep Learning

In [None]:
# Select input and output.
# Labels as an (n_samples, 1) column vector.
Y = df.target.values.reshape(df.shape[0],1)
# Remove the label from the input frame. The keyword form is required:
# the positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
df = df.drop(columns='target')
X = df.iloc[:,3].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                         test_size=0.33, random_state=42)

# Plain Python lists of the texts, in split order.
training_sentences = list(X_train)
testing_sentences = list(X_test)

# Flatten the (n, 1) label arrays once and convert each entry to a float,
# instead of rebuilding the whole list on every loop iteration.
training_labels = [float(label) for label in Y_train.ravel()]
testing_labels = [float(label) for label in Y_test.ravel()]

# 1-D float arrays, as expected by the regressor's fit/score calls below.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)


In [None]:
embedding_dim = 32
trunc_type='post'
oov_tok = "<OOV>"


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training texts only; words that appear only in
# held-out texts are mapped to the <OOV> token.
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
# Pad everything to the longest training sequence.
max_length = max(len(l) for l in sequences)
# One extra slot on top of the word index for the padding value.
vocab_size = len(word_index) + 1

padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

# Use the same truncation side as for training: held-out texts can be longer
# than max_length, and the default would cut them from the opposite end.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length, truncating=trunc_type)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Deep Learning Model 
* `keras.Sequential` model with Embedding layer and LSTM layer 

In [None]:
def create_model():
    """Build and compile the Embedding -> LSTM -> Dense regression network.

    Reads the module-level ``vocab_size``, ``embedding_dim`` and ``max_length``
    to size the embedding and LSTM layers.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, MSE loss) with a
        single linear output unit.
    """
    hidden_widths = (512, 256, 128)

    network = keras.Sequential()
    network.add(layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
    network.add(layers.LSTM(embedding_dim))

    # Fully connected stack on top of the LSTM output.
    for width in hidden_widths:
        network.add(layers.Dense(units=width, kernel_initializer='normal', activation='relu'))

    # Regularization before the output layer.
    network.add(layers.Dropout(0.4))
    # Linear output: one real-valued prediction per sample.
    network.add(layers.Dense(units=1, kernel_initializer='normal', activation='linear'))

    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

# Build one instance just to inspect the architecture; training below goes
# through KerasRegressor(build_fn=create_model) rather than this object.
model = create_model()
model.summary()

## Finding the best hyperparameters for the model

In [None]:
# The randomized search is expensive, so it is switched off by default.
# Set RUN_HYPERPARAM_SEARCH = True to re-run it.
RUN_HYPERPARAM_SEARCH = False

if RUN_HYPERPARAM_SEARCH:
    # Wrap the Keras model in a scikit-learn compatible regressor
    search_model = KerasRegressor(build_fn = create_model)

    # define the parameters to try out
    params = {'batch_size':[16, 32, 128], 'epochs':[10, 20, 50]}

    # define RandomizedSearchCV with 5-fold cross-validation
    random_searcher = RandomizedSearchCV(search_model, param_distributions = params, cv = KFold(5))

    # fit the model
    random_searcher.fit(padded, training_labels_final)

    # take a look at the results: best parameter setting and its
    # cross-validated score
    print(random_searcher.best_params_)
    print(random_searcher.best_score_)

## Training the model with the best hyperparameters

In [None]:
# Wrap the network in a scikit-learn style regressor configured with the chosen
# settings, then train it on the padded training sequences.
best_fit_settings = {'batch_size': 16, 'epochs': 10}
model_KR = KerasRegressor(build_fn=create_model, **best_fit_settings)

model_KR.fit(padded, training_labels_final)

In [None]:
# Score on the held-out split and show it as the cell output.
# NOTE(review): with the legacy Keras scikit-learn wrapper this is presumably
# the negated mean loss (higher is better) - confirm against the wrapper docs.
score = model_KR.score(testing_padded, testing_labels_final)
score

In [None]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# Clean the test texts the same way as the training texts. The result is bound
# to `test` only: also binding it to `df` would silently replace the training
# frame with the test frame.
test = text_preprocessing(test,3)

# NOTE: from here on X_test / testing_* refer to the competition test file,
# no longer to the held-out split used for scoring above.
X_test = test.iloc[:,3].values
testing_sentences = list(X_test)

# Reuse the tokenizer fitted on the training texts, and pad/truncate exactly
# as was done for training.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length, truncating=trunc_type)

In [None]:
# Predict a target for every test passage.
prediction = model_KR.predict(testing_padded)
# Flatten before listing so that each row gets exactly one scalar, whether the
# model returns shape (n,), (n, 1) or a 0-d value for a single row.
prediction_list = list(np.ravel(prediction))

In [None]:
# Assemble the submission table (one predicted target per test id) and write
# it to disk without the index column.
submission_columns = {
    'id': test['id'],
    'target': prediction_list,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
