# Problem Statement
We have a dataset containing excerpts of different passages and a target column indicating the difficulty of each passage, i.e. its reading ease. We need to predict the reading ease of unseen passages in the test data.

## Approach Followed:
We follow the steps below to solve this problem -
* Combine the train and test excerpts to preprocess them at once
* Apply preprocessing to these texts
* Split the data into train and test sets using sklearn
* Use Google Word2Vec pretrained embeddings to get an average vector for each excerpt text as an input to tree based models
* Run XGBoost and LightGBM models with embedding vectors as an input and reading ease as target variable
* Pick one of these models based on RMSE and optimize it using Hyperopt
* Run the Hyperopt trials to get best hyperparameters for LGBM
* Make predictions with optimized LGBM

# Load Data

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Read the competition's train and test CSVs from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/test.csv')

# Report (rows, columns) of each frame, one per line, then preview the training data.
print(train_df.shape, test_df.shape, sep='\n')
train_df.head(5)

# Preprocess the Data

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemma = WordNetLemmatizer()

# Build the stop-word lookup once. Calling stopwords.words('english') for every
# token returns a list each time and tests membership with a linear scan.
_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')


def preprocess(text):
    """Turn a raw excerpt into a list of lemmatized, stop-word-free tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the text,
      3. split on whitespace,
      4. drop English stop words (checked before lemmatization),
      5. lemmatize the remaining words with WordNet.

    Args:
        text: the excerpt as a string.

    Returns:
        A list of token strings (possibly empty).
    """
    words = _NON_LETTERS.sub(' ', text).lower().split()
    return [lemma.lemmatize(word) for word in words if word not in _STOP_WORDS]

In [None]:
# Stack train excerpts followed by test excerpts so both go through the same
# preprocessing. pd.concat replaces Series.append, which was removed in pandas 2.0;
# like append, it keeps each frame's original index labels.
excerpt_text = pd.concat([train_df['excerpt'], test_df['excerpt']])
excerpt_text = excerpt_text.apply(preprocess)
excerpt_text.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the labelled data for local evaluation. The first
# len(train_df) rows of excerpt_text are the training excerpts (they were
# concatenated first). A fixed random_state makes the split, and therefore every
# RMSE reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    excerpt_text.head(train_df.shape[0]),
    train_df['target'],
    random_state=42,
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Use Google Word2Vec

In [None]:
# gensim is not imported anywhere else in this notebook; without this import
# the line below raises NameError.
import gensim

# Load the pretrained GoogleNews Word2Vec vectors (binary word2vec format).
embedding = gensim.models.KeyedVectors.load_word2vec_format('../input/word2vec-google/GoogleNews-vectors-negative300.bin', binary=True)

In [1]:
def get_averaged_vectors(embedding, X, size=300):
    """Compute one averaged embedding vector per tokenized text.

    Args:
        embedding: mapping from token to vector, indexable as
            ``embedding[token]`` and raising ``KeyError`` for unknown tokens.
        X: object exposing ``.values`` (e.g. a pandas Series) where each
            element is an iterable of tokens.
        size: length of each embedding vector.

    Returns:
        A list with one array of length ``size`` per row of ``X``: the mean
        of the vectors of the tokens found in ``embedding``. Rows with no
        known token (including empty rows) yield an all-zero vector instead
        of the NaNs a division by zero would produce.
    """
    vectors = []
    for tokens in X.values:
        total = np.zeros(size, dtype='float32')
        found = 0
        for token in tokens:
            # Only out-of-vocabulary lookups are skipped; any other error
            # (e.g. a shape mismatch) should surface rather than be hidden.
            try:
                vec = embedding[token]
            except KeyError:
                continue
            total = np.add(total, vec)
            found += 1
        if found:
            total = np.divide(total, found)
        vectors.append(total)

    return vectors

In [None]:
# Embed the local train and hold-out splits with the same model and vector
# width (300) so both feature matrices have matching columns.
train_vectors, test_vectors = (
    get_averaged_vectors(embedding, split, 300) for split in (X_train, X_test)
)

# Run XGB and LGBM

In [None]:
from xgboost import XGBRegressor

# Baseline XGBoost regressor fitted on the averaged embedding vectors.
xgb = XGBRegressor(
    learning_rate=0.01,
    n_estimators=100,
    n_jobs=-1,
).fit(pd.DataFrame(train_vectors), y_train)

# Evaluate on the local hold-out split.
y_test_pred = xgb.predict(pd.DataFrame(test_vectors))
print('Test set RMSE %s' % np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
from lightgbm import LGBMRegressor

# Baseline LightGBM regressor with the same settings as the XGBoost baseline.
lgb = LGBMRegressor(
    learning_rate=0.01,
    n_estimators=100,
    n_jobs=-1,
).fit(pd.DataFrame(train_vectors), y_train)

# Evaluate on the local hold-out split.
y_test_pred = lgb.predict(pd.DataFrame(test_vectors))
print('Test set RMSE %s' % np.sqrt(mean_squared_error(y_test, y_test_pred)))

# Optimize using Hyperopt

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Cross-validation splitter reused for every hyperparameter trial
# (no shuffle argument is passed, so KFold uses its default behaviour).
num_folds = 5
kf = KFold(n_splits=num_folds)

# Seed used for the model and the search, and the number of search evaluations.
random_state = 42
n_iter = 50

In [None]:
from hyperopt import fmin, tpe, hp, anneal, Trials
def gb_mse_cv(params, random_state=random_state, cv=kf, X=pd.DataFrame(train_vectors), y=y_train):
    """Hyperopt objective: mean cross-validated MSE of an LGBM regressor.

    Args:
        params: dict with keys 'n_estimators', 'max_depth' and
            'learning_rate'; the first two are cast to int before use.
        random_state: seed passed to LGBMRegressor.
        cv: cross-validation splitter handed to cross_val_score.
        X: feature matrix.
        y: target values.

    Returns:
        The mean squared error averaged over the CV folds (lower is better).

    Note:
        The default values of random_state, cv, X and y are evaluated once,
        when this function is defined, from the globals that exist at that
        moment; re-run this definition after changing any of them.
    """
    # Only these three hyperparameters are forwarded to the model.
    model = LGBMRegressor(
        random_state=random_state,
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
    )

    # sklearn reports negated MSE per fold; flip the sign of the mean to get
    # a loss that the optimizer can minimize.
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1)
    return -fold_scores.mean()

%%time

# Search space: integer-stepped ranges for tree count and depth, and a
# log-uniform learning rate between exp(-5) and exp(0).
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 2000, 1),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}

# Trials records the outcome of every evaluation.
trials = Trials()

# NOTE(review): some hyperopt releases expect rstate to be a
# np.random.Generator (np.random.default_rng) rather than a RandomState --
# confirm against the installed version.
best = fmin(
    fn=gb_mse_cv,                                # objective to minimize
    space=space,
    algo=tpe.suggest,                            # hyperopt tunes the algorithm's own parameters
    max_evals=n_iter,                            # number of evaluations
    trials=trials,                               # logging
    rstate=np.random.RandomState(random_state),  # fixed seed for reproducibility
)

# Refit an LGBM regressor on the training split with the best hyperparameters
# found (no score is computed in this cell).
model = LGBMRegressor(
    random_state=random_state,
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=best['learning_rate'],
)
model.fit(pd.DataFrame(train_vectors), y_train)

# Make Predictions

In [None]:
# LGBM regressor with hard-coded hyperparameters
# (presumably copied from an earlier Hyperopt run -- verify before reuse).
lgb = LGBMRegressor(
    n_estimators=1617,
    max_depth=20,
    learning_rate=0.009507284992946358,
    random_state=42,
)

# Fit on the training split and report RMSE on the local hold-out split.
lgb.fit(pd.DataFrame(train_vectors), y_train)
y_test_pred = lgb.predict(pd.DataFrame(test_vectors))
print('Test set RMSE %s' % np.sqrt(mean_squared_error(y_test, y_test_pred)))

In [None]:
# The competition test excerpts are the last len(test_df) rows of excerpt_text
# (train rows were concatenated first), so take them by position with tail().
# get_averaged_vectors needs the embedding model as its first argument; the
# original call omitted it and raised TypeError.
pred_test_set_vectors = get_averaged_vectors(embedding, excerpt_text.tail(test_df.shape[0]))
predictions = lgb.predict(pd.DataFrame(pred_test_set_vectors))

# Pair each test id with its predicted target and write the submission file.
sample_submission = pd.DataFrame({'id': test_df['id'], 'target': predictions})
sample_submission.to_csv('submission.csv', index=False)