In [4]:
import pandas as pd
import numpy as np
import re

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Rescaling
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from keras.preprocessing.text import Tokenizer

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as tts

import lightgbm as lgb

In [7]:
seed = 0

# load training and testing data
train_df = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv", index_col="text_id")
X_train = train_df['full_text']
cols = [col for col in train_df.columns if col != "full_text"]
y_train = train_df[cols]

print(X_train[0])

I think that students would benefit from learning at home,because they wont have to change and get up early in the morning to shower and do there hair. taking only classes helps them because at there house they'll be pay more attention. they will be comfortable at home.

The hardest part of school is getting ready. you wake up go brush your teeth and go to your closet and look at your cloths. after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain. Then you'll have to change. with the online classes you can wear anything and stay home and you wont need to stress about what to wear.

most students usually take showers before school. they either take it before they sleep or when they wake up. some students do both to smell good. that causes them do miss the bus and effects on there lesson time cause they come late to school. when u have online classes u wont need to miss lessons cause you can get everything set up and go take a

In [8]:
test_df = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv", index_col="text_id")
X_test = test_df['full_text']
X_test_idx = X_test.index

print(X_test_idx)

Index(['0000C359D63E', '000BAD50D026', '00367BB2546B'], dtype='object', name='text_id')


In [9]:
# clean the punctuation within the text
X_train = X_train.replace(re.compile(r'[\n\t]'), ' ', regex=True)
X_train = X_train.replace(re.compile(r'[^\w\s]'), '', regex=True)

print(X_train[0])

I think that students would benefit from learning at homebecause they wont have to change and get up early in the morning to shower and do there hair taking only classes helps them because at there house theyll be pay more attention they will be comfortable at home  The hardest part of school is getting ready you wake up go brush your teeth and go to your closet and look at your cloths after you think you picked a outfit u go look in the mirror and youll either not like it or you look and see a stain Then youll have to change with the online classes you can wear anything and stay home and you wont need to stress about what to wear  most students usually take showers before school they either take it before they sleep or when they wake up some students do both to smell good that causes them do miss the bus and effects on there lesson time cause they come late to school when u have online classes u wont need to miss lessons cause you can get everything set up and go take a shower and whe

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_matrix(X_train, "tfidf")
X_test = tokenizer.texts_to_matrix(X_test, "tfidf")

pca = PCA(n_components=100, whiten=True, random_state=seed)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

X_train, X_val, y_train, y_val = tts(X_train, y_train, test_size=.1, random_state=seed)

lgb_trains = {}
lgb_vals = {}
for col in cols:
    exec(f"lgb_trains['{col}'] = lgb.Dataset(X_train, y_train.{col})")
    exec(f"lgb_vals['{col}'] = lgb.Dataset(X_val, y_val.{col})")

In [11]:
@tf.autograph.experimental.do_not_convert
def MCRMSE_keras(y_true, y_pred):
    return tf.reduce_mean(tf.reduce_mean(tf.square(y_true - y_pred), axis=1))

def MCRMSE_lgb(preds, eval_data):
    diff = eval_data - preds
    sq = np.square(diff)
    rmse = np.sum(sq, axis=0) / eval_data.shape[0]
    return "MCRMSE", np.sum(rmse) / eval_data.shape[1], False

In [12]:
keras1_model = Sequential()
keras1_model.add(Dense(500, input_dim=X_train.shape[1], activation="relu"))
keras1_model.add(BatchNormalization())
keras1_model.add(Dense(500, activation="relu"))
keras1_model.add(Dropout(.3))
keras1_model.add(Dense(500, activation=LeakyReLU(.1)))
keras1_model.add(Dropout(.2))
keras1_model.add(Dense(500, activation="relu"))
keras1_model.add(Dense(y_train.shape[1], activation="sigmoid"))
keras1_model.add(Rescaling(4, offset=1))

optimizer = optimizers.Adam(amsgrad=True)
keras1_model.compile(loss=MCRMSE_keras, optimizer=optimizer, metrics=[MCRMSE_keras])
keras1_model.fit(X_train, y_train, batch_size=2**3, epochs=30, verbose=1,
          validation_data=(X_val, y_val), workers=30, use_multiprocessing=True,
          callbacks=[EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)])

keras1_pred = pd.DataFrame(keras1_model.predict(X_test), columns=cols, index=X_test_idx)
keras1_pred

2022-11-22 01:04:58.224242: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,3.222412,3.024141,3.204347,3.079329,2.683869,2.933691
000BAD50D026,2.909045,2.680854,2.884666,2.703308,2.446693,2.912002
00367BB2546B,3.587918,3.540501,3.697871,3.71903,3.489804,3.619429


In [13]:
keras2_model = Sequential()
keras2_model.add(Dense(2000, input_dim=X_train.shape[1], activation="relu"))
keras2_model.add(BatchNormalization())
keras2_model.add(Dense(2000, activation="relu"))
keras2_model.add(Dropout(.3))
keras2_model.add(Dense(3000, activation=LeakyReLU(.1)))
keras2_model.add(Dropout(.2))
keras2_model.add(Dense(2000, activation="relu"))
keras2_model.add(Dense(500, activation="relu"))
keras2_model.add(Dense(3000, activation="softplus"))
keras2_model.add(BatchNormalization())
keras2_model.add(Dense(1000, activation=LeakyReLU(.1)))
keras2_model.add(Dropout(.3))
keras2_model.add(Dense(3000, activation="softsign"))
keras2_model.add(Dense(1000, activation=LeakyReLU(.1)))
keras2_model.add(Dropout(.1))
keras2_model.add(Dense(3000, activation="softplus"))
keras2_model.add(Dropout(.4))
keras2_model.add(Dense(3000, activation="relu"))
keras2_model.add(Dense(1000, activation="relu"))
keras2_model.add(BatchNormalization())
keras2_model.add(Dense(y_train.shape[1], activation="sigmoid"))
keras2_model.add(Rescaling(4, offset=1))

optimizer = optimizers.Adam(amsgrad=True)
keras2_model.compile(loss=MCRMSE_keras, optimizer=optimizer, metrics=[MCRMSE_keras])
keras2_model.fit(X_train, y_train, batch_size=2**3, epochs=40, verbose=1,
          validation_data=(X_val, y_val), workers=30, use_multiprocessing=True,
          callbacks=[EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)])

keras2_pred = pd.DataFrame(keras2_model.predict(X_test), columns=cols, index=X_test_idx)
keras2_pred

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40


Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,3.071095,2.970881,3.2511,3.109678,3.05417,3.09991
000BAD50D026,2.720232,2.673352,3.069246,2.787282,2.762738,2.811052
00367BB2546B,4.050164,3.804771,3.698727,3.818188,3.535031,3.749262


In [42]:
lgb1_models = {}
lgb1_preds = {}

for score in cols:
    lgb1_params = {'objective': 'regression',
                   'metric': 'rmse',
                   'verbosity': 0,
                   'early_stopping_round': 50,
                   'random_state': seed,
                   }
    
    train_set=lgb_trains[score]
    valid_sets=lgb_vals[score]

    lgb1_model = lgb.train(
        params=lgb1_params,
        train_set=train_set,
        num_boost_round=1000,
        valid_sets=(train_set, valid_sets),
        callbacks=None,
        verbose_eval=100
    )
    
    lgb1_models[score] = lgb1_model
    lgb1_preds[score] = lgb1_model.predict(X_test)
    
lgb1_pred = pd.DataFrame(lgb1_preds, index=X_test_idx)
lgb1_pred



You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.232565	valid_1's rmse: 0.593093
Early stopping, best iteration is:
[107]	training's rmse: 0.220856	valid_1's rmse: 0.592106
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.231739	valid_1's rmse: 0.53191
Early stopping, best iteration is:
[63]	training's rmse: 0.308689	valid_1's rmse: 0.530372
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	training's rmse: 0.330625	valid_1's rmse: 0.503609
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.226362	valid_1's rmse: 0.55456
Early stopping, best iteration is:
[62]	training's rmse: 0.305181	valid_1's rmse:

Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,2.888183,2.985096,3.191583,2.987695,2.697193,3.052911
000BAD50D026,3.003593,2.703766,2.883729,2.806867,2.647724,3.158523
00367BB2546B,3.561003,3.514718,3.680797,3.526439,3.436347,3.382326
