# Goal

<h3 style="color:blue">assess the quality of summaries written by students</h3>
<h3 style="color:indigo">evaluate how well a student represents the main idea and details of a source text, as well as the clarity, precision, and fluency of the language used in the summary</h3>
<h3 style="color:red">Freely & publicly available external data is <b>allowed</b>, including pre-trained models</h3>
<h3>This is a multi-output problem</h3>

### Use Hugging Face Library
### Use NLTK
### Use Tensorflow

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import math
import subprocess
from tqdm import tqdm
import pickle

In [None]:
import tensorflow as tf

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score, median_absolute_error

In [None]:
import transformers
from transformers import XLNetTokenizerFast, TFXLNetModel, XLNetConfig

In [None]:
# Load the competition CSVs: prompts and student summaries for each split.
prompts_train, summaries_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv',
        '/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv',
    )
)
prompts_test, summaries_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv',
        '/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv',
    )
)

In [None]:
# Attach every summary to its prompt; rows are matched on the shared prompt_id column.
train = pd.merge(left=prompts_train, right=summaries_train, on='prompt_id')
test = pd.merge(left=prompts_test, right=summaries_test, on='prompt_id')

In [None]:
# The student's text column is called 'summary' from here on, in both splits.
for frame in (train, test):
    frame.rename(columns={'text': 'summary'}, inplace=True)

In [None]:
# Display the merged test frame as the cell output.
test

In [None]:
def prepare_data(data):
    """Build one model-input string per row from the prompt text and the summary.

    Args:
        data: DataFrame with ``prompt_text`` and ``summary`` columns.

    Returns:
        A list of strings, one per row in row order, each formatted as
        ``'Text :   <prompt_text>  summary :  <summary>'``.
    """
    # Pair the two columns positionally instead of looking each value up by
    # index label: a label lookup returns a whole Series (not a scalar) when
    # the index contains duplicate labels, which would leak a Series repr
    # into the generated text.
    return [
        f'Text :   {prompt_text}  summary :  {summary}'
        for prompt_text, summary in zip(data["prompt_text"], data["summary"])
    ]

In [None]:
# Build the "Text : ... summary : ..." input strings for both splits.
dataset_train = prepare_data(train)
dataset_test = prepare_data(test)

In [None]:
# Both the tokenizer and the TF model weights are loaded from the same local dataset directory.
# NOTE(review): `model` is not referenced again in the cells visible here; only
# the tokenizer is used downstream.
XLNET_DIR = '/kaggle/input/tensorlfow-xlnet'
tokenizer = XLNetTokenizerFast.from_pretrained(XLNET_DIR)
model = TFXLNetModel.from_pretrained(XLNET_DIR)

#from transformers import RobertaTokenizer, TFRobertaModel
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base-cased')
#model = TFRobertaModel.from_pretrained('roberta-base-cased', return_dict=True)

#from transformers import AutoTokenizer, TFBertModel
#model = TFBertModel.from_pretrained('/kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased')
#tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased')

### Next time, use `prepare_tf_dataset`, which handles tokenization and data collation directly and
### makes the dataset compatible with TensorFlow:
####       https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset

In [None]:
# Default XLNet configuration object; not referenced again in the cells visible here.
configuration = XLNetConfig()

In [None]:
def get_embeddings(list, max_length=5000):
    """Tokenize each text into a fixed-length array of token ids.

    Despite the name, no model is run here: the result holds the tokenizer's
    ``input_ids``, not learned embeddings.

    Args:
        list: Iterable of input strings. (The name shadows the builtin; it is
            kept so existing keyword callers keep working.)
        max_length: Length every sequence is padded or truncated to.
            Defaults to 5000, the value that used to be hard-coded.

    Returns:
        A list with one ``input_ids`` NumPy array per input text, in input
        order (presumably of shape ``(1, max_length)`` each -- TODO confirm).
    """
    return [
        tokenizer(
            text,
            return_tensors="np",
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )['input_ids']
        for text in tqdm(list)
    ]
    

In [None]:
# Tokenize the training texts; the result holds token-id arrays, not model embeddings.
train_data = get_embeddings(dataset_train)

In [None]:
# Tokenize the test texts the same way as the training texts.
test_data = get_embeddings(dataset_test)

In [None]:
# Regression targets as float32 arrays: target1 is 'content', target2 is 'wording'.
target1 = np.array(train['content'], dtype='float32')
target2 = np.array(train['wording'], dtype='float32')

#target = (target1, target2)

In [None]:
def flattening(data):
    """Collapse every array in *data* to one dimension.

    Returns a list of plain Python lists, one per input array, in input order.
    """
    return [array.flatten().tolist() for array in data]
# Turn the per-text token-id arrays into one array per split
# (one row per text, assuming every text was padded to the same length).
train_data, test_data = (
    np.array(flattening(split)) for split in (train_data, test_data)
)

In [None]:
from tensorflow.keras.layers import Dense, Input, Flatten

In [None]:
# Define the model architecture
def _build_regressor(hidden_units=(256, 128, 64, 32), hidden_activation='relu'):
    """Return an uncompiled feed-forward regressor with a single linear output.

    Args:
        hidden_units: Width of each hidden Dense layer, in order.
        hidden_activation: Activation applied by every hidden layer.
            Pass ``'linear'`` to reproduce the previous all-linear stack.

    Returns:
        A ``tf.keras.Sequential`` model ending in ``Dense(1, activation='linear')``.
    """
    layers = [
        tf.keras.layers.Dense(units, activation=hidden_activation)
        for units in hidden_units
    ]
    # Linear output: the models regress real-valued scores.
    layers.append(tf.keras.layers.Dense(1, activation='linear'))
    return tf.keras.Sequential(layers)


# With a linear activation on every layer, the stacked Dense layers compose
# into a single affine map, so the extra depth adds parameters but no
# modelling capacity. A non-linearity between the layers fixes that.
# Both targets use the same architecture but are trained as separate models.
model_content = _build_regressor()
model_wording = _build_regressor()

In [None]:
# Same optimizer, loss and reported metrics for both regressors.
for regressor in (model_content, model_wording):
    regressor.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])

In [None]:
# Train the content regressor, holding out 20% of train_data for validation.
# NOTE(review): Keras takes validation_split from the trailing rows before
# shuffling -- confirm the row order after the merge makes that a fair hold-out.
model_content.fit(train_data, target1, epochs=10,batch_size=50, validation_split=0.2)

In [None]:
# Train the wording regressor, holding out 20% of train_data for validation.
# NOTE(review): Keras takes validation_split from the trailing rows before
# shuffling -- confirm the row order after the merge makes that a fair hold-out.
model_wording.fit(train_data, target2, epochs=10,batch_size=50, validation_split=0.2)

In [None]:
# NOTE: these scores are computed on the full training set, including the rows
# the models were fitted on, so they are not a held-out estimate.
# Each result presumably holds [loss, mae, mse], following the compile() metrics.
evaluate_on_train_content = model_content.evaluate(train_data, target1)
evaluate_on_train_wording = model_wording.evaluate(train_data, target2)

In [None]:
# Report the training-set scores for both models, content first.
for label, scores in (
    ('evaluate_on_train_content', evaluate_on_train_content),
    ('evaluate_on_train_wording', evaluate_on_train_wording),
):
    print(label, scores)

In [None]:
# NOTE(review): the same predictions are computed again under different names
# in the "Predict on test" cell further down; these two variables are not used
# in the cells visible here, so this inference pass looks redundant.
content_prediction = model_content.predict(test_data)
wording_prediction = model_wording.predict(test_data)

### Predict on test

In [None]:
# Run both regressors over the tokenized test set (content first, then wording).
test_pred_content, test_pred_wording = (
    regressor.predict(test_data) for regressor in (model_content, model_wording)
)

## submission

In [None]:
# Collapse each prediction array to one dimension so it fits a DataFrame column.
flat_shape = (-1,)
test_pred_content = test_pred_content.reshape(flat_shape)
test_pred_wording = test_pred_wording.reshape(flat_shape)

In [None]:
# One row per test summary: its student_id plus the two predicted scores.
submission = pd.DataFrame(
    dict(
        student_id=test['student_id'],
        content=test_pred_content,
        wording=test_pred_wording,
    )
)

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission as a quick sanity check.
submission.head()