# Goal

<h3 style="color:blue">assess the quality of summaries written by students</h3>
<h3 style="color:indigo">evaluate how well a student represents the main idea and details of a source text, as well as the clarity, precision, and fluency of the language used in the summary</h3>
<h3 style="color:red">Freely & publicly available external data is <b>allowed</b>, including pre-trained models</h3>
<h3>This is Multi-Output problem</h3>

### Use Hugging Face Library
### Use NLTK
### Use Tensorflow

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import math
import subprocess
from tqdm import tqdm
import pickle

In [3]:
import tensorflow as tf

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score, median_absolute_error

In [5]:
import transformers
from transformers import XLNetTokenizerFast, TFXLNetModel, XLNetConfig

In [6]:
prompts_train = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
summaries_train = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
summaries_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

In [7]:
train = pd.merge(prompts_train, summaries_train, on='prompt_id')
test = pd.merge(prompts_test, summaries_test, on='prompt_id')

In [8]:
train.rename(columns = {'text' : 'summary'}, inplace=True)
test.rename(columns = {'text' : 'summary'}, inplace=True)

In [9]:
test

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,summary
0,abc123,Summarize...,Example Title 1,Heading\nText...,000000ffffff,Example text 1
1,abc123,Summarize...,Example Title 1,Heading\nText...,222222cccccc,Example text 3
2,def789,Summarize...,Example Title 2,Heading\nText...,111111eeeeee,Example text 2
3,def789,Summarize...,Example Title 2,Heading\nText...,333333dddddd,Example text 4


In [10]:
def prepare_data(data):
    dataset = []
    for index, row in data.iterrows():
        sentence = f'Text :   {data["prompt_text"][index]}  summary :  {data["summary"][index]}'
        dataset.append(sentence)
    return dataset

In [11]:
dataset_train = prepare_data(train)
dataset_test = prepare_data(test)

In [12]:
tokenizer = XLNetTokenizerFast.from_pretrained('/kaggle/input/tensorlfow-xlnet')
model = TFXLNetModel.from_pretrained('/kaggle/input/tensorlfow-xlnet')

#from transformers import RobertaTokenizer, TFRobertaModel
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base-cased')
#model = TFRobertaModel.from_pretrained('roberta-base-cased', return_dict=True)

#from transformers import AutoTokenizer, TFBertModel
#model = TFBertModel.from_pretrained('/kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased')
#tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased')

Some layers from the model checkpoint at /kaggle/input/tensorlfow-xlnet were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at /kaggle/input/tensorlfow-xlnet.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


### Next time use prepare_tf_dataset which is used to directly tokenize and data colat and
### make dataset compatible with tensorflow
####       https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset

In [13]:
configuration = XLNetConfig()

In [14]:
def get_embeddings(list):
    vectors = []
    for text in tqdm(list):
        text_tokens = tokenizer.encode(text, return_tensors="np",max_length = 5000, padding='max_length', truncation=True)
        #output  = model(text_tokens)
        #vectors.append(output.last_hidden_state)
        vectors.append(text_tokens)
    return vectors
    

In [15]:
train_data = get_embeddings(dataset_train)

100%|██████████| 7165/7165 [00:45<00:00, 159.17it/s]


In [16]:
test_data = get_embeddings(dataset_test)

100%|██████████| 4/4 [00:00<00:00, 285.58it/s]


In [17]:
target1 = np.array(train['content'])
target1 = target1.astype('float32')

target2 = np.array(train['wording'])
target2 = target2.astype('float32')

#target = (target1, target2)

In [18]:
def flattening(data):
    dataset = []
    for d in data :
        dataset.append(d.flatten().tolist())
    return dataset
train_data = np.array(flattening(train_data))
test_data = np.array(flattening(test_data))

In [19]:
from tensorflow.keras.layers import Dense, Input, Flatten, Bidirectional, Dropout, LSTM
from tensorflow.keras.models import Sequential

In [40]:
len(train_data)

7165

In [83]:
# Define the model architecture

model_content = Sequential()

model_content.add(Bidirectional(LSTM(356, return_sequences=True), input_shape=(len(train_data[0]) , 1)))
model_content.add(Dropout(0.3))
model_content.add(Bidirectional(LSTM(128)))
model_content.add(Dropout(0.3))
model_content.add(Dense(1,  activation='linear') )



In [85]:
model_wording = Sequential()
model_wording.add(Bidirectional(LSTM(356, return_sequences=True), input_shape=(len(train_data[0]) , 1)))
model_wording.add(Dropout(0.3))
model_wording.add(Bidirectional(LSTM(128)))
model_wording.add(Dropout(0.3))
model_wording.add(Dense(1,  activation='linear') )

In [86]:
model_content.compile(optimizer='adam', loss='mean_squared_error', metrics='mae')
model_wording.compile(optimizer='adam', loss='mean_squared_error', metrics='mae')

In [88]:
model_content.fit(train_data, target1, epochs=1, validation_split=0.2)



<keras.callbacks.History at 0x7dd90c8446a0>

In [89]:
model_wording.fit(train_data, target2, epochs=1,batch_size=50, validation_split=0.2)



<keras.callbacks.History at 0x7dd90c882530>

In [90]:
evaluate_on_train_content = model_content.evaluate(train_data, target1)
evaluate_on_train_wording = model_wording.evaluate(train_data, target2)



In [91]:
print('evaluate_on_train_content', evaluate_on_train_content)
print('evaluate_on_train_wording',evaluate_on_train_wording)

evaluate_on_train_content [1.0628072023391724, 0.8118907809257507]
evaluate_on_train_wording [1.0731637477874756, 0.8291302919387817]


In [92]:
content_prediction = model_content.predict(test_data)
wording_prediction = model_wording.predict(test_data)



### Predict on test

In [93]:
test_pred_content = model_content.predict(test_data)
test_pred_wording = model_wording.predict(test_data)



## submission

In [94]:
test_pred_content = test_pred_content.reshape(-1)
test_pred_wording = test_pred_wording.reshape(-1)

In [95]:
original_test_ids = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

In [96]:
test_pred_content

array([-0.28653428, -0.29926196, -0.28764176, -0.30977756], dtype=float32)

In [97]:
submission = pd.DataFrame({
    'student_id' : original_test_ids['student_id'],
    'content' : test_pred_content,
    'wording' : test_pred_wording
})

In [98]:
submission.to_csv('submission.csv', index=False)

In [99]:
submission.head()

Unnamed: 0,student_id,content,wording
0,000000ffffff,-0.286534,-0.069756
1,111111eeeeee,-0.299262,-0.077285
2,222222cccccc,-0.287642,-0.070378
3,333333dddddd,-0.309778,-0.085776
