Задача создать RNN модель, которая предсказывает 6 оценок текст.

The task is to create an RNN model that predicts 6 text ratings

In [25]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


### RNN model

In [2]:
csv_file = 'train.csv'
df = pd.read_csv(csv_file)

In [8]:
df.head(4)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0


In [9]:
text_data = df['full_text']
labels = df[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']]

In [16]:
labels.min(), labels.max()

(cohesion       1.0
 syntax         1.0
 vocabulary     1.0
 phraseology    1.0
 grammar        1.0
 conventions    1.0
 dtype: float64,
 cohesion       5.0
 syntax         5.0
 vocabulary     5.0
 phraseology    5.0
 grammar        5.0
 conventions    5.0
 dtype: float64)

In [10]:
#Токенезируем текстовые данные и последовательности заполнения, чтобы сделать их одинаковыми по длине:
#Tokenize the text data and pad sequences to make them uniform in length:
max_sequence_length = 100  
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
text_sequences = tokenizer.texts_to_sequences(text_data)
text_sequences = pad_sequences(text_sequences, maxlen=max_sequence_length, padding='post')

In [17]:
X_train, X_test, y_train, y_test = train_test_split(text_sequences, labels, test_size=0.2, random_state=42)


In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_sequence_length),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6, activation='linear') 
])

model.compile(optimizer='adam', loss='mean_squared_error')


model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x16af29ed0>

In [19]:
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)

Test Loss: 0.4889070391654968


### pre-trained BERT model