In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from datasets import Dataset

In [2]:
from datasets import load_dataset
dataset = load_dataset("juancavallotti/bea-19-corruption")
print(dataset)
print(dataset.keys())

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['sentence', 'broken', 'annotator', 'tags', 'edit', '__index_level_0__'],
        num_rows: 84106
    })
})
dict_keys(['train'])


In [3]:
df = pd.DataFrame(dataset['train'])

In [4]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df,test_size=0.2,random_state=42)

train_df,val_df = train_test_split(train_df, test_size=0.125,random_state=42)


# EDA

In [5]:
train_df.columns, train_df.info

(Index(['sentence', 'broken', 'annotator', 'tags', 'edit', '__index_level_0__'], dtype='object'),
 <bound method DataFrame.info of                                                 sentence  \
 69919  One day , he took part in a competition intend...   
 44407  However , she never imagined how dangerous thi...   
 16447  Consequently , they probably fail to experienc...   
 83553  They are scared of what a woman can do and som...   
 29419  Today private vehicle become pride of the life...   
 ...                                                  ...   
 18345  But she is boyfriend live upstate and hasn't g...   
 49495  On the other hand , studies showed , in many o...   
 15945  But you could been lucky because the summer is...   
 1749   In the sea , include the route and port constr...   
 37840  I live here with my wonderful family based of ...   
 
                                                   broken annotator  \
 69919  One day , he took part in a competition intend...        

In [6]:
train_df.isna().sum()

sentence             0
broken               0
annotator            0
tags                 0
edit                 0
__index_level_0__    0
dtype: int64

## Don't change anythings rating 

In [7]:
noop_rate = (train_df["broken"] == train_df["sentence"]).mean()

noop_rate

np.float64(0.17406960746012604)

## Check Empty Sentence

In [8]:
(train_df["broken"].str.strip().eq("").mean(), train_df["sentence"].str.strip().eq("").mean())

(np.float64(0.0), np.float64(8.49285750683675e-05))

## Distribution of TAG'S ERROR

In [9]:
train_df["tags"].value_counts().head(15)

tags
noop          8948
PUNCT         8660
OTHER         6402
DET           5604
PREP          4894
VERB:TENSE    3045
VERB          2844
ORTH          2395
NOUN          2134
NOUN:NUM      2000
SPELL         1939
VERB:FORM     1748
UNK           1300
PRON          1299
VERB:SVA      1090
Name: count, dtype: int64

# PREPROCESSING

In [10]:
from transformers import AutoTokenizer 

model_ckpt = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [11]:
train_df['sentence']

69919    One day , he took part in a competition intend...
44407    However , she never imagined how dangerous thi...
16447    Consequently , they probably fail to experienc...
83553    They are scared of what a woman can do and som...
29419    Today private vehicle become pride of the life...
                               ...                        
18345    But she is boyfriend live upstate and hasn't g...
49495    On the other hand , studies showed , in many o...
15945    But you could been lucky because the summer is...
1749     In the sea , include the route and port constr...
37840    I live here with my wonderful family based of ...
Name: sentence, Length: 58873, dtype: object

In [12]:
train_df['broken']

69919    One day , he took part in a competition intend...
44407    However , she never imagined how dangerous thi...
16447    Consequently , they probably fail to experienc...
83553    They are scared of what a woman can do and som...
29419    Today personal vehicle become pride of the lif...
                               ...                        
18345    But she is boyfriend live upstate and hasn't g...
49495    On the other hand , the studies showed , in ma...
15945    But you can been lucky because the summer is v...
1749     In the sea , include the route and port constr...
37840    I live here with my wonderful family based on ...
Name: broken, Length: 58873, dtype: object

In [13]:
train_df.head()

Unnamed: 0,sentence,broken,annotator,tags,edit,__index_level_0__
69919,"One day , he took part in a competition intend...","One day , he took part in a competition intend...",0,noop,-1 -1|||noop|||-NONE-|||-NONE-,69919
44407,"However , she never imagined how dangerous thi...","However , she never imagined how dangerous thi...",0,noop,-1 -1|||noop|||-NONE-|||-NONE-,44407
16447,"Consequently , they probably fail to experienc...","Consequently , they probably fail to experienc...",0,NOUN:NUM,9 10|||R:NOUN:NUM|||live|||lives,16447
83553,They are scared of what a woman can do and som...,They are scared of what a woman can do and som...,0,noop,-1 -1|||noop|||-NONE-|||-NONE-,8607
29419,Today private vehicle become pride of the life...,Today personal vehicle become pride of the lif...,0,ADJ,1 2|||R:ADJ|||personal|||private,29419


In [14]:
max_input_len = 256
max_target_len = 256

In [15]:
def preprocess(batch):
    model_inputs = tokenizer(
        batch["broken"],
        max_length = max_input_len,
        truncation = True
    )
    
    labels = tokenizer(
        batch["sentence"],
        max_length = max_target_len,
        truncation = True
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [22]:

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)

train_tok = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=train_ds.column_names
)

val_tok = val_ds.map(
    preprocess,
    batched=True,
    remove_columns=val_ds.column_names
)


Map: 100%|██████████| 58873/58873 [00:01<00:00, 31027.77 examples/s]
Map: 100%|██████████| 8411/8411 [00:00<00:00, 39043.75 examples/s]


## collator = “đóng gói batch + pad cho khớp shape”

In [24]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True  
)


2025-12-23 21:42:00.897514: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-23 21:42:01.149108: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-23 21:42:02.380448: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  if not hasattr(np, "object"):


In [26]:
train_tok.save_to_disk("/home/thanhfvux/GrammarFixer/ml/data/processed/train_tok")
val_tok.save_to_disk("/home/thanhfvux/GrammarFixer/ml/data/processed/val_tok")

Saving the dataset (1/1 shards): 100%|██████████| 58873/58873 [00:00<00:00, 2300630.37 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8411/8411 [00:00<00:00, 1514609.78 examples/s]
