<a href="https://colab.research.google.com/github/ThanhHung2112/LMS/blob/main/tense_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Initialize the BERT tokenizer and the BERT model.
# Both are loaded from the same 'bert-base-uncased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Load the tense dataset; the file is decoded as latin-1 instead of pandas' default UTF-8.
data = pd.read_csv("/content/tense.csv", encoding='latin-1')
# Give the two columns the names used by every later cell.
data.columns = ["sentence", "tense"]
# Bare expression: display the frame as the cell output.
data

Unnamed: 0,sentence,tense
0,I am eating breakfast,present
1,She will go to the park,future
2,They played soccer yesterday,past
3,I will be going to the concert,future
4,She is eating lunch now,present
...,...,...
2269,She will have been leading the research team f...,future perfect continuous
2270,I am creating a new field of study in theoreti...,present continuous
2271,"By 2030, we will have developed a cure for all...",future perfect
2272,They will be volunteering in underserved commu...,future continuous


In [None]:
# Maps each tense name (as it appears in the "tense" column) to the integer
# class id used as the training target. Ids run from 0 to 11.
tense_labels = {
    'present': 0,
    'future': 1,
    'past': 2,
    'present perfect continuous': 3,
    'future perfect': 4,
    'past perfect': 5,
    'future continuous': 6,
    'past perfect continuous': 7,
    'present continuous': 8,
    'past continuous': 9,
    'future perfect continuous': 10,
    'present perfect': 11,
}

In [None]:
# Report every raw value of the "tense" column that has no entry in tense_labels.
for item in data["tense"]:
    if item in tense_labels:
        continue  # known label, nothing to report
    print(f"Tense value '{item}' not found in tense_labels dictionary.")

Tense value ' future continuous' not found in tense_labels dictionary.


In [None]:
def clean_and_map_tense(tense, labels=None):
    """Normalize a raw tense value and keep it only if it is a known label.

    Normalization trims leading/trailing whitespace, collapses internal runs
    of whitespace to a single space and lower-cases the text, so values such
    as ``' future continuous'`` map to ``'future continuous'`` instead of
    being discarded.

    Args:
        tense: Raw cell value from the "tense" column. Non-string values
            (including NaN / None) are treated as unrecognized.
        labels: Container of accepted tense names. Defaults to the
            module-level ``tense_labels`` mapping, looked up at call time.

    Returns:
        The normalized tense name, or ``None`` when the value is missing or
        is not one of the accepted names.
    """
    if labels is None:
        labels = tense_labels
    # NaN is a float and None is not a str, so this guard also covers missing values.
    if not isinstance(tense, str):
        return None
    normalized = " ".join(tense.split()).lower()
    return normalized if normalized in labels else None

# Clean and map tenses: values that clean_and_map_tense does not accept become None.
data["tense"] = data["tense"].apply(clean_and_map_tense)

# Remove rows with None (unrecognized tenses).
# dropna() is called without arguments, so a missing value in any column drops the row.
# NOTE: `data` is rebound here, so the raw frame is no longer available afterwards.
data = data.dropna()

# Bare expression: display the cleaned frame as the cell output.
data

Unnamed: 0,sentence,tense
0,I am eating breakfast,present
1,She will go to the park,future
2,They played soccer yesterday,past
3,I will be going to the concert,future
4,She is eating lunch now,present
...,...,...
2269,She will have been leading the research team f...,future perfect continuous
2270,I am creating a new field of study in theoreti...,present continuous
2271,"By 2030, we will have developed a cure for all...",future perfect
2272,They will be volunteering in underserved commu...,future continuous


In [None]:
# Sanity check: after cleaning, every value left in data["tense"] must be a key
# of tense_labels. The collected values are, by construction, NOT keys of
# tense_labels, so they cannot be deleted from it (a `del` would raise KeyError);
# fail with an explicit message instead.
unrecognized_tenses = list(dict.fromkeys(
    item for item in data["tense"] if item not in tense_labels
))
if unrecognized_tenses:
    raise ValueError(
        f"Tense values missing from tense_labels: {unrecognized_tenses}"
    )

In [None]:
# Count the missing values in each column of the cleaned frame and print the counts.
missing_values = data.isnull().sum()
print("Nan Data:")
print(missing_values)

Nan Data:
sentence    0
tense       0
dtype: int64


In [None]:
# Split the data into a training set and a test set.
# 10% of the rows are held out for testing; random_state fixes the shuffle.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

def prepare_input_data(data):
    """Tokenize the "sentence" column of ``data`` with the notebook's tokenizer.

    Args:
        data: Object supporting ``data["sentence"].tolist()`` (a DataFrame
            with a "sentence" column in this notebook).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called on the
        list of sentences with padding, truncation and PyTorch tensors enabled.
    """
    sentences = data["sentence"].tolist()
    return tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

In [None]:
# Tokenize each split with a separate tokenizer call.
train_inputs = prepare_input_data(train_data)
test_inputs = prepare_input_data(test_data)

# Translate tense names into integer class ids via tense_labels.
# A name missing from the mapping raises KeyError here.
train_labels = torch.tensor([tense_labels[item] for item in train_data["tense"]])
test_labels = torch.tensor([tense_labels[item] for item in test_data["tense"]])


In [None]:
class TenseClassifier(nn.Module):
    """Sentence-level tense classifier: an encoder followed by one linear layer.

    The encoder's last-layer hidden state at sequence position 0 (the [CLS]
    token when the inputs come from a BERT tokenizer) is passed through a
    ReLU and projected to ``num_classes`` logits.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called as ``bert_model(input_ids, attention_mask=...)``,
            return an object with a ``last_hidden_state`` attribute.
        num_classes: Number of output classes (size of the logits' last axis).
    """

    def __init__(self, bert_model, num_classes):
        super(TenseClassifier, self).__init__()
        self.bert = bert_model
        # A single ReLU is enough: applying ReLU twice in a row gives the same
        # result as applying it once, and the layer has no parameters.
        self.relu = nn.ReLU()
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, optionally, the classification loss.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.
            labels: Optional integer class targets. When given, the
                cross-entropy loss is computed as well so that the model can
                be driven by ``transformers.Trainer``, which passes the
                batch's ``labels`` to the model and reads the loss from the
                first element of the returned tuple.

        Returns:
            The logits tensor when ``labels`` is ``None`` (unchanged legacy
            behavior); otherwise the tuple ``(loss, logits)``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Keep only the hidden state of the first token of every sequence.
        first_token_state = outputs.last_hidden_state[:, 0, :]
        logits = self.fc(self.relu(first_token_state))
        if labels is None:
            return logits
        loss = nn.functional.cross_entropy(logits, labels)
        return loss, logits

# One output logit per entry of tense_labels (12 with the mapping used in this
# notebook), so the classifier head cannot drift out of sync with the labels.
num_classes = len(tense_labels)
model = TenseClassifier(bert_model, num_classes)

# Cross-entropy on the raw logits. Adam receives every parameter of the model,
# so the BERT encoder is fine-tuned together with the linear head.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [None]:
# Define batch size
batch_size = 32

# Create DataLoader for training data.
# Each dataset item is the tuple (input_ids, attention_mask, label) for one sentence;
# shuffle=True reorders the training samples on every pass over the loader.
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Manual fine-tuning loop: one optimizer step per mini-batch, for num_epochs passes.
num_epochs = 10
for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    # Sum of the per-batch losses, used only for the progress message below.
    total_loss = 0
    for batch in train_data_loader:
        # Unpack in the order the TensorDataset was built with.
        input_ids, attention_mask, labels = batch
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Average of the batch losses (total divided by the number of batches).
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / len(train_data_loader)}')

Epoch 1/10, Average Loss: 1.4232420607617027
Epoch 2/10, Average Loss: 0.4186065542070489
Epoch 3/10, Average Loss: 0.2309194190198915
Epoch 4/10, Average Loss: 0.15373705079158148
Epoch 5/10, Average Loss: 0.13823684233061054
Epoch 6/10, Average Loss: 0.10301513623511582
Epoch 7/10, Average Loss: 0.0892762784895144
Epoch 8/10, Average Loss: 0.08019526322421275
Epoch 9/10, Average Loss: 0.07002977749104039
Epoch 10/10, Average Loss: 0.06924490633894477


In [None]:
# Evaluation
# Switch to evaluation mode and disable gradient tracking for inference.
model.eval()
with torch.no_grad():
    # The whole test split is pushed through the model in a single forward pass.
    logits = model(test_inputs['input_ids'], test_inputs['attention_mask'])
    # Predicted class = index of the largest logit in each row.
    predicted_labels = torch.argmax(logits, dim=1)
    accuracy = accuracy_score(test_labels, predicted_labels)
    print(f'Accuracy on test set: {accuracy * 100:.2f}%')

In [None]:
def predict_tense(sentence, model, tokenizer, tense_labels):
    """Predict the tense name of a single sentence.

    Args:
        sentence: Text to classify; passed to ``tokenizer`` unchanged. Only
            one sentence is supported because the predicted class is read
            with ``.item()``, which needs a one-element tensor.
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            logits with the class scores along dimension 1.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        tense_labels: Mapping from tense name to integer class id.

    Returns:
        The first tense name in ``tense_labels`` whose id equals the
        predicted class.

    Raises:
        IndexError: If no entry of ``tense_labels`` has the predicted id.
    """
    # tokenizer
    encoded_sentence = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    # Run inference in evaluation mode, then restore training mode if the
    # caller had it enabled. Plain callables without a `training` flag are
    # left untouched.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            logits = model(encoded_sentence['input_ids'], encoded_sentence['attention_mask'])
            predicted_label = torch.argmax(logits, dim=1).item()
    finally:
        if was_training:
            model.train()

    matching_names = [name for name, class_id in tense_labels.items() if class_id == predicted_label]
    if not matching_names:
        raise IndexError(f"Predicted class id {predicted_label} has no entry in tense_labels.")

    return matching_names[0]

# Example input. The triple-quoted literal keeps its leading and trailing newline.
sentence_to_predict = """
In 2009, the inhabitants in Vietnam reached 95 million individuals.
"""
# Simultaneously, the population in Japan hit 50 million citizens.
# Classify the example with the fine-tuned model and the name -> id mapping.
predicted_tense = predict_tense(sentence_to_predict, model, tokenizer, tense_labels)
print(f"The predicted tense for the sentence is: {predicted_tense}")

The predicted tense for the sentence is: present perfect continuous


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Relative paths: both artifacts are written under the current working directory.
model_save_path = 'tense.pt'
tokenizer_save_path = 'tense_tokenizer'

# Only the weights (state_dict) are saved, not the model object itself.
torch.save(model.state_dict(), model_save_path)
# Write the tokenizer files into the tokenizer_save_path directory.
tokenizer.save_pretrained(tokenizer_save_path)


In [None]:
import shutil
import zipfile

# Folder to compress into a ZIP file.
# NOTE(review): the tokenizer is saved to the relative path 'tense_tokenizer'
# elsewhere in this notebook — confirm that it really lives under /kaggle/working.
folder_to_compress = '/kaggle/working/tense_tokenizer'

# Name of the output ZIP file
output_zip_file = '/kaggle/working/tense_tokenizer.zip'

# shutil.make_archive appends the format's extension to base_name on its own,
# so strip ".zip" first; passing the full name would create "...zip.zip".
archive_base_name = output_zip_file[:-len('.zip')]

# Compress the folder; make_archive returns the path of the archive it created.
created_archive = shutil.make_archive(archive_base_name, 'zip', folder_to_compress)

print(f'The folder {folder_to_compress} has been compressed to {created_archive}.')


The folder /kaggle/working/tense_tokenizer has been compressed to /kaggle/working/tense_tokenizer.zip.


<a href="/kaggle/working/tense_tokenizer.zip.zip"> Download File </a>

In [None]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = os.curdir, file_name = '/kaggle/working/tense.pth'):
    """
    zip all the files found under a directory (sub-directories included)

    Parameters
    _____
    directory: str
        directory that needs to be zipped, default is the current working
        directory

    file_name: str
        path of the archive to create; a relative path is resolved against
        `directory`. Default is '/kaggle/working/tense.pth' (note that the
        default name does not end in .zip even though a ZIP file is written)

    Returns
    _____
    An IPython FileLink for the created archive, which can be used to
    download it

    Notes
    _____
    The process working directory is left unchanged, the archive is closed
    before returning, and the archive file itself is never added to its own
    contents.
    """
    # os.path.join keeps file_name as-is when it is already absolute.
    archive_path = os.path.join(directory, file_name)
    archive_abspath = os.path.abspath(archive_path)

    # The context manager closes the archive, which is what writes the ZIP
    # central directory; without it the file can be left unreadable.
    with zipfile.ZipFile(archive_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(folder, file)
                # Skip the archive being written when it lives inside `directory`.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                zip_ref.write(file_path)

    return FileLink(archive_path)
# Run with the defaults: current directory, archive written to '/kaggle/working/tense.pth'.
zip_dir()

# Training using TrainingArguments & Trainer

In [None]:
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset

# Define a custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that serves one example as a dict.

    Holds three parallel, indexable containers and returns, for index ``idx``,
    a dict with the keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    """

    # Attribute names double as the keys of the dict returned per item.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # The dataset length is defined by the number of input_ids entries.
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

# Create a custom dataset
train_dataset = CustomDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
eval_dataset = CustomDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)

In [None]:
# Define batch size
batch_size = 32
num_epochs = 10
# Set up TrainingArguments
# NOTE(review): the recorded run in this notebook ended at global_step=120, which is
# below eval_steps=500, so step-based evaluation presumably never triggered — confirm
# and consider a smaller eval_steps or per-epoch evaluation.
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Provide the train_dataset
    eval_dataset=eval_dataset,
)

In [None]:
# Training loop: the value returned by train() is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=120, training_loss=-2.643637847900391, metrics={'train_runtime': 28.8134, 'train_samples_per_second': 263.766, 'train_steps_per_second': 4.165, 'total_flos': 0.0, 'train_loss': -2.643637847900391, 'epoch': 10.0})

In [None]:
from sklearn.metrics import accuracy_score
# Get model predictions on the test dataset
predictions = trainer.predict(eval_dataset).predictions
# Convert predictions to class labels (index of the largest score along axis 1)
predicted_labels = predictions.argmax(axis=1)
# Convert test labels to NumPy array for comparison
true_labels = test_labels.numpy()
# Calculate accuracy (fraction of exact matches)
accuracy = accuracy_score(true_labels, predicted_labels)
# Print accuracy as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 25.65%


In [None]:
# Inverse mapping: integer class id -> tense name.
# NOTE: this rebinds the global name tense_labels, which earlier cells use as a
# name -> id mapping; re-running those cells after this one will not work as before.
tense_labels = {
    0: 'present',
    1: 'future',
    2: 'past',
    3: 'present perfect continuous',
    4: 'future perfect',
    5: 'past perfect',
    6: 'future continuous',
    7: 'past perfect continuous',
    8: 'present continuous',
    9: 'past continuous',
    10: 'future perfect continuous',
    11: 'present perfect',
}

def predict_tense(text, model):
    """Predict the tense name of ``text`` using the notebook's tokenizer.

    Relies on two module-level names: ``tokenizer`` (to encode ``text``) and
    ``tense_labels`` (here an id -> name mapping).

    Args:
        text: Text to classify. A single input is expected because the
            predicted class is read with ``.item()``.
        model: ``torch.nn.Module`` taking ``(input_ids, attention_mask)`` and
            returning logits with the class scores along dimension 1. It is
            moved to the selected device as a side effect.

    Returns:
        The entry of ``tense_labels`` for the predicted class id.

    Raises:
        KeyError: If the predicted class id is not a key of ``tense_labels``.
    """
    # Prefer the first GPU, but fall back to the CPU so the function also
    # works on runtimes without CUDA.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    input_data = tokenizer(text, return_tensors="pt")
    input_data = {key: value.to(device) for key, value in input_data.items()}
    model = model.to(device)

    # Inference must run in evaluation mode; restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_data['input_ids'], input_data['attention_mask'])
    finally:
        model.train(was_training)

    predicted_label = torch.argmax(logits, dim=1).item()
    predicted_tense = tense_labels[predicted_label]
    return predicted_tense

# Example call of the two-argument predict_tense defined in this cell.
text_to_predict = "She is studying for her final exams."
predicted_tense = predict_tense(text_to_predict, model)
print(f"The predicted tense is: {predicted_tense}")


The predicted tense is: future
