In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import re
from transformers import AutoTokenizer ,BertForSequenceClassification
from sklearn.metrics import classification_report

## 0. Read data

In [44]:
# Load the training split and the test split from the working directory
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [45]:
# Preview the first five training rows
train_data.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [46]:
# Preview the first five test rows
test_data.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## 1. Preprocessing
+ Convert NaN values to empty strings
+ Remove unnecessary characters
+ Convert words to lowercase

In [47]:
def processing(text:str):
    """Normalize one text field: strip punctuation and lowercase it.

    Args:
        text: Raw field value. Missing values (``None``/NaN) are accepted and
            treated as empty text; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The text with every character that is neither a word character nor
        whitespace removed, converted to lowercase.
    """
    if not isinstance(text, str):
        # Guard so the function does not raise when it is applied to a column
        # that still contains missing values.
        if pd.isna(text):
            return ""
        text = str(text)
    new_text = re.sub(r'[^\w\s]', '', text)
    return new_text.lower()

In [48]:
# Replace missing values (NaN) with empty strings in both splits
train_data, test_data = (frame.fillna("") for frame in (train_data, test_data))

In [49]:
# Clean every column of both splits except the id/target columns
for frame in (train_data, test_data):
    for column in frame.columns:
        if column not in ("id", "target"):
            frame[column] = frame[column].apply(processing)

In [50]:
def join_col(data:pd.Series):
    """Merge the text fields of one training row into a single model input.

    Args:
        data: Row with ``keyword``, ``location``, ``text`` and ``target`` entries.

    Returns:
        ``(joined_text, target)`` where ``joined_text`` holds the non-empty
        fields separated by single spaces.
    """
    fields = (data["keyword"], data["location"], data["text"])
    # Plain concatenation glued the last word of one field to the first word of
    # the next ("ablaze" + "london" -> "ablazelondon"), creating tokens that never
    # occur in the tweet. Empty fields are skipped so no stray spaces appear.
    # NOTE: weights fine-tuned on the old fused format should be re-trained.
    x = " ".join(field for field in fields if isinstance(field, str) and field)
    return (x, data["target"])

In [51]:
# Build one (joined_text, target) tuple per training row
new_train_data = train_data.apply(join_col, axis="columns").to_numpy().tolist()

## 2. Transform data

In [52]:
class Mydata(torch.utils.data.Dataset):
    """Map-style dataset over an in-memory, indexable collection of samples."""

    def __init__(self, data):
        # Keep a reference to the samples; nothing is copied or transformed.
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        samples = self.data
        return samples[idx]

In [53]:
# Hold out 20% of the labelled rows for validation.
# A fixed seed keeps the split identical across kernel restarts, so validation
# losses from different runs are comparable.
SPLIT_SEED = 42
train, valid = train_test_split(new_train_data, test_size=0.2, random_state=SPLIT_SEED)

In [54]:
# Wrap both splits in the Dataset class so they can be handed to a DataLoader
train, valid = Mydata(train), Mydata(valid)

In [55]:
# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

In [56]:
# Report the tokenizer's model_max_length setting
print("Model max_length: ", tokenizer.model_max_length)

Model max_length:  512


In [57]:
def collate_fn(data: list[tuple[str, int]]):
    """Turn a batch of ``(text, label)`` pairs into model-ready tensors.

    Args:
        data: Samples of one batch, each a ``(text, label)`` pair.

    Returns:
        ``(input_ids, labels)``: the token ids of the batch as one tensor
        (tokenized with ``padding=True`` and ``truncation=True``) and the
        labels as a tensor.
    """
    texts = [content for content, _ in data]
    labels = [label for _, label in data]
    # Calling the tokenizer directly replaces the deprecated batch_encode_plus and
    # returns tensors itself, so no manual list -> tensor conversion is needed.
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    return encoded['input_ids'], torch.tensor(labels)

In [58]:
# Batches of 32 for both splits; only the training loader shuffles its samples
train_dataloader = torch.utils.data.DataLoader(
    train,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_dataloader = torch.utils.data.DataLoader(
    valid,
    batch_size=32,
    collate_fn=collate_fn,
)

In [59]:
# Sanity check: print the first training batch only, then stop iterating
for input_ids, labels in train_dataloader:
    print(input_ids, labels)
    break

tensor([[  101, 27885, 22779,  ...,     0,     0,     0],
        [  101, 11652,  2638,  ...,     0,     0,     0],
        [  101,  7738, 10760,  ...,     0,     0,     0],
        ...,
        [  101,  4586, 19718,  ...,     0,     0,     0],
        [  101,  8647, 14289,  ...,     0,     0,     0],
        [  101,  5968,  2860,  ...,     0,     0,     0]]) tensor([0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 1, 0, 1, 0, 0, 1, 1])


## 3. Model

In [60]:
class MyBertForClassification(torch.nn.Module):
    """Wrapper around ``BertForSequenceClassification`` that also returns hard predictions.

    The wrapped model is loaded from the ``bert-base-uncased`` checkpoint with
    ``num_labels`` output classes.
    """

    def __init__(self, num_labels) -> None:
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Run the classifier on a padded batch of token ids.

        Args:
            input_ids: Token-id tensor for the batch.
            labels: Optional class labels, forwarded to the wrapped model;
                pass ``None`` for inference.
            attention_mask: Optional mask forwarded to the wrapped model. When
                omitted it is derived from ``input_ids`` by marking every
                position that is not the padding token.

        Returns:
            The wrapped model's output with an extra ``'preds'`` entry holding
            the arg-max class index of each sample.
        """
        if attention_mask is None:
            pad_token_id = self.bert.config.pad_token_id
            if pad_token_id is not None:
                # Without a mask the encoder attends to padding, so a sample's
                # output would depend on how much padding its batch added.
                attention_mask = (input_ids != pad_token_id).long()
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # softmax is monotonic, so arg-max over the logits gives the same class.
        outputs['preds'] = torch.argmax(outputs['logits'], dim=-1)
        return outputs

In [61]:
# Binary classifier: one output class per value of the 0/1 target
model = MyBertForClassification(num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## 4. Train

In [66]:
# Training hyper-parameters
LR = 1e-4  # learning rate handed to the optimizer
EPOCH = 3  # number of passes over the training loader in the training loop
LOG_STEP = 90  # the training loop reports the mean running loss every LOG_STEP batches

In [63]:
# No separate criterion is needed: the wrapped BERT model returns the loss itself
# when labels are passed to it.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [67]:
# Number of batches per epoch in each loader
print(f"Train Loader Step: {len(train_dataloader)}")
print(f"Valid Loader Step: {len(valid_dataloader)}")

Train Loader Step: 191
Valid Loader Step: 48


In [68]:
# Fine-tune the model, or reuse weights saved by an earlier run.
# Guarding the training loop behind the checkpoint keeps "Restart & Run All"
# working on a machine without the checkpoint file, instead of failing on load.
CHECKPOINT_PATH = 'bert_classifier.pth'

try:
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust
    # (newer torch versions accept weights_only=True).
    state_dict = torch.load(CHECKPOINT_PATH, map_location='cpu')
except FileNotFoundError:
    state_dict = None

if state_dict is None:
    for epoch in range(EPOCH):
        # Validation below switches to eval mode, so training mode (dropout on)
        # has to be re-enabled at the start of every epoch.
        model.train()
        running_loss = 0.0
        print(f"Epoch {epoch}:")
        for i, (input_ids, labels) in enumerate(train_dataloader):
            optimizer.zero_grad()

            outputs = model(input_ids, labels)
            loss = outputs['loss']
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if i % LOG_STEP == LOG_STEP-1:
                print('[Epoch %d, Batch %d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / LOG_STEP))
                running_loss = 0.0

        # Mean validation loss after each epoch, computed without gradients.
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for input_ids, labels in valid_dataloader:
                outputs = model(input_ids, labels)
                valid_loss += outputs['loss'].item()
        print(f"Valid loss: {valid_loss / len(valid_dataloader)}")

    # Save the model
    torch.save(model.state_dict(), CHECKPOINT_PATH)
else:
    model.load_state_dict(state_dict)

# Inference mode for the prediction cells that follow
model.eval()

Epoch 0:
[Epoch 1, Batch 90] loss: 0.321
[Epoch 1, Batch 180] loss: 0.328
Valid loss: 0.3955647118079166
Epoch 1:
[Epoch 2, Batch 90] loss: 0.172
[Epoch 2, Batch 180] loss: 0.213
Valid loss: 0.5218357254440585
Epoch 2:
[Epoch 3, Batch 90] loss: 0.119
[Epoch 3, Batch 180] loss: 0.124
Valid loss: 0.4967885751587649


## 5. Test

#### 5.1 Transform data

In [69]:
def join_col(data:pd.Series):
    """Merge the text fields of one test row into a single model input.

    This redefines the earlier ``join_col`` used for training rows; the second
    tuple element here is the row's ``id`` instead of its ``target``.

    Args:
        data: Row with ``keyword``, ``location``, ``text`` and ``id`` entries.

    Returns:
        ``(joined_text, id)`` where ``joined_text`` holds the non-empty fields
        separated by single spaces.
    """
    fields = (data["keyword"], data["location"], data["text"])
    # Plain concatenation glued neighbouring fields into one word
    # ("ablaze" + "london" -> "ablazelondon"); empty fields are skipped so no
    # stray spaces appear.
    x = " ".join(field for field in fields if isinstance(field, str) and field)
    return (x, data["id"])

In [70]:
# Build one (joined_text, id) tuple per test row
new_test = test_data.apply(join_col, axis="columns").to_numpy().tolist()

In [95]:
def collate_fn(data: list[tuple[str, int]]):
    """Turn a batch of ``(text, id)`` pairs into model-ready tensors.

    This redefines the earlier ``collate_fn``; the second element of each pair
    is the row id rather than a label.

    Args:
        data: Samples of one batch, each a ``(text, id)`` pair.

    Returns:
        ``(input_ids, ids)``: the token ids of the batch as one tensor
        (tokenized with ``padding=True`` and ``truncation=True``) and the row
        ids as a tensor.
    """
    texts = [content for content, _ in data]
    # Named row_ids so the builtin ``id`` is not shadowed.
    row_ids = [row_id for _, row_id in data]
    # Calling the tokenizer directly replaces the deprecated batch_encode_plus and
    # returns tensors itself, so no manual list -> tensor conversion is needed.
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    return encoded['input_ids'], torch.tensor(row_ids)

In [105]:
# Tokenizes the whole test set as a single batch.
# NOTE(review): `inputs` and `ids` are not referenced by the later cells shown here — confirm before removing.
inputs,ids=collate_fn(new_test)

In [117]:
# One sample per batch, in the original row order (no shuffling)
test_dataloader = torch.utils.data.DataLoader(
    new_test,
    batch_size=1,
    collate_fn=collate_fn,
)

In [124]:
# Column-oriented accumulator for the submission: parallel lists of ids and predictions
result_test = dict(id=[], target=[])

In [125]:
# Predict a class for every test row and collect (id, prediction) pairs.
# Start from empty lists so re-running this cell does not append duplicate rows.
result_test['id'].clear()
result_test['target'].clear()

model.eval()  # make sure dropout is disabled regardless of which cells ran before
with torch.no_grad():
    for input_ids, idx in test_dataloader:
        outputs = model(input_ids=input_ids, labels=None)
        # extend + tolist handles any batch size, not only single-sample batches
        result_test['id'].extend(idx.tolist())
        result_test['target'].extend(outputs['preds'].tolist())

In [127]:
# Tabulate the collected predictions: one row per test id
df = pd.DataFrame(data=result_test)

In [132]:
# Write the submission file without the DataFrame index column
df.to_csv(path_or_buf="result.csv", index=False)