# 预测推文是否关于真正的灾害

Kaggle: https://www.kaggle.com/c/nlp-getting-started

# 导入所需的库，读入数据集

In [4]:
import pandas as pd
import numpy as np
import re
import string

import torch
from torch import nn

In [5]:
# Read the competition's train and test splits.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Pull the target column out of the training frame and keep it aside,
# then remove the 'id' column from both splits.
label = train.pop('target')
train.drop(columns=['id'], inplace=True)
test.drop(columns=['id'], inplace=True)

# Stack train on top of test so the text cleaning below runs once over
# every tweet.
data = pd.concat([train, test], axis=0)
data.head()

Unnamed: 0,keyword,location,text
0,,,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...


# 数据预处理

## EDA，探索性数据分析
根据[kaggle上的code](https://www.kaggle.com/code/shahules/basic-eda-cleaning-and-glove#Exploratory-Data-Analysis-of-tweets)EDA部分的分析说明，开始进行预处理。

## 数据清洗

做一些基础的清洗，比如去除标点符号、去除html标签以及emoji等。

In [6]:
# removing html tags

def remove_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is stripped on its own and the
    text between two tags is preserved.
    """
    return re.sub(r'<.*?>', r'', text)

# Strip HTML tags from every tweet in the combined frame.
data['text'] = data['text'].map(remove_html)

In [7]:
# removing emojis
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b

# Compiled once at import time; ``remove_emoji`` is called once per tweet.
#
# The character class lists symbol/emoji blocks only. A single catch-all
# range U+24C2..U+1F251 is deliberately NOT used: it also spans CJK
# ideographs, kana, Hangul and full-width forms, so it would delete ordinary
# non-Latin text along with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F170-\U0001F251"  # enclosed alphanumerics/ideographs, incl. flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Each run of consecutive characters from the blocks listed in
    ``_EMOJI_PATTERN`` is deleted; nothing is inserted in its place, so
    surrounding whitespace is left as it was. Characters outside those
    blocks, including non-Latin scripts, are kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Strip emoji from every tweet in the combined frame.
data['text'] = data['text'].map(remove_emoji)

In [8]:
# removing punctuations

def remove_punc(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters removed are exactly those in ``string.punctuation``;
    nothing is substituted for them.
    """
    # Mapping a code point to None tells str.translate to drop it.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Strip ASCII punctuation from every tweet in the combined frame.
data['text'] = data['text'].map(remove_punc)

In [9]:
# Only the tweet text is kept as a feature.
data.drop(columns=['keyword', 'location'], inplace=True)

## 划分数据集

In [10]:
# The first len(train) rows of the combined frame are the training tweets;
# everything after them is the test set.
n_train = len(train)
train_set = data.iloc[:n_train]
test_set = data.iloc[n_train:]

# Put the target column (set aside at load time) back next to the text.
train_set = pd.concat([train_set, label], axis=1)

# Plain arrays of the training texts and their labels.
sentences = train_set['text'].to_numpy()
labels = train_set['target'].to_numpy()

## 处理文本数据

词元化

In [11]:
from transformers import BertTokenizer

# Load the tokenizer published with the 'bert-base-uncased' checkpoint;
# do_lower_case=True asks it to lower-case the text before tokenizing.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
# Show the first tweet at each stage: raw text, tokens, vocabulary ids.
first_sentence = sentences[0]
print('Original: ', first_sentence)

first_tokens = tokenizer.tokenize(first_sentence)
print('Tokenized: ', first_tokens)

first_ids = tokenizer.convert_tokens_to_ids(first_tokens)
print('Token ID: ', first_ids)


Original:  Our Deeds are the Reason of this earthquake May ALLAH Forgive us all
Tokenized:  ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']
Token ID:  [2256, 15616, 2024, 1996, 3114, 1997, 2023, 8372, 2089, 16455, 9641, 2149, 2035]


格式化

1. 添加特殊词元
2. 将每个句子填充或截断为一个固定长度
3. 使用`attention mask`显式区分真实词元和填充词元

特殊词元：
* [SEP]，添加在每个句子的末尾
* [CLS]，添加在每个句子的开头；Transformer 层的输出与输入等长，分类时只需取第一个位置（即 [CLS]）的输出向量送入分类层即可

词元数据集

In [13]:
# Find the longest training tweet, measured in token ids with the special
# tokens included.
encoded_lengths = [
    len(tokenizer.encode(sent, add_special_tokens=True))
    for sent in sentences
]
max_len = max(encoded_lengths, default=0)

print('Max length: ', max_len)

Max length:  55


In [14]:
# Tokenize all of the sentences and map the tokens to their word IDs.
#
# Calling the tokenizer on the whole list will, for every sentence:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`.
#   (6) Create attention masks for [PAD] tokens.
encoded = tokenizer(
    sentences.tolist(),
    add_special_tokens=True,
    max_length=75,
    padding='max_length',
    # Required for step (5): padding alone never shortens a sentence, so an
    # over-long input would otherwise yield a row of a different width.
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# One row per sentence, each row `max_length` ids wide.
input_ids = encoded['input_ids']
# Same layout as input_ids (simply differentiates padding from non-padding).
attention_masks = encoded['attention_mask']
labels = torch.tensor(labels)

In [15]:
# Show the second training tweet next to its padded id sequence.
for caption, value in (('Originals: ', sentences[1]), ('Token IDs: ', input_ids[1])):
    print(caption, value)

Originals:  Forest fire near La Ronge Sask Canada
Token IDs:  tensor([  101,  3224,  2543,  2379,  2474,  6902,  3351, 21871,  2243,  2710,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])


In [16]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Bundle ids, masks and labels so that one index yields one training example.
dataset = TensorDataset(input_ids, attention_masks, labels)

batch_size = 32

# The sampler hands out examples in random order.
train_sampler = RandomSampler(dataset)
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

# 建模并训练

使用BertForSequenceClassification。这是正常的BERT模型，顶部添加了单个线性层进行分类，用作句子分类器。当我们输入数据时，整个预训练的BERT模型和额外的未经训练的分类层在我们的特定任务中进行训练。

In [17]:
from transformers import BertForSequenceClassification, BertConfig

# The last two arguments stop the model from returning attention weights and
# hidden states.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Use the GPU when one is available; otherwise keep the model on the CPU
# instead of raising.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [18]:
# Plain Adam from PyTorch. Note that this is not AdamW: no weight-decay
# argument is passed here.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=5e-5,
    eps=1e-8,
)

# Number of full passes over the training data.
epoch = 5

# Compute the classification accuracy.
def accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of true class indices; it is flattened before the
            comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).reshape(-1)
    expected = labels.reshape(-1)
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [19]:
# One dict per epoch: {'epoch': ..., 'Training Loss': ...}.
training_stats = []

# Batches are moved to whichever device the model's weights already live on,
# so the loop does not assume a GPU.
device = next(model.parameters()).device

for i in range(epoch):
    print('Epoch {}/{}'.format(i + 1, epoch))
    print('-' * 10)

    total_train_loss = 0

    model.train()
    for batch in train_loader:
        # Each batch holds (input ids, attention mask, labels).
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass. With return_dict=False the model returns a tuple
        # whose first element is the loss when labels are supplied.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
            return_dict=False,
        )
        loss = outputs[0]
        total_train_loss += loss.item()

        # Backward pass.
        loss.backward()

        # Gradient clipping: cap the total gradient norm at 1.0.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    # Mean of the per-batch losses over this epoch.
    avg_train_loss = total_train_loss / len(train_loader)

    print('')
    print('Training Loss: {:.3f}'.format(avg_train_loss))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': i + 1,
            'Training Loss': avg_train_loss,
        }
    )

print("")
print('Training complete!')

Epoch 1/5
----------

Training Loss: 0.438
Epoch 2/5
----------

Training Loss: 0.313
Epoch 3/5
----------

Training Loss: 0.195
Epoch 4/5
----------

Training Loss: 0.148
Epoch 5/5
----------

Training Loss: 0.106

Training complete!


In [20]:
# Show floats with two decimals. The fully qualified option name is used
# because the bare 'precision' key is not accepted by current pandas.
pd.set_option('display.precision', 2)

# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')
df_stats

Unnamed: 0_level_0,Training Loss
epoch,Unnamed: 1_level_1
1,0.44
2,0.31
3,0.19
4,0.15
5,0.11


# 预测

In [21]:
sentences = test_set['text'].values

# Encode every test tweet in one call: add [CLS]/[SEP], map tokens to ids,
# then pad or truncate each row to exactly `max_length` ids.
encoded = tokenizer(
    sentences.tolist(),
    add_special_tokens=True,
    max_length=75,
    padding='max_length',
    # Padding alone never shortens a sentence; without truncation an
    # over-long tweet would yield a row of a different width.
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

batch_size = 32
# No sampler is given, so batches follow the row order of test_set.
test_loader = DataLoader(TensorDataset(input_ids, attention_masks), batch_size=batch_size)

In [22]:
# Per-batch logit arrays, in test_loader order.
predictions = []

# Switch to inference mode: the model was last put in train() mode, which
# keeps dropout active and makes the predictions noisy.
model.eval()

# Batches are moved to whichever device the model's weights live on.
device = next(model.parameters()).device

for batch in test_loader:
    # Each batch holds (input ids, attention mask); there are no labels.
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Without labels, the first output element holds the logits.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    predictions.append(logits)

In [23]:
# Stack the per-batch logits into one array and pick the higher-scoring
# class for every row.
all_logits = np.concatenate(predictions, axis=0)
flat_pred = all_logits.argmax(axis=1).flatten()

# Overwrite the 'target' column of the sample submission with the
# predictions and save the result without the index column.
submission = pd.read_csv('./data/sample_submission.csv')
submission['target'] = flat_pred
submission.to_csv('./data/submission.csv', index=False)