<a href="https://colab.research.google.com/github/Somani-Harsh/data-science-python/blob/master/Chinese_news_article_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import drive
 
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
!pip install stopwordsiso

Collecting stopwordsiso
[?25l  Downloading https://files.pythonhosted.org/packages/3e/03/4c5f24b654bb9459f81aa5c1b60b094b804286b99dca9f2e116c9eb01ac8/stopwordsiso-0.6.1-py3-none-any.whl (73kB)
[K     |████████████████████████████████| 81kB 5.3MB/s 
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1


In [8]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/f4/9f93f06dd2c57c7cd7aa515ffbf9fcfd8a084b92285732289f4a5696dd91/transformers-3.2.0-py3-none-any.whl (1.0MB)
[K     |▎                               | 10kB 26.3MB/s eta 0:00:01[K     |▋                               | 20kB 6.3MB/s eta 0:00:01[K     |█                               | 30kB 6.1MB/s eta 0:00:01[K     |█▎                              | 40kB 6.5MB/s eta 0:00:01[K     |█▋                              | 51kB 6.7MB/s eta 0:00:01[K     |██                              | 61kB 7.5MB/s eta 0:00:01[K     |██▎                             | 71kB 7.8MB/s eta 0:00:01[K     |██▋                             | 81kB 7.6MB/s eta 0:00:01[K     |███                             | 92kB 8.0MB/s eta 0:00:01[K     |███▎                            | 102kB 8.2MB/s eta 0:00:01[K     |███▋                            | 112kB 8.2MB/s eta 0:00:01[K     |███▉                            | 122kB 8.2M

In [1]:
import pandas as pd
import numpy as np
import re
from stopwordsiso import stopwords
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
import torch
import tqdm
from transformers import get_linear_schedule_with_warmup

In [2]:
torch.cuda.is_available()

True

In [3]:
df = pd.read_csv("drive/My Drive/interviews/Sirion labs/train_data.csv")
print(df.shape)
df.head()

(2500, 3)


Unnamed: 0.1,Unnamed: 0,text,category
0,1497,﻿ 日月 光华 - - Traffic _ Info 精华区 文章 阅读- - - ...,traffic
1,436,﻿ 日本 去年 海外 资产 和 负债额 均 创 ...,economic
2,736,﻿ 梁斌 黄胄 联袂 在 京 举办 画展新华社 ...,art
3,2673,﻿ 记者来信 ： 乱 降价 也 有害新华社 北京...,economic
4,2266,﻿ 巴解 愿同 叙利亚 和解新华社 开罗 ５ 月...,politics


In [4]:
df = df[~df["text"].isna()]
df.shape

(2498, 3)

In [5]:
def preprocess_text(text):
  text = re.sub("-", " ", text)  # Remove dashes
  text = " ".join(list(filter(lambda x: x not in stopwords("zh"), text.split())))    # Remove Stopwords
  return text


In [6]:
df["text"] = df["text"].apply(preprocess_text)
df["text"] = df["text"].apply(lambda x: "[CLS] " + x + " [END]")

In [7]:
df["text"].str.split().apply(len).describe(percentiles=(0.5, 0.75, 0.8, 0.9, 0.95, 0.99))

count    2498.000000
mean      326.978383
std       495.234674
min        19.000000
50%       168.000000
75%       322.750000
80%       395.000000
90%       702.600000
95%      1199.100000
99%      2502.870000
max      6248.000000
Name: text, dtype: float64

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=10)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
le = LabelEncoder()
labels = le.fit_transform(df["category"])
print(le.classes_)

['art' 'computer' 'economic' 'education' 'environment' 'medical'
 'military' 'politics' 'sports' 'traffic']


In [10]:
X_train, X_test, y_train, y_test = train_test_split(df["text"].tolist(), labels, 
                                                    test_size=0.2, stratify=labels, random_state=40)

In [11]:
encoding = tokenizer(X_train, return_tensors='pt', padding=True, truncation=True, \
                     max_length=512)

train_input_ids = encoding["input_ids"]
train_attention_mask = encoding["attention_mask"]

In [12]:
encoding = tokenizer(X_test, return_tensors='pt', padding=True, truncation=True, \
                     max_length=512)

test_input_ids = encoding["input_ids"]
test_attention_mask = encoding["attention_mask"]

In [13]:
num_labels = len(le.classes_)

In [14]:
model.classifier

Linear(in_features=768, out_features=10, bias=True)

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [16]:
batch_size = 16
n_epochs = 5
N = len(train_input_ids)
N

1998

In [17]:
# model.to(device)

In [18]:
optimizer = torch.optim.AdamW(model.parameters() , lr=2e-3)
criterion = torch.nn.CrossEntropyLoss()
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=125*n_epochs)

In [20]:
for epoch_i in range(n_epochs):
    print("Epoch : ", epoch_i)

    model.train()

    for i in range(N//batch_size + 1):
        
        inputs = train_input_ids[i*batch_size:(i+1)*batch_size].to(device)
        mask = train_attention_mask[i*batch_size:(i+1)*batch_size].to(device)
        target = torch.from_numpy(y_train[i*batch_size:(i+1)*batch_size]).to(device)
        
        optimizer.zero_grad()
        
        pred = model(inputs, mask)[0]

        loss = criterion(pred, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        accuracy = sum(np.argmax(pred.detach().cpu().numpy(), axis=1) == target.cpu().numpy()) / len(target)

        if i % 30 == 29:
            print("Step ", i, " Loss: ", loss, " Acc:", accuracy)

    # true_labels = 0
    # for i in range(len(test)//batch_size + 1):
    #     test_inputs = test_input_ids[i*batch_size:(i+1)*batch_size].to(device)
    #     test_mask = test_attention_mask[i*batch_size:(i+1)*batch_size].to(device)

    #     test_pred = net(test_inputs, test_mask)
    #     test_target = y_test[i*batch_size:(i+1)*batch_size]
    #     true_labels += sum(np.argmax(net(test_input_ids.to(device), test_attention_mask.to(device)).detach().numpy(), axis=1) == test_target)

    # print("Validation Accuracy :" , true_labels/len(y_test))
        

Epoch :  0
Step  29  Loss:  tensor(2.2328, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.25
Step  59  Loss:  tensor(2.3368, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.125
Step  89  Loss:  tensor(2.1748, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.25
Step  119  Loss:  tensor(2.2084, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.125
Epoch :  1
Step  29  Loss:  tensor(2.2420, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.1875
Step  59  Loss:  tensor(2.3265, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.25
Step  89  Loss:  tensor(2.1674, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.25
Step  119  Loss:  tensor(2.2600, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.125
Epoch :  2
Step  29  Loss:  tensor(2.1469, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.3125
Step  59  Loss:  tensor(2.2679, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.1875
Step  89  Loss:  tensor(2.2508, device='cuda:0', grad_fn=<NllLossBackward>)  Acc: 0.06

In [21]:
torch.save(model.state_dict(), "drive/My Drive/interviews/Sirion labs/model.pickle")

In [32]:
torch.cuda.empty_cache()
del loss
del train_attention_mask
del train_input_ids

NameError: ignored

In [18]:
model.load_state_dict(torch.load("drive/My Drive/interviews/Sirion labs/model.pickle"))

<All keys matched successfully>

In [22]:
# model = model.cpu()
model = model.to(device)
_  = model.eval()
batch_size = 4

In [23]:
true_labels = 0
for i in tqdm.tqdm(range(len(y_test)//batch_size + 1)):
    test_inputs = test_input_ids[i*batch_size:(i+1)*batch_size].to(device)
    test_mask = test_attention_mask[i*batch_size:(i+1)*batch_size].to(device)

    test_pred = model(test_inputs, test_mask)[0]
    test_target =  y_test[i*batch_size:(i+1)*batch_size]
    true_labels += sum(np.argmax(test_pred.detach().cpu().numpy(), axis=1) == test_target)

print("Validation Accuracy :" , true_labels/len(y_test))

100%|██████████| 126/126 [00:09<00:00, 13.21it/s]

Validation Accuracy : 0.16



