# 情绪识别

## 调用需要使用的库

In [1]:
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split 
import pandas as pd
import transformers
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## 数据处理

In [3]:
dataset = pd.read_csv('dataset.csv').iloc[0:11000] #选取前11000条数据
dataset.head(10) #查看数据

Unnamed: 0,文本,情绪标签
0,气死姐姐了，快二是阵亡了吗，尼玛，一个半小时过去了也没上车,angry
1,妞妞啊，今天又承办了一个发文登记文号是126~嘻~么么哒~晚安哟,happy
2,这里还值得注意另一个事实，就是张鞠存原有一个东溪草堂为其读书处。,neutral
3,这在前华约国家(尤其是东德)使用R-73的首次联合演习期间，被一些北约组织的飞行员所证实。,neutral
4,TinyThief上wii了？！,surprise
5,每天都以紧张的心情工作，真的是太累，太不放松了，想要爆发一下,angry
6,语文军，数学军，英语军，物理军，政治军，历史军，生物军，地理军，八科联军，侵犯我班，我班战败...,angry
7,我不是一个优秀的演员……不能微笑着旁观你们幸福。,sad
8,当你变优秀时，你想要的都会来找你,happy
9,累了一天！会宿舍听下我搞基新歌！在看看我段宜恩美图！心都被治愈了,happy


In [4]:
dataset["情绪标签"].replace({
    "angry":0,
    "happy":1,
    "neutral":2,
    "surprise":3,
    "fear":4,
    "sad":5
},inplace=True)
dataset.head(5)

Unnamed: 0,文本,情绪标签
0,气死姐姐了，快二是阵亡了吗，尼玛，一个半小时过去了也没上车,0
1,妞妞啊，今天又承办了一个发文登记文号是126~嘻~么么哒~晚安哟,1
2,这里还值得注意另一个事实，就是张鞠存原有一个东溪草堂为其读书处。,2
3,这在前华约国家(尤其是东德)使用R-73的首次联合演习期间，被一些北约组织的飞行员所证实。,2
4,TinyThief上wii了？！,3


In [5]:
dataset.isnull().sum()

文本      0
情绪标签    0
dtype: int64

In [6]:
dataset.dropna(axis=0, how='any',inplace=True) #处理缺失值
dataset.isnull().sum()

文本      0
情绪标签    0
dtype: int64

In [7]:
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-chinese') #使用bert的tokenizer

#### 定义数据集

In [8]:
class MyDataset(Dataset):
    def __init__(self,dataset,tokenizer):
        self.X = tokenizer(dataset['文本'].tolist(),truncation=True, padding=True)
        self.Y = dataset['情绪标签'].astype('long').tolist()
        
        self.len = len(self.Y)
        
        self.input_ids = [torch.tensor(x).long() for x in self.X.input_ids]
        self.token_type_ids = [torch.tensor(x).long() for x in self.X.token_type_ids]
        self.attention_mask = [torch.tensor(x).long() for x in self.X.attention_mask]
        
    def __getitem__(self,index):
        data = {'features':{'input_ids': self.input_ids[index],
                            'token_type_ids': self.token_type_ids[index],
                            'attention_mask':self.attention_mask[index]
                           },
                'labels' : self.Y[index]
               }
        return data

        
    def __len__(self):
        return self.len

In [9]:
DataSet = MyDataset(dataset,tokenizer)
# 将数据的80%作为训练集，剩下20%作为测试集
train_data, test_data = train_test_split(DataSet, test_size = 0.2) 
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)#根据显卡的显存来选择，此处用的3090
test_loader = DataLoader(test_data, batch_size=128)

In [10]:
### 建立模型，使用bert作为预训练模型，进行一些微调

In [11]:
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=6)
lr = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr = lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
loss = torch.nn.CrossEntropyLoss()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### 开始训练过程

In [12]:
model.to(device)
model.train()
for epoch in range(10):
    print(f"{epoch+1} epoch start——")
    train_loss = []
    train_accs = []
    for i, batch in tqdm(enumerate(train_loader)):
        labels = batch['labels'].to(device)
        input_ids = batch['features']['input_ids'].to(device)
        token_type_ids = batch['features']['token_type_ids'].to(device)
        attention_mask = batch['features']['attention_mask'].to(device)
        out = model(input_ids, attention_mask=attention_mask, labels=labels)
        l = out[0]
        #l = loss(out,batch['labels'].to(device))
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        out[0].cpu()
        acc = (out[1].argmax(dim=-1).cpu() == batch['labels']).float().mean()
        train_loss.append(l.item())
        train_accs.append(acc)
    train_loss = sum(train_loss)/len(train_loss)
    train_accs = sum(train_accs)/len(train_accs)
    print(f"[ Train | {epoch+1:03d} ] loss = {train_loss:.5f} , acc = {train_accs:.5f}")

1 epoch start——


69it [01:06,  1.03it/s]


[ Train | 001 ] loss = 1.23002 , acc = 0.58507
2 epoch start——


69it [00:48,  1.41it/s]


[ Train | 002 ] loss = 0.75096 , acc = 0.74438
3 epoch start——


69it [00:49,  1.41it/s]


[ Train | 003 ] loss = 0.57242 , acc = 0.80929
4 epoch start——


69it [00:49,  1.41it/s]


[ Train | 004 ] loss = 0.47094 , acc = 0.83945
5 epoch start——


69it [00:49,  1.41it/s]


[ Train | 005 ] loss = 0.37475 , acc = 0.88244
6 epoch start——


69it [00:48,  1.41it/s]


[ Train | 006 ] loss = 0.30016 , acc = 0.90610
7 epoch start——


69it [00:48,  1.41it/s]


[ Train | 007 ] loss = 0.23708 , acc = 0.92973
8 epoch start——


69it [00:48,  1.41it/s]


[ Train | 008 ] loss = 0.18849 , acc = 0.94467
9 epoch start——


69it [00:48,  1.41it/s]


[ Train | 009 ] loss = 0.13877 , acc = 0.96403
10 epoch start——


69it [00:48,  1.41it/s]

[ Train | 010 ] loss = 0.11111 , acc = 0.96981





### 测试模型的表现（明显有点过拟合了）

In [13]:
model.to(device)
model.eval()
test_loss = []
test_accs = []
for i, batch in tqdm(enumerate(test_loader)):
    with torch.no_grad():
        labels = batch['labels'].to(device)
        input_ids = batch['features']['input_ids'].to(device)
        token_type_ids = batch['features']['token_type_ids'].to(device)
        attention_mask = batch['features']['attention_mask'].to(device)
        out = model(input_ids, attention_mask=attention_mask, labels=labels)
    acc = (out[1].argmax(dim=-1).cpu() == batch['labels']).float().mean()
    test_loss.append(l.item())
    test_accs.append(acc)
test_loss = sum(test_loss)/len(test_loss)
test_accs = sum(test_accs)/len(test_accs)
print(f"[ Train | {epoch+1:03d} ] loss = {test_loss:.5f} , acc = {test_accs:.5f}")

18it [00:04,  4.01it/s]

[ Train | 010 ] loss = 0.13033 , acc = 0.74711





In [14]:
torch.save(model,'final_model.pth') #保存我们的整个模型

### 用几个例子来看看我们模型的表现

In [15]:
def Predict_F(example,tokenizer,model):
    labels = torch.tensor([1])
    labels = labels.to(device)
    token = tokenizer(example)
    input_ids = torch.tensor(token.input_ids).to(device)
    token_type_ids = torch.tensor(token.token_type_ids).to(device)
    attention_mask = torch.tensor(token.attention_mask).to(device)
    with torch.no_grad():
        predict = model(input_ids, attention_mask=attention_mask, labels=labels)[1].argmax(dim=-1)
    labels = ["angry","happy","neutral","surprise","fear","sad"]
    return labels[predict]

In [16]:
example = ["为啥有人把我东西拿了，他在哪里！我要找到他！"]
pre = Predict_F(example,tokenizer,model)
print(pre)

angry


In [17]:
example = ["在复旦读书挺好的"]
pre = Predict_F(example,tokenizer,model)
print(pre)

happy
