In [167]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
import torch.optim as optim
import torch.autograd as autograd
import torchtext.vocab as torchvocab
from torch.autograd import Variable
import numpy as np
import tqdm
import os
import time
import re
import pandas as pd
import string
import gensim
import time
import random
import collections
from collections import Counter
from nltk.corpus import stopwords
from itertools import chain
from sklearn import manifold
# One seed shared by every random number generator the notebook uses.
SEED = 1234
# Set cuDNN's deterministic flag.
torch.backends.cudnn.deterministic = True
# Seed PyTorch, NumPy and the stdlib generator (the calls are independent).
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# 下面是colab环境配置

In [168]:
# Install google-drive-ocamlfuse (a FUSE client for Google Drive) and authorise it.
# NOTE(review): the recorded output of this cell reports that
# 'python-software-properties' has no installation candidate — confirm whether
# that package is still needed.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): `2>&1 > /dev/null` discards stdout only; stderr still reaches the
# cell output. Use `> /dev/null 2>&1` if both were meant to be silenced — confirm.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: print the authorisation URL (only lines containing "URL" are kept).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it in the notebook.
vcode = getpass.getpass()
# Second pass: feed the verification code to the client on stdin.
# NOTE(review): the code and client secret are interpolated into a shell command
# line — treat this like os.system with secrets.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

E: Package 'python-software-properties' has no installation candidate
··········


In [169]:
# Create a local directory named "drive" and mount the Google Drive root onto it.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [170]:
# Set the current working directory.
# NOTE(review): `os` is already imported in the first cell; this import is redundant.
import os

# Path inside Google Drive; it must start with "drive", the mount root created earlier.
os.chdir("drive/Colab Notebooks") 

# 开始正文预测

In [256]:
# The single sentence that the rest of the notebook classifies.
test = "疫情虽然暂时得到控制，但是还是要时刻保持警惕，黑龙江加油"

In [257]:
# Maps a predicted class index (0-6) to the emotion label printed to the user.
# NOTE(review): "perdit" looks like a typo for "predict"; later cells reference
# this exact name, so renaming needs a coordinated change.
perdit_zh = {0:"害怕", 1:"恶心", 2:"乐观", 3:"惊讶", 4:"感激", 5:"伤心", 6:"愤怒"}

In [258]:
class textCNN(nn.Module):
    """Text classifier with three parallel single-channel convolutions.

    Each convolution spans the full embedding width with a window of 3, 4 or
    5 tokens. Its activations are max-pooled over the sequence axis, and the
    three pooled scalars feed a linear layer that emits one score per label.

    Args:
        vocab_size: Accepted for interface compatibility; not read here (the
            embedding table takes its size from ``weight``).
        embed_size: Width of an embedding vector, used as the kernel width.
        seq_len: Number of tokens per sample; the pooling windows are sized
            from it.
        labels: Number of output classes.
        weight: Pretrained embedding matrix passed to
            ``nn.Embedding.from_pretrained``.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, seq_len, labels, weight, **kwargs):
        super().__init__(**kwargs)
        self.labels = labels
        # freeze=False leaves the pretrained vectors trainable.
        self.embedding = nn.Embedding.from_pretrained(weight, freeze=False)

        # One convolution per n-gram window; each kernel covers the whole
        # embedding width, so its output has width 1.
        self.conv1 = nn.Conv2d(1, 1, (3, embed_size))
        self.conv2 = nn.Conv2d(1, 1, (4, embed_size))
        self.conv3 = nn.Conv2d(1, 1, (5, embed_size))

        # Each pooling window equals the matching convolution's output height
        # (seq_len - window + 1), reducing it to a single value.
        self.pool1 = nn.MaxPool2d((seq_len - 2, 1))
        self.pool2 = nn.MaxPool2d((seq_len - 3, 1))
        self.pool3 = nn.MaxPool2d((seq_len - 4, 1))

        self.linear = nn.Linear(3, labels)
        # Kept as an attribute of the module; forward() does not call it.
        self.drop = nn.Dropout(p=0.5)

    def forward(self, inputs):
        """Return a ``(batch, labels)`` tensor of class scores for ``inputs``."""
        batch, steps = inputs.shape[0], inputs.shape[1]
        # Add a channel axis: (batch, steps) ids -> (batch, 1, steps, embed).
        embedded = self.embedding(inputs).view(batch, 1, steps, -1)

        branches = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        pooled = [pool(F.relu(conv(embedded))) for conv, pool in branches]

        # Concatenate the three pooled maps and flatten them per sample.
        features = torch.cat(pooled, 1).view(batch, 1, -1)
        scores = self.linear(features)
        return scores.view(-1, self.labels)

In [259]:
# Load the saved gensim Word2Vec model and keep only its word-vector table (.wv).
wvmodel=gensim.models.Word2Vec.load("weibo_zh.model").wv # the word-vector model mentioned earlier

In [260]:
# Load the whole pickled model object onto the CPU.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load files
# from a trusted source. Presumably the file holds a textCNN instance, in which
# case the class must already be defined when this cell runs — confirm.
model = torch.load("cnn-model.pkl", map_location='cpu')

In [261]:
# Character-level tokenisation: one token per character of the test sentence.
vocab = list(test)

In [262]:
# Settings for the prediction run.
embed_size = 200  # width of the rows in the `weight` matrix built below
bidirectional = True  # NOTE(review): not referenced in the visible cells
batch_size = 50  # batch size handed to the DataLoader
lr = 0.001  # NOTE(review): not referenced in the visible cells
device = torch.device('cpu')  # input tensors are moved here before inference
use_gpu = False  # NOTE(review): not referenced in the visible cells

In [263]:
# Token <-> id lookup tables built from the test sentence: ids start at 1 and
# id 0 is reserved for '<unk>'. A token that occurs more than once maps to the
# id of its last occurrence in word_to_idx.
# NOTE(review): the loaded model presumably expects the ids of the vocabulary it
# was trained with; ids derived from this sentence may not line up — confirm.
word_to_idx = dict(zip(vocab, range(1, len(vocab) + 1)))
word_to_idx['<unk>'] = 0
idx_to_word = dict(enumerate(vocab, start=1))
idx_to_word[0] = '<unk>'

In [264]:
# Embedding matrix for the local vocabulary: one row per id in word_to_idx.
# Rows whose token is missing from the word2vec model stay all-zero.
weight = torch.zeros(len(vocab) + 1, embed_size)
# Iterate over the small local vocabulary instead of the whole word2vec
# vocabulary. An explicit membership test replaces the former bare `except:`,
# and avoids the `index2word` attribute, which gensim 4 removed.
for word, index in word_to_idx.items():
    if word not in wvmodel:
        continue
    weight[index, :] = torch.from_numpy(wvmodel.get_vector(word))

In [265]:
# Display the embedding matrix built in the previous cell.
weight

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-6.3274,  1.0385, -1.3764,  ..., -1.6645,  1.8736, -2.4250],
        [ 5.4094,  0.3381,  0.5112,  ..., -4.0594, -3.0026,  1.5176],
        ...,
        [ 0.0119, -0.1237,  2.4795,  ...,  2.0439,  2.6481,  1.4656],
        [ 1.1728,  1.8094,  0.4729,  ...,  1.9941, -2.2353,  0.3307],
        [-1.9601, -2.7114, -2.3524,  ...,  0.0860,  4.1026,  3.7395]])

In [266]:
def encode_samples(tokenized_samples):
    """Convert token sequences into sequences of vocabulary ids.

    Tokens are looked up in the module-level ``word_to_idx`` mapping; a token
    that is not a key of that mapping is encoded as 0.

    Args:
        tokenized_samples: Iterable of token iterables.

    Returns:
        A list holding one list of ids per input sample.
    """
    unknown_id = 0
    return [
        [word_to_idx.get(token, unknown_id) for token in sample]
        for sample in tokenized_samples
    ]

In [267]:
def pad_samples(features, maxlen, PAD=0):
    """Truncate or right-pad every id sequence to exactly ``maxlen`` items.

    The input sequences are never modified: each result is a fresh list.
    (The previous version appended padding directly to short input lists,
    silently changing the caller's data.)

    Args:
        features: Iterable of id sequences.
        maxlen: Target length of every returned sequence.
        PAD: Value appended to sequences shorter than ``maxlen``.

    Returns:
        A list of new lists, one per input sequence.
    """
    padded_features = []
    for feature in features:
        # Slicing copies, so the extend below cannot touch the caller's list.
        padded_feature = list(feature[:maxlen])
        padded_feature.extend([PAD] * (maxlen - len(padded_feature)))
        padded_features.append(padded_feature)
    return padded_features

In [268]:
# Encode the single tokenised sentence, pad/truncate it to 60 ids, and wrap it in a tensor.
# NOTE(review): 60 presumably has to equal the seq_len the loaded model was built
# with (textCNN sizes its pooling windows from seq_len) — confirm.
test_features = torch.tensor(pad_samples(encode_samples([vocab]),maxlen=60))

In [269]:
# Wrap the feature tensor in a dataset; each item is a 1-tuple holding one sample.
test_set = torch.utils.data.TensorDataset(test_features)

In [270]:
# Batch the dataset in its original order (no shuffling).
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                        shuffle=False)

In [271]:
# Put the model in inference mode so that mode-dependent layers (e.g. dropout)
# behave deterministically.
model.eval()
with torch.no_grad():
    # The dataset wraps a single tensor, so every batch unpacks to one element.
    for (vectors,) in test_iter:
        vectors = vectors.to(device)
        # Raw class scores, one row per sample in the batch.
        predictions = model(vectors)
        # Index of the highest score per row.
        rounded_preds = predictions.argmax(dim=1)
# `predictions` / `rounded_preds` hold the last batch only; the single test
# sentence yields exactly one batch.

In [272]:
# Report the predicted label; `.item()` requires exactly one prediction.
print("该文本预测为 {} 类文本".format(perdit_zh[rounded_preds.item()]))

该文本预测为 愤怒 类文本


In [273]:
# Flatten the score tensor row by row into plain Python floats.
values = [float(score) for row in predictions for score in row]
# Print the score of every label, in label-id order.
for label_id in range(len(perdit_zh)):
    print(" {} 类文本得分为{}".format(perdit_zh[label_id], values[label_id]))


 害怕 类文本得分为-96.76004028320312
 恶心 类文本得分为2.973323345184326
 乐观 类文本得分为-15.055810928344727
 惊讶 类文本得分为6.123964309692383
 感激 类文本得分为-9.662149429321289
 伤心 类文本得分为-33.23273849487305
 愤怒 类文本得分为16.62092399597168


In [274]:
# Display the raw score list collected in the previous cell.
values

[-96.76004028320312,
 2.973323345184326,
 -15.055810928344727,
 6.123964309692383,
 -9.662149429321289,
 -33.23273849487305,
 16.62092399597168]