In [None]:
# -*- coding:utf-8 -*-

import collections
import os
import random
import sys

import jieba
import torch
import torch.utils.data as Data
import torchtext.vocab as vocab
from tqdm import tqdm

import nltk
import re
# nltk.download('punkt')

In [None]:
def save_vocab(vocab, path):
    """Write the vocabulary's tokens to a text file, one token per line.

    @params:
        vocab: object whose ``get_itos()`` returns the list of token strings
        path: destination file; it is created or overwritten, UTF-8 encoded
    """
    with open(path, 'w', encoding="utf8") as sink:
        tokens = vocab.get_itos()
        # Joined tokens followed by a single trailing newline.
        sink.write("\n".join(tokens) + "\n")

def read_vocab(vocab_path):
    """Rebuild a vocabulary from a text file holding one token per line.

    @params:
        vocab_path: UTF-8 text file with one token per line; empty lines are
            skipped
    @return: the object built by ``vocab.vocab`` from the tokens, in file order
    """
    vocab_dict = {}
    with open(vocab_path, 'r', encoding="utf8") as f:
        for line in f:
            # Remove the line terminator only when it is present. Slicing off
            # the last character unconditionally would truncate the final
            # token of a file that does not end with a newline.
            word = line[:-1] if line.endswith("\n") else line
            if word == "":
                continue
            vocab_dict[word] = 1
    # Every token is stored with count 1; min_freq=0 keeps all of them.
    return vocab.vocab(vocab_dict, min_freq=0)

In [None]:
def read_imdb(folder='train', data_root="data"):
    """Load the labelled reviews stored under ``data_root/folder/{pos,neg}``.

    @params:
        folder: sub-directory of ``data_root`` to read
        data_root: directory that contains the split folders
    @return: a randomly shuffled list of [review text, label] pairs; the text
        is lower-cased with newlines removed, the label is 1 for files under
        'pos' and 0 for files under 'neg'
    """
    samples = []
    for sentiment, score in (('pos', 1), ('neg', 0)):
        folder_name = os.path.join(data_root, folder, sentiment)
        print(folder_name)
        for entry in tqdm(os.listdir(folder_name)):
            input_file = os.path.join(folder_name, entry)
            print("reading file {0}".format(input_file), file=sys.stderr)
            # Read raw bytes and decode explicitly as UTF-8.
            with open(input_file, 'rb') as handle:
                raw_bytes = handle.read()
            text = raw_bytes.decode('utf-8')
            samples.append([text.replace('\n', '').lower(), score])
    random.shuffle(samples)
    return samples

def get_tokenized_imdb(data):  # split every review on single spaces, keeping each piece
    """
    @params:
        data: list whose elements are [text string, 0/1 label] pairs
    @return: list of token sequences, one per review, every token lower-cased
    """
    tokenized = []
    for review, _ in data:
        # Splitting on the literal ' ' means consecutive spaces produce
        # empty-string tokens.
        tokenized.append([piece.lower() for piece in review.split(' ')])
    return tokenized

def get_vocab_imdb(data, min_count=1):
    """
    @params:
        data: list whose elements are [text string, 0/1 label] pairs
        min_count: forwarded to ``vocab.vocab`` as ``min_freq``
    @return: the vocabulary built by ``vocab.vocab`` from the token counts,
        with "<pad>" inserted at index 0 and "<unk>" at index 1
    """
    # Count every token across the whole data set.
    counter = collections.Counter()
    for sentence in get_tokenized_imdb(data):
        counter.update(sentence)

    word_index = vocab.vocab(counter, min_freq=min_count)  # build the vocabulary

    # Add the padding and unknown-word tokens at fixed positions.
    for position, special in enumerate(("<pad>", "<unk>")):
        word_index.insert_token(special, position)
    return word_index

def preprocess_imdb(data, vocab, max_l=128):
    """
    @params:
        data: the raw data, a list of [text string, 0/1 label] pairs
        vocab: vocabulary supporting ``word in vocab`` and ``vocab[word]``;
            it is looked up for "<pad>" and "<unk>" as well
        max_l: number of indices kept per review
    @return:
        features: tensor of word indices, one row of ``max_l`` entries per review
        labels: tensor holding the second element of every pair in ``data``
    """

    def to_fixed_length(ids):
        # Truncate long sequences, right-pad short ones with the "<pad>" index.
        if len(ids) > max_l:
            return ids[:max_l]
        return ids + [vocab["<pad>"]] * (max_l - len(ids))

    def to_index(word):
        # Words missing from the vocabulary map to the "<unk>" index.
        if word in vocab:
            return vocab[word]
        return vocab["<unk>"]

    rows = [
        to_fixed_length([to_index(word) for word in words])
        for words in get_tokenized_imdb(data)
    ]
    features = torch.tensor(rows)
    labels = torch.tensor([score for _, score in data])
    return features, labels

def make_imdb_dataset(batch_size=64, max_length=128, min_count=5, data_root="data"):
    """Build the train/test data loaders and the vocabulary for the IMDB data.

    @params:
        batch_size: batch size of both data loaders
        max_length: number of word indices kept per review
        min_count: minimum token frequency, forwarded to ``get_vocab_imdb``
        data_root: directory holding the 'train' and 'test' folders, forwarded
            to ``read_imdb``; the default matches ``read_imdb``'s own default
    @return: (train_iter, test_iter, vocab); only the training loader shuffles
    """
    # Read the raw text data.
    train_data = read_imdb(folder="train", data_root=data_root)
    test_data = read_imdb(folder="test", data_root=data_root)
    # Build the vocabulary from the training split only; the test split is
    # indexed with that same vocabulary.
    vocab = get_vocab_imdb(train_data, min_count)
    # preprocess_imdb returns (features, labels); unpack them into the dataset.
    train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab, max_length))
    test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab, max_length))
    # Wrap the datasets in iterators.
    train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
    test_iter = Data.DataLoader(test_set, batch_size)

    return train_iter, test_iter, vocab


In [None]:
def read_weibo(tag='training-processed', data_root=os.path.join("data", "weibo_senti_100k")):
    """Read ``data_root/<tag>.csv`` into a shuffled list of [review, label] pairs.

    Each non-blank line is parsed as follows:
      * the first character is the label digit; 4 is mapped to 1 and any other
        digit is kept as-is;
      * the review is everything after the fifth comma, minus the last five
        characters of the line. Lines with fewer than five commas yield an
        empty review.
    Blank lines are skipped.

    @params:
        tag: file name without the ".csv" extension
        data_root: directory containing the file; the default is built with
            os.path.join so it resolves on every platform
    @return: randomly shuffled list of [review text, label] pairs
    @raises: ValueError when a non-blank line does not start with a digit
    """
    data = []
    input_file = os.path.join(data_root, "{0}.csv".format(tag))
    with open(input_file, 'r', encoding="utf8") as f:
        for line in tqdm(f):
            line = line.strip()
            if not line:
                # A blank line has no label character to read.
                continue
            first_digit = int(line[0])
            label = 1 if first_digit == 4 else first_digit
            # Split at the first five commas only; commas inside the review
            # text are preserved.
            fields = line.split(',', 5)
            # NOTE(review): the last five characters are dropped exactly as
            # before; the reason is not visible here - confirm against the
            # actual file format.
            review = fields[5][:-5] if len(fields) == 6 else ""
            data.append([review, label])  # review text and its label
    random.shuffle(data)
    return data

def get_tokenized_weibo(data):  # segment every review into tokens with jieba
    """
    @params:
        data: list whose elements are [text string, label] pairs
    @return: list of token lists, one per review, as produced by ``jieba.cut``
    """
    # More elaborate filtering logic could be added here.
    segmented = []
    for review, _ in data:
        segmented.append(list(jieba.cut(review)))
    return segmented

def get_vocab_weibo(data, min_count=1):
    '''
    @params:
        data: list whose elements are [text string, label] pairs
        min_count: forwarded to ``vocab.vocab`` as ``min_freq``
    @return: the vocabulary built by ``vocab.vocab`` from the token counts,
        with "<pad>" inserted at index 0 and "<unk>" at index 1
    '''
    # Count every token across the whole data set.
    counter = collections.Counter()
    for sentence in get_tokenized_weibo(data):
        counter.update(sentence)

    word_index = vocab.vocab(counter, min_freq=min_count)  # build the vocabulary

    # Add the padding and unknown-word tokens at fixed positions.
    word_index.insert_token("<pad>", 0)
    word_index.insert_token("<unk>", 1)
    return word_index

def preprocess_weibo(data, vocab, max_l=64):
    """
    @params:
        data: the raw data, a list of [text string, label] pairs
        vocab: vocabulary supporting ``word in vocab`` and ``vocab[word]``;
            it is looked up for "<pad>" and "<unk>" as well
        max_l: number of indices kept per review
    @return:
        features: tensor of word indices, one row of ``max_l`` entries per review
        labels: tensor holding the second element of every pair in ``data``
    """

    def lookup(word):
        # Words missing from the vocabulary map to the "<unk>" index.
        return vocab[word] if word in vocab else vocab["<unk>"]

    def fit_length(ids):
        # Truncate long sequences, right-pad short ones with the "<pad>" index.
        if len(ids) > max_l:
            return ids[:max_l]
        padding = [vocab["<pad>"]] * (max_l - len(ids))
        return ids + padding

    index_rows = []
    for words in get_tokenized_weibo(data):
        index_rows.append(fit_length([lookup(word) for word in words]))

    features = torch.tensor(index_rows)
    labels = torch.tensor([score for _, score in data])
    return features, labels

def make_weibo_dataset(batch_size=64, max_length=64, min_count=5,
                       train_tag="test_sample", test_tag="test_sample2", data_root=None):
    """Build the train/test data loaders and the vocabulary for the Weibo data.

    @params:
        batch_size: batch size of both data loaders
        max_length: number of word indices kept per review
        min_count: minimum token frequency, forwarded to ``get_vocab_weibo``
        train_tag: file tag (name without ".csv") read as the training split
        test_tag: file tag read as the test split
        data_root: directory holding the CSV files; None leaves ``read_weibo``
            on its own default
    @return: (train_iter, test_iter, vocab); only the training loader shuffles

    NOTE(review): the default tags are the two sample files this function
    previously hard-coded; pass train_tag="training-processed" if the full
    training file is intended - confirm with the data owner.
    """
    reader_kwargs = {} if data_root is None else {"data_root": data_root}
    # Read the raw text data.
    train_data = read_weibo(tag=train_tag, **reader_kwargs)
    test_data = read_weibo(tag=test_tag, **reader_kwargs)
    # Build the vocabulary from the training split only; the test split is
    # indexed with that same vocabulary.
    vocab = get_vocab_weibo(train_data, min_count)
    # preprocess_weibo returns (features, labels); unpack them into the dataset.
    train_set = Data.TensorDataset(*preprocess_weibo(train_data, vocab, max_length))
    test_set = Data.TensorDataset(*preprocess_weibo(test_data, vocab, max_length))
    # Wrap the datasets in iterators.
    train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
    test_iter = Data.DataLoader(test_set, batch_size)

    return train_iter, test_iter, vocab

# Prepare unlabelled text so an already trained model can label it
def make_weibo_testset(file_path, vocab_path, batch_size=64, max_length=64):
    """
    @params:
        file_path: UTF-8 text file with one review per line
        vocab_path: vocabulary file, loaded with ``read_vocab``
        batch_size: batch size of the returned data loader
        max_length: number of word indices kept per review
    @return: (data_iter, vocab); the loader does not shuffle
    """
    # Read the data; every line becomes one sample with the label fixed to 0.
    with open(file_path, 'r', encoding="utf8") as f:
        data = [[line.strip(), 0] for line in f]

    # Load the vocabulary.
    vocab = read_vocab(vocab_path)
    # preprocess_weibo returns the tensors that make up the dataset.
    tensors = preprocess_weibo(data, vocab, max_length)
    data_set = Data.TensorDataset(*tensors)
    # Wrap the dataset in an iterator.
    data_iter = Data.DataLoader(data_set, batch_size)

    return data_iter, vocab

def load_pretrained_embedding(words, pretrained_vocab_path=None, emb_size=100, type="glove"):
    '''
    Build an embedding matrix for ``words`` from pretrained word vectors.

    @params:
        words: the tokens that need vectors, given as an itos (index to
            string) list; row i of the result belongs to words[i]
        pretrained_vocab_path: optionally an already loaded vectors object
            exposing ``stoi`` (token -> row index) and ``vectors`` (2-D
            tensor); it is then used directly. Any other value (including
            None) loads the 100-d GloVe "6B" vectors from the data/glove
            cache directory.
        emb_size: width of the returned matrix
        type: kind of pretrained vectors; only "glove" is supported, any other
            value returns the random initialisation unchanged
    @return:
        embed: tensor of shape (len(words), emb_size). Rows of known words
            hold the pretrained vector, truncated to emb_size or right-padded
            with zeros when the pretrained width differs; rows of unknown
            words keep their N(0, 1) random initialisation.
    '''
    embed = torch.normal(mean=0, std=1, size=(len(words), emb_size))
    if type != "glove":
        return embed

    if hasattr(pretrained_vocab_path, "stoi") and hasattr(pretrained_vocab_path, "vectors"):
        # The caller already loaded the vectors; do not load them again.
        pretrained_vocab = pretrained_vocab_path
    else:
        # Hard-coded to the 100-d GloVe vectors for now. os.path.join keeps
        # the cache path valid on every platform.
        pretrained_vocab = vocab.GloVe(name="6B", dim=100, cache=os.path.join("data", "glove"))

    pretrained_emb_size = pretrained_vocab.vectors.shape[1]
    copy_width = min(pretrained_emb_size, emb_size)
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            oov_count += 1
            continue
        # Always write to row i (the row of the current word).
        embed[i, :copy_width] = pretrained_vocab.vectors[idx][:copy_width]
        if copy_width < emb_size:
            # Pretrained vector is narrower than requested: zero-fill the rest.
            embed[i, copy_width:] = 0
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed
    
    

In [None]:
# Both splits are read from the same small sample file here.
# Variant reading the "training-processed" file for the training split:
# train_data, test_data = read_weibo(tag="training-processed"), read_weibo(tag="test_sample")
train_data = read_weibo(tag="test_sample")
test_data = read_weibo(tag="test_sample")

In [None]:
# Preview the first five samples: label, a tab, then at most 80 characters of text.
for text, label in train_data[:5]:
    print(label, '\t', text[:80])

In [None]:
# Tokenize the first five training samples and show the result.
tokenized_data = get_tokenized_weibo(train_data[:5])
print(tokenized_data[:5])
# len(tokenized_data) is the number of tokenized reviews, not a vocabulary size.
print('# tokenized reviews:', len(tokenized_data))

In [None]:
# Frequency of every token across the tokenized sample reviews.
counter = collections.Counter(tk for st in tokenized_data for tk in st)
print('counter:', counter)

In [None]:
# Build the vocabulary straight from the counts (no special tokens yet).
# A dedicated name avoids rebinding the builtin `dict` in this cell.
raw_vocab = vocab.vocab(counter)
print(raw_vocab.get_itos())

In [None]:
# Vocabulary of the first five samples, including "<pad>" and "<unk>".
# NOTE: the name `dict` shadows the builtin; it is kept because a later cell
# passes it to preprocess_weibo.
min_count = 1
dict = get_vocab_weibo(train_data[:5], min_count=min_count)
vocab_size = len(dict)
print("Vocab size: {0}".format(vocab_size))
print(dict.get_itos())
print(dict.get_stoi())

In [None]:
# Index and pad/truncate the first two samples to five tokens each.
sample_tensors = preprocess_weibo(train_data[:2], dict, max_l=5)
features, lable = sample_tensors

print(features[:2])
print(lable[:2])


In [None]:
# Build the cache path with os.path.join so it is valid on every platform
# (a literal backslash is only a separator on Windows).
cache_dir = os.path.join("data", "glove")
glove_vocab = vocab.GloVe(name='6B', dim=100, cache=cache_dir)


In [None]:
# `vocab` is the torchtext.vocab module and has no get_itos(); the token list
# comes from the Vocab instance stored in `dict` by an earlier cell.
load_pretrained_embedding(dict.get_itos(), glove_vocab)