### Import Libraries

In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [2]:
import re            # 正则匹配
import jieba         # 中文分词
import codecs        # 文件编码转换
import collections   # 统计词频

In [3]:
import torch
import torch.autograd as autograd # torch 中自动计算梯度模块
import torch.nn as nn             # 神经网络模块
import torch.nn.functional as F   # 神经网络模块中的常用功能 
import torch.optim as optim       # 模型优化器模块
import torch.utils.data as Data
torch.manual_seed(1)

<torch._C.Generator at 0x10d616150>

In [41]:
# 导入自定义库
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

In [5]:
!ls ./data

Preliminary-texting.csv dataset.py              [34mtrain[m[m
__init__.py             [34mdev[m[m                     training-inspur.csv
[31mdata.rar[m[m                get_data.sh


### View Dateset

In [6]:
!head ./data/training-inspur.csv

"ROWKEY","COMMCONTENT","COMMLEVEL"
"1080003","普通公园一个只是多了几个泉而已，人不多，适合老人孩子闲逛，买票的话还是贵了，人家说6.30之前进园不用花钱","1"
"1080004","跟儿子在里面玩了一天，非常好！跟儿子在里面玩了一天，非常好！真的很不错哦，有空还要去","1"
"1080005","这已经是第五次来这里玩了。每次孩子都很喜欢，不愿意从水里出来。有机会还会再来。还有比我更忠诚的客户吗？哈哈","1"
"1080006","当天在携程上定的票，打温泉度假村咨询电话和携程客服都说次日生效，但到酒店后，票能用。请客服人员了解清楚再回答咨询问题。不然听信，就得中途掉头回家了。","1"
"1080007","烟台历史的一部分，非常值得推荐去看看！海边景色也很漂亮！","1"
"1080008","周末看看动物亲近亲近大自然挺好的，媳妇儿还跟猴子拍照，猴子满身爬，挺好玩，如果动物再多点就好了，门票小贵","1"
"1080009","五四广场青岛旅游景点必打卡又一地点，标志性红包建筑雕塑矗立在市政府对面亦是海边，还是有点意思的，旁边有游船可出海游玩","1"
"1080010","五四广场坐落在山东省青岛市市南区，在海边，那个红红的火炬就是五四广场的标志。广场上有大片绿地，广场上游玩的人挺多的，聚集火车站不太远。","1"
"1080011","环境好，景色美，值得游览，感觉很好","1"


### Text Preprocessing
- 去除标点符号
- 中文分词
- 去除空格
- 去除缺失项

#### 加载数据集

In [7]:
df_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')

In [8]:
len(df_dataset)

20000

### Prepare Data

In [9]:
COMMCONTENT_SEG = []

for sent in df_dataset['COMMCONTENT']:

    # Extract Sentence
    sent = str(sent).strip()

    sent = clean_str(sent)

    stopwords = [" ","!","...................................................................."]

    seg_list = jieba.cut(sent, cut_all=False)

    seg_list = [i for i in seg_list if i not in stopwords]
    
    COMMCONTENT_SEG.append(" ".join(seg_list))
df_dataset['COMMCONTENT_SEG'] = pd.DataFrame(COMMCONTENT_SEG,columns=['COMMCONTENT_SEG'])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5n/2_by50851fxc4d_snc1d9wf80000gn/T/jieba.cache
Loading model cost 0.724 seconds.
Prefix dict has been built succesfully.


In [10]:
df_dataset = df_dataset[df_dataset['COMMCONTENT_SEG']!=""]
df_dataset = df_dataset.reset_index()

In [11]:
df_dataset.head()

Unnamed: 0,index,ROWKEY,COMMCONTENT,COMMLEVEL,COMMCONTENT_SEG
0,0,1080003,普通公园一个只是多了几个泉而已，人不多，适合老人孩子闲逛，买票的话还是贵了，人家说6.30之...,1,普通 公园 一个 只是 多 了 几个 泉 而已 人不多 适合 老人 孩子 闲逛 买票 的话 ...
1,1,1080004,跟儿子在里面玩了一天，非常好！跟儿子在里面玩了一天，非常好！真的很不错哦，有空还要去,1,跟 儿子 在 里面 玩 了 一天 非常 好 跟 儿子 在 里面 玩 了 一天 非常 好 真的...
2,2,1080005,这已经是第五次来这里玩了。每次孩子都很喜欢，不愿意从水里出来。有机会还会再来。还有比我更忠诚...,1,这 已经 是 第五次 来 这里 玩 了 每次 孩子 都 很 喜欢 不 愿意 从水里 出来 有...
3,3,1080006,当天在携程上定的票，打温泉度假村咨询电话和携程客服都说次日生效，但到酒店后，票能用。请客服人...,1,当天 在 携程 上定 的 票 打 温泉 度假村 咨询电话 和 携程 客服 都 说 次日 生效...
4,4,1080007,烟台历史的一部分，非常值得推荐去看看！海边景色也很漂亮！,1,烟台 历史 的 一部分 非常 值得 推荐 去 看看 海边 景色 也 很漂亮


#### 导出处理后数据集 CSV

In [12]:
if not os.path.exists('./dataset_inspur.csv'):
    print("Downloading...")
    
    df_dataset.to_csv('./dataset_inspur.csv', 
                      sep=",", 
                      columns=['ROWKEY','COMMCONTENT_SEG','COMMLEVEL'],
                      index=False, 
                      encoding='utf-8')
else:
    print("Dataset exists in: ./dataset_inspur.csv")

Dataset exists in: ./dataset_inspur.csv


### Build Vocab

In [147]:
vocab,vocab_freqs = build_vocab(df_dataset['COMMCONTENT_SEG'])

In [14]:
vocab_freqs.most_common(5)

[('的', 33632), ('了', 13661), ('是', 8170), ('很', 7257), ('去', 6813)]

In [15]:
len(vocab_freqs)

33950

In [16]:
BASE_DIR = '/Users/tsw/ScenicSpotReviews'
W2V_DIR = BASE_DIR + '/embeddings/'
TEXT_DATA_DIR = BASE_DIR + '/data/'

MAX_SEQUENCE_LENGTH = 588

MAX_NUM_WORDS = 33950

EMBEDDING_DIM = 300

VALIDATION_SPLIT = 0.2

BATCH_SIZE = 32

In [17]:
vocab_size = min(MAX_NUM_WORDS, len(vocab_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(vocab_freqs.most_common(33950))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v:k for k, v in word2index.items()}

In [18]:
word2index["壁纸"]

30002

In [19]:
index2word[560]

'普通'

In [20]:
def pad_sequences(vectorized_seqs, seq_lengths):
    seq_tensor = torch.zeros((len(vectorized_seqs),max(seq_lengths))).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
    return seq_tensor

In [21]:
#seq_tensor = pad_sequences(vectorized_seqs,seq_lengths)

In [22]:
# Lookup Table
# First, build index mapping words in the embeddings set to their embedding vector
print('Indexing word vectors...')

embeddings_index = {}

with open(os.path.join(W2V_DIR, 'zhihu.vec')) as f:
    all_lines = f.readlines()
    lines = all_lines[1:]
    for line in lines:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors...
Found 150128 word vectors.


### word embedding Lookup

In [23]:
num_words = min(MAX_NUM_WORDS,len(word2index))
num_words

33950

In [24]:
word_embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))

In [25]:
for word, i in word2index.items():
    if i > MAX_NUM_WORDS:
        continue
    
    embedding_vector = embeddings_index.get(str(word))
    
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [26]:
"word_embedding_matrix.shape:",word_embedding_matrix.shape

('word_embedding_matrix.shape:', (33951, 300))

### 训练集/测试集 划分

In [27]:
def split_dataset(X, y, train_ratio=0.8):
    X = np.array(X)
    # seq_lens = np.array(seq_lens)
    data_size = len(X)

    # Shuffle the data
    shuffle_indices = np.random.permutation(np.arange(data_size))
    X, y = X[shuffle_indices], y[shuffle_indices]

    # Split into train and validation set
    train_end_index = int(train_ratio*data_size)
    train_X = X[:train_end_index]
    train_y = y[:train_end_index]

    valid_X = X[train_end_index:]
    valid_y = y[train_end_index:]
    
    return train_X,train_y,valid_X,valid_y

In [28]:
train_X,train_y,valid_X,valid_y = split_dataset(df_dataset['COMMCONTENT_SEG'], 
                                                df_dataset['COMMLEVEL'], 
                                                train_ratio=0.8)

In [29]:
train_X.shape

(15995,)

In [30]:
# 将 words 转换为
def text_to_seqs(texts):
    seqs = []
    for text in texts:
        words = []
        text = text.strip().split(" ")
        for word in text:
            words.append(word2index[word])
        seqs.append(words)
    return seqs

# 将 words 转换为
def text_to_sequences(vectorized_seqs, seq_lengths):
    seqs_tensor = torch.zeros((len(vectorized_seqs),max(seq_lengths))).long()
#     print(seqs_tensor.shape)
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seqs_tensor[idx, :seq_len] = torch.LongTensor(seq)
    return seqs_tensor

In [31]:
# 
def pad_sequences(vectorized_seqs, seq_lengths):
    seq_tensor = torch.zeros((len(vectorized_seqs),max(seq_lengths))).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)
    return seq_tensor

In [32]:
train_X_vectorized_seqs = text_to_seqs(train_X)
valid_X_vectorized_seqs = text_to_seqs(valid_X)

In [33]:
len(train_X_vectorized_seqs),len(valid_X_vectorized_seqs)

(15995, 3999)

In [34]:
train_X_seq_lengths = [len(i) for i in train_X_vectorized_seqs]
valid_X_seq_lengths = [len(i) for i in valid_X_vectorized_seqs]

In [35]:
len(train_X_seq_lengths),len(valid_X_seq_lengths)

(15995, 3999)

In [36]:
train_X_tensor = text_to_sequences(train_X_vectorized_seqs,train_X_seq_lengths)

In [37]:
valid_X_tensor = text_to_sequences(valid_X_seq_lengths,valid_X_seq_lengths)

In [38]:
train_y_tensor = torch.from_numpy(np.array(train_y.tolist()))

In [39]:
torch_dataset = Data.TensorDataset(train_X_tensor,train_y_tensor)

In [40]:
# 把 dataset 放入 DataLoader
data_loader = Data.DataLoader(
    dataset=torch_dataset,      # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # 要不要打乱数据 (打乱比较好)
    num_workers=2,              # 多线程来读数据
)

### 模型定义

In [42]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [136]:
BASE_DIR = '/Users/tsw/ScenicSpotReviews'

W2V_DIR = BASE_DIR + '/embeddings/'

TEXT_DATA_DIR = BASE_DIR + '/data/'

MAX_SEQUENCE_LENGTH = 588

MAX_NUM_WORDS = 33950

EMBEDDING_DIM = 300

VALIDATION_SPLIT = 0.2

HIDDEN_DIM=64


BATCH_SIZE = 32

#### 1.LSTM

In [146]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        # word_embeddings layer
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [145]:
labels = ["1","2","3"]
label_to_ix = {"1": 0, "2": 1, "3": 2}

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2index), len(label_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[0][0], word2index)
#     label_scores = model(inputs)
#     print(label_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, labels in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        
#         print(sentence)

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word2index)
        print(sentence_in)
        #labels = prepare_sequence(labels, label_to_ix)
        labels = torch.tensor([1,2,3], dtype=torch.long)

        # Step 3. Run our forward pass.
        labels_scores = model(sentence_in)
        
        print(labels_scores)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(labels_scores, labels)
        loss.backward()
        optimizer.step()

# See what the scores are after training
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[0][0], word2index)
#     labels_scores = model(inputs)

#     # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#     # for word i. The predicted tag is the maximum scoring tag.
#     # Here, we can see the predicted sequence below is 0 1 2 0 1
#     # since 0 is index of the maximum value of row 1,
#     # 1 is the index of maximum value of row 2, etc.
#     # Which is DET NOUN VERB DET NOUN, the correct sequence!
#     print(tag_scores)

tensor([   353,   2616,     24,   2050,      8,   3675,      2,     29,
           177,     54,     68,   4746,      9,  11463,     10,     78,
             3,    362,    926,     24,     59,    528,    183,     48,
            30,     67,    452])
tensor([[-1.2748, -1.0953, -0.9517],
        [-1.2725, -1.1569, -0.9029],
        [-1.3127, -0.9682, -1.0466],
        [-1.2592, -0.9199, -1.1471],
        [-1.2245, -0.9501, -1.1413],
        [-1.2281, -0.9344, -1.1574],
        [-1.0572, -1.1701, -1.0723],
        [-1.2318, -0.9712, -1.1099],
        [-1.1322, -1.1616, -1.0087],
        [-1.2227, -1.1217, -0.9681],
        [-1.1414, -1.1219, -1.0357],
        [-1.1147, -1.0592, -1.1231],
        [-1.1376, -1.0471, -1.1133],
        [-1.0497, -0.9777, -1.2955],
        [-1.1428, -1.0444, -1.1112],
        [-1.1097, -0.9987, -1.1974],
        [-1.2452, -0.9770, -1.0917],
        [-1.3424, -0.9859, -1.0060],
        [-1.2913, -1.0178, -1.0114],
        [-1.4784, -0.8560, -1.0580],
        [-1

ValueError: Expected input batch_size (27) to match target batch_size (3).

In [117]:
class LSTMClassifier(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
 
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
 
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    
        self.lstm = nn.LSTM(
            input_size = 300,
            hidden_size = 64,
            num_layers = 1,
            batch_first = False
        )
        
        self.hidden2label = nn.Linear(64,3)
 
#         self.hidden2label = nn.Linear(hidden_dim, label_size)
    
        self.hidden = self.init_hidden()
 
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))
 
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        label_space = self.hidden2label(lstm_out.view(len(sentence), -1))
        label_scores = F.log_softmax(label_space,dim=None)
#         label_scores = nn.LogSoftmax(dim=label_space)
        return label_scores

In [114]:
# model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, len(word2inddx), len(tag_to_ix))
model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, len(word2index), 3)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [115]:
training_data = []
for sent,label in list(zip(train_X,train_y)):
    training_data.append((sent.split(" "),label))

In [116]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    model.zero_grad()
    inputs = prepare_sequence(training_data[0][0], word2index)
    label_scores = model(inputs)
    print(label_scores)

tensor([[-1.1190, -1.0934, -1.0838],
        [-0.9497, -1.2108, -1.1546],
        [-0.9388, -1.1502, -1.2298],
        [-0.9333, -1.1615, -1.2251],
        [-0.9641, -1.2443, -1.1071],
        [-0.9483, -1.2474, -1.1228],
        [-0.9744, -1.3424, -1.0178],
        [-0.9813, -1.2142, -1.1140],
        [-1.0991, -1.0904, -1.1064],
        [-1.0631, -1.0701, -1.1658],
        [-1.1996, -1.1411, -0.9697],
        [-0.8917, -1.1714, -1.2726],
        [-1.0936, -1.3022, -0.9338],
        [-1.1101, -1.1369, -1.0508],
        [-0.9960, -1.0973, -1.2144],
        [-0.9800, -1.0846, -1.2494],
        [-1.0642, -1.2958, -0.9642],
        [-1.0611, -1.2229, -1.0229],
        [-1.0389, -1.1774, -1.0844],
        [-0.9874, -1.1101, -1.2109],
        [-0.9401, -1.1184, -1.2637],
        [-0.9727, -1.2311, -1.1088],
        [-1.0073, -1.2971, -1.0176],
        [-0.9683, -1.3388, -1.0268],
        [-1.2060, -1.3781, -0.8017],
        [-1.2453, -1.3628, -0.7848],
        [-1.1418, -1.0745, -1.0809]])




In [94]:
for epoch in range(100):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, labels in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word2index)
        # targets = prepare_sequence(labels, label_to_ix)
        labels = torch.tensor([1,2,3], dtype=torch.long)

        # Step 3. Run our forward pass.
        label_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(label_scores, labels)
        loss.backward()
        optimizer.step()



ValueError: Expected input batch_size (27) to match target batch_size (3).

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMClassifier, self).__init__()
        
        self.lstm = nn.LSTM(
            input_size = 32,
            hidden_size = 64,
            num_layers = 1,
            batch_first = True
        )
        
        self.out = nn.Linear(64,3)
        
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim

        # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.word_embedding = nn.Embedding.from_pretrained(torch.from_numpy(word_embedding_matrix))

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        self.hidden = self.init_hidden()
    
    
    def forward(self, x):
        r_out, (h_n, h_c) = self.lstm(x, None)
        out = self.out(r_out[:,-1,:])
        return out

In [None]:
lstm = LSTM()

In [None]:
print(lstm)

In [None]:
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.01)

In [None]:
loss_func = nn.CrossEntropyLoss()

In [None]:
for epoch in range(100):
    for step, (x,y) in enumerate(train_loader):
        

In [None]:
embedding = nn.Embedding.from_pretrained(torch.from_numpy(word_embedding_matrix))

In [None]:
#     def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
#         super(LSTMClassifier, self).__init__()
#         self.hidden_dim = hidden_dim

#         # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
#         self.word_embedding = nn.Embedding.from_pretrained(torch.from_numpy(word_embedding_matrix))

#         # The LSTM takes word embeddings as inputs, and outputs hidden states
#         # with dimensionality hidden_dim.
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim)

#         # The linear layer that maps from hidden state space to tag space
#         self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
#         self.hidden = self.init_hidden()

In [None]:
EMBEDDING_DIM,HIDDEN_DIM,len(word2index),len(labels)

In [None]:
train_X,train_y,valid_X,valid_y

In [None]:
list(zip(train_X,train_y))

In [None]:
training_data = []
for sent,label in list(zip(train_X,train_y)):
    training_data.append((sent.split(" "),label))

In [None]:
training_data[0]

In [None]:
prepare_sequence(training_data, word2index)

In [None]:
train_dataset = Data.TensorDataset(train_X,train_y)

In [None]:
train_dataset = Data.TensorDataset(train_X_tensor,train_y_tensor)

In [None]:
train_dataset

In [None]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    model.zero_grad()
    inputs = prepare_sequence(training_data[0][0], word2index)
#     print(inputs)
    label_scores = model(inputs)
    print(label_scores)

In [None]:
for epoch in range(10):  # again, normally you would NOT do 300 epochs, it is toy data
    for step, (batch_x, batch_y) in enumerate(data_loader):  # 每一步 loader 释放一小批数据用来学习
#         print(batch_x.shape)
        batch_x = batch_x.view(-1, 1, 588)
#         print(batch_x.shape)
        # for sentence, labels in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
#         sentence_in = prepare_sequence(batch_x, word2index)
        # targets = prepare_sequence(labels, label_to_ix)
        labels = torch.tensor([1,2,3], dtype=torch.long)

        # Step 3. Run our forward pass.
        label_scores = model(batch_x)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(label_scores, labels)
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(100):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, labels in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word2index)
        # targets = prepare_sequence(labels, label_to_ix)
        labels = torch.tensor([1,2,3], dtype=torch.long)

        # Step 3. Run our forward pass.
        label_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(label_scores, labels)
        loss.backward()
        optimizer.step()

In [None]:
torch.tensor([1,2,3], dtype=torch.long).view(1,3)

In [None]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [None]:
seq = ['普通','公园','一个','只是','多', '了', '几个', '泉', '而已', '人不多', '适合','老人', '孩子', '闲逛', '买票']

In [None]:
pseq = prepare_sequence(seq, word2index)

In [None]:
pseq[0]

In [None]:
# input = torch.LongTensor(pseq)
embedding(torch.LongTensor(pseq))

In [None]:
one = pd.read_csv('./data/train/1_train.txt', encoding='utf-8',header=None,names=['COMMLEVEL','COMMCONTENT'])
two = pd.read_csv('./data/train/2_train.txt', encoding='utf-8',header=None,names=['COMMLEVEL','COMMCONTENT'])
three = pd.read_csv('./data/train/3_train.txt', encoding='utf-8',header=None,names=['COMMLEVEL','COMMCONTENT'])
train = pd.concat([one,two,three],ignore_index=True)

In [None]:
all_dataset = pd.concat([train,test],ignore_index=True)

In [None]:
def load_data_and_labels():

    # Load the data
    positive_examples = list(codecs.open("data/rt-polaritydata/rt-polarity.pos", 'r', 'utf-8').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    positive_examples = [get_tokens(clean_str(sent)) for sent in positive_examples]
    negative_examples = list(open("data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    negative_examples = [get_tokens(clean_str(sent)) for sent in negative_examples]
    X = positive_examples + negative_examples

    # Labels
    positive_labels = [[0,1] for _ in positive_examples]
    negative_labels = [[1,0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    print "Total: %i, NEG: %i, POS: %i" % (len(y), np.sum(y[:, 0]), np.sum(y[:, 1]))

    return X, y

In [None]:
positive_examples = list(codecs.open("./data/train/1_train.txt", 'r', 'utf-8').readlines())
positive_examples = [s.strip() for s in positive_examples]

# positive_examples = [get_tokens(clean_str(sent)) for sent in positive_examples]

In [None]:
positive_examples

In [None]:
positive_labels = [[1,0,0] for _ in positive_examples]
positive_labels

In [None]:
X = np.empty(num_recs,dtype=list)
y = np.zeros(num_recs)
i=0
with open('./data/training-inspur.csv','r+') as f:
    for line in f:
        _,label, sentence = line.strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        seqs = []
        for word in words:
            if word in word2index:
                seqs.append(word2index[word])
            else:
                seqs.append(word2index["UNK"])
        X[i] = seqs
        y[i] = int(label)
        i += 1
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)

def pad_sequences(vectorized_seqs, seq_lengths):
    pass

In [None]:
"word_embedding_matrix.shape:",word_embedding_matrix.shape

In [None]:
for word, i in word2index.items():
    if i > MAX_NB_WORDS:
        continue
    
    embedding_vector = embeddings_index.get(str(word))
    
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [None]:
### jia

In [None]:
embedding = nn.Embedding.from_pretrained(torch.from_numpy(word_embedding_matrix))

In [None]:
embedding

In [None]:
input = torch.LongTensor([3])
embedding(input)

In [None]:
with codecs.open('./data/training-inspur.csv','r', 'utf-8') as f:
    all_lines = f.readlines()
    
    # 跳过第一行 Header
    for line in all_lines[1:]:
        # Extract Sentence
        _key, sentence, label = line.strip().split('","')
        
        sentence = clean_str(sentence)
        
        stopwords = [" ","!","...................................................................."]
        
        seg_list = jieba.cut(sentence, cut_all=False)
        
        seg_list = [i for i in seg_list if i not in stopwords]
        
        length.append(len(seg_list))
        
        word_list.append(seg_list)
        
        if len(seg_list) > max_length:
            max_length = len(seg_list)
        
        for word in seg_list:
            word_freqs[word] += 1
        num_recs += 1

In [None]:
## EDA 
length = []
max_length = 0
word_freqs = collections.Counter()
word_list = []
num_recs = 0
with codecs.open('./data/training-inspur.csv','r', 'utf-8') as f:
    all_lines = f.readlines()
    # 跳过第一行
    for line in all_lines[1:]:
        # Extract Sentence
        _key, sentence, label = line.strip().split('","')
        
        sentence = clean_str(sentence)
        
        stopwords = [" ","!","...................................................................."]
        
        seg_list = jieba.cut(sentence, cut_all=False)
        
        seg_list = [i for i in seg_list if i not in stopwords]
        
        length.append(len(seg_list))
        
        word_list.append(seg_list)
        
        if len(seg_list) > max_length:
            max_length = len(seg_list)
        
        for word in seg_list:
            word_freqs[word] += 1
        num_recs += 1
        
print('max_length ',max_length)
print('nb_words ', len(word_freqs))
import matplotlib.pyplot as plt
plt.hist(length)

In [141]:
embeds = nn.Embedding(len(word2index), 50)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.tensor([word2index["你"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 1.6241, -1.1140,  0.2834, -0.1117,  1.4462,  1.6221,  0.9165,
          0.9102, -0.8726,  1.2840,  0.4811, -2.1449, -0.7621, -0.3971,
          0.1165, -0.2806, -1.1074, -0.3137, -0.7684, -0.4783, -0.2467,
          0.6146, -0.5697,  1.0643,  1.8119,  0.2465, -0.8759, -0.0283,
          0.1164, -0.5778,  1.4203, -1.6684,  0.9633,  2.5660, -0.8036,
         -0.0420,  0.0867,  0.1872,  0.2350,  0.5459, -0.1542, -1.6307,
         -2.0396,  0.3050, -1.0852,  0.0199,  0.3821,  1.3308,  0.6283,
          0.6861]])


In [142]:
word2index["你"]

60

In [143]:
embeds

Embedding(33952, 50)