<a href="https://colab.research.google.com/github/XuRui314/HITSZ_2022_NLP_Project/blob/main/1_3_Gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1.3 Gensim

Gensim 是一个开源的 Python 库，可以将文档表示为语义向量。

官网：https://radimrehurek.com/gensim/

- Word2vec
- FastText
- TF-IDF, LSA, LDA

思考：为什么要把词表示为向量？


In [None]:
import gensim
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# [
#     'a':[1.2,,,],
# 'b':[1,3,....]
# ]
def transfer(gloveFile, word2vecFile):
    """Convert a GloVe vector file into word2vec format.

    Thin wrapper that forwards both paths to gensim's ``glove2word2vec``
    and discards its return value.

    Args:
        gloveFile: Path of the GloVe file to read.
        word2vecFile: Path of the word2vec-format file to produce.
    """
    glove2word2vec(
        glove_input_file=gloveFile,
        word2vec_output_file=word2vecFile,
    )

transfer('./glove.6B.50d.txt','./glove.6B.50d_w2v_format.txt')
# GloVe and word2vec use different file formats, so convert first.

  glove2word2vec(gloveFile, word2vecFile)


In [None]:
# Load the pre-trained word vectors (text format, hence binary=False).
glove_vectors = KeyedVectors.load_word2vec_format('./glove.6B.50d_w2v_format.txt', binary=False)

In [None]:
# Show the words most similar to 'twitter'.
glove_vectors.most_similar('twitter')

[('facebook', 0.9333045482635498),
 ('myspace', 0.8801369667053223),
 ('youtube', 0.8430657982826233),
 ('blog', 0.8262057304382324),
 ('blogs', 0.8064824342727661),
 ('blogging', 0.7970671057701111),
 ('tumblr', 0.7901089787483215),
 ('email', 0.778261125087738),
 ('tweets', 0.7604537010192871),
 ('e-mail', 0.7538726925849915)]

In [None]:
# Show the word vector of 'computer'.
glove_vectors['computer']

array([ 0.079084, -0.81504 ,  1.7901  ,  0.91653 ,  0.10797 , -0.55628 ,
       -0.84427 , -1.4951  ,  0.13418 ,  0.63627 ,  0.35146 ,  0.25813 ,
       -0.55029 ,  0.51056 ,  0.37409 ,  0.12092 , -1.6166  ,  0.83653 ,
        0.14202 , -0.52348 ,  0.73453 ,  0.12207 , -0.49079 ,  0.32533 ,
        0.45306 , -1.585   , -0.63848 , -1.0053  ,  0.10454 , -0.42984 ,
        3.181   , -0.62187 ,  0.16819 , -1.0139  ,  0.064058,  0.57844 ,
       -0.4556  ,  0.73783 ,  0.37203 , -0.57722 ,  0.66441 ,  0.055129,
        0.037891,  1.3275  ,  0.30991 ,  0.50697 ,  1.2357  ,  0.1274  ,
       -0.11434 ,  0.20709 ], dtype=float32)

### 下面使用上文加载的预训练词向量进行情感分类

In [None]:
from nltk.corpus import movie_reviews
import random
# Seed the global RNG so the random.shuffle in load_movie_reviews is repeatable.
random.seed(42)


def load_movie_reviews(train_size=1600):
    """Load the NLTK movie-review corpus and split it into train and test sets.

    Reviews in the 'pos' category are labelled 'positive' and those in the
    'neg' category 'negative'. The labelled reviews are shuffled with the
    module-level ``random`` generator, so seed it beforehand if a
    reproducible split is needed.

    Args:
        train_size: Number of shuffled reviews placed in the training set;
            the remainder forms the test set. Defaults to 1600.

    Returns:
        A ``(train_reviews, test_reviews)`` tuple; each element is a list of
        ``(raw_text, label)`` pairs.

    Raises:
        ValueError: If ``train_size`` is negative.
    """
    if train_size < 0:
        # A negative slice bound would silently move reviews to the wrong split.
        raise ValueError('train_size must be non-negative')

    all_reviews = [
        (movie_reviews.raw(file_id), 'positive')
        for file_id in movie_reviews.fileids('pos')
    ]
    all_reviews.extend(
        (movie_reviews.raw(file_id), 'negative')
        for file_id in movie_reviews.fileids('neg')
    )

    random.shuffle(all_reviews)
    train_reviews = all_reviews[:train_size]
    test_reviews = all_reviews[train_size:]

    return train_reviews, test_reviews

# Build the train/test split and print the size of each part.
train_reviews, test_reviews = load_movie_reviews()
print('train:', len(train_reviews))
print('test:', len(test_reviews))

train: 1600
test: 400


In [None]:
from nltk import word_tokenize
import numpy as np

# Represent a text by the average of the word vectors of its tokens.
def convert_text_to_vector(text, vectors):
    """Return the mean word vector of the tokens of ``text`` found in ``vectors``.

    Args:
        text: Raw text; it is split into tokens with ``word_tokenize``.
        vectors: Embedding lookup supporting ``in``, ``[]`` and ``vector_size``.

    Returns:
        An array of length ``vectors.vector_size``. It is all zeros when no
        token of ``text`` is present in ``vectors``.
    """
    # Start from a zero vector and add each known token's embedding in order.
    total = np.zeros(vectors.vector_size)
    known = [vectors[token] for token in word_tokenize(text) if token in vectors]
    for embedding in known:
        total = total + embedding

    # Only divide when at least one token contributed (avoids dividing by zero).
    if known:
        total = total / len(known)
    return total

In [None]:
def build_X_y(feature_extractor,reviews, vectors):
    """Turn labelled reviews into parallel feature and label lists.

    Args:
        feature_extractor: Callable invoked as ``feature_extractor(review, vectors)``.
        reviews: Iterable of ``(review, polarity)`` pairs.
        vectors: Passed through unchanged to ``feature_extractor``.

    Returns:
        ``(X, Y)`` where ``X`` holds the extracted features and ``Y`` holds
        0 for polarity ``'negative'`` and 1 for anything else.
    """
    # Consume `reviews` exactly once so one-shot iterables work as well.
    encoded = [
        (feature_extractor(review, vectors), 0 if polarity == 'negative' else 1)
        for review, polarity in reviews
    ]
    X = [features for features, _ in encoded]
    Y = [label for _, label in encoded]
    return X, Y


In [None]:
# Featurize both splits with the mean-of-word-vectors extractor.
X_train, y_train = build_X_y(convert_text_to_vector,train_reviews, glove_vectors)
X_test, y_test = build_X_y(convert_text_to_vector,test_reviews, glove_vectors)

In [None]:
# Inspect the feature vector of the first test review.
print(X_test[0])

[ 2.82499822e-01  1.49704999e-01 -1.70585516e-01 -1.42463931e-01
  3.47442902e-01  2.69994009e-01 -3.28432874e-01 -8.50855218e-02
 -2.36472232e-01 -2.44773272e-02  2.77545260e-02  9.46451616e-02
 -3.13645131e-01 -7.34175905e-02  4.54961444e-01  5.62172322e-02
  3.91319665e-02  2.88562016e-02 -4.35868673e-01 -2.42483748e-01
 -8.23339735e-03  2.83226109e-01  2.35443052e-01  2.09568062e-03
  2.50511249e-01 -1.39674038e+00 -3.81673463e-01  8.58117964e-02
  1.93260298e-01 -1.87363802e-01  2.91190987e+00 -1.57550570e-02
 -1.11970683e-01 -3.05982004e-01  9.22510949e-02  3.67293589e-02
  8.53569318e-02  1.76745555e-01  1.07471380e-01 -1.24025612e-01
 -1.07960290e-01  2.17176207e-01  6.30604254e-02 -5.81833029e-02
 -1.09076334e-01  8.35401807e-02 -5.80367818e-02 -2.57382441e-01
 -2.68450526e-02  3.43150741e-02]


In [None]:
from sklearn.svm import LinearSVC
#线性SVM

def train_and_test(X_train, y_train, X_test, y_test, classifier=None):
    """Fit a classifier on the training split and print its test accuracy.

    Args:
        X_train: Training feature vectors.
        y_train: Training labels.
        X_test: Test feature vectors.
        y_test: Test labels.
        classifier: Optional unfitted estimator exposing ``fit(X, y)`` and
            ``score(X, y)``. When omitted, a fresh ``LinearSVC()`` is used,
            which is the original behaviour.

    Returns:
        The fitted classifier.
    """
    if classifier is None:
        classifier = LinearSVC()

    classifier.fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)
    print(f'accuracy is {accuracy:.4f}')

    return classifier

In [None]:
# Fit and evaluate on whatever features X_train/X_test currently hold.
train_and_test(X_train, y_train, X_test, y_test)

accuracy is 0.7050


LinearSVC()

思考：进一步的改进

1.优化特征提取方式？


2.使用更优秀的分类器？

In [None]:
# Represent a text by the per-dimension maximum of its tokens' word vectors.
def convert_text_to_vector_max(text, vectors):
    """Return the element-wise maximum over the word vectors of ``text``.

    Args:
        text: Raw text; it is split into tokens with ``word_tokenize``.
        vectors: Embedding lookup supporting ``in``, ``[]`` and ``vector_size``.

    Returns:
        An array of length ``vectors.vector_size`` holding, for each
        dimension, the largest value among the tokens found in ``vectors``.
        When no token is found, a zero vector is returned, matching the
        fallback of the averaging extractors in this notebook.
    """
    found = [vectors[word] for word in word_tokenize(text) if word in vectors]
    if not found:
        # Previously this case crashed calling .max() on None.
        return np.zeros(vectors.vector_size)

    # Stack once into a (num_tokens, dim) matrix instead of re-concatenating
    # the growing matrix for every token.
    return np.stack(found, axis=0).max(axis=0)

In [None]:
# Demo: np.expand_dims(..., axis=0) adds a leading axis, so shape (3,) becomes (1, 3).
a = np.array([1,2,3])
print(a.shape)
a = np.expand_dims(a, axis=0)
a.shape 

(3,)


(1, 3)

In [None]:
# Rebuild the features with the element-wise-max extractor.
# Note: this overwrites the X_train/X_test produced by earlier cells.
X_train, y_train = build_X_y(convert_text_to_vector_max,train_reviews, glove_vectors)
X_test, y_test = build_X_y(convert_text_to_vector_max,test_reviews, glove_vectors)

In [None]:
# Check the dimensionality of one feature vector.
print(len(X_train[0]))

50


In [None]:
train_and_test(X_train, y_train, X_test, y_test)
# Did not converge; this approach has a problem.

accuracy is 0.6100




LinearSVC()

In [None]:
from nltk.corpus import opinion_lexicon
# Load the sentiment lexicon as sets for fast membership tests.
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# Only sentiment words are taken into account.

def convert_text_to_vector_senti(text, vectors):
    """Return the mean word vector of the sentiment-lexicon tokens of ``text``.

    A token contributes only if it is present in ``vectors`` and appears in
    ``positive_words`` or ``negative_words``.

    Args:
        text: Raw text; it is split into tokens with ``word_tokenize``.
        vectors: Embedding lookup supporting ``in``, ``[]`` and ``vector_size``.

    Returns:
        An array of length ``vectors.vector_size``; all zeros when no token
        qualifies.
    """
    total = np.zeros(vectors.vector_size)
    hits = 0
    for token in word_tokenize(text):
        if token not in vectors:
            continue
        if token not in positive_words and token not in negative_words:
            continue
        total = total + vectors[token]
        hits += 1

    # Leave the zero vector untouched when nothing matched.
    return total / hits if hits > 0 else total

In [None]:


def convert_text_to_vector_sum(text, vectors):
    """Return the sum of the word vectors of the sentiment-lexicon tokens.

    A token contributes only if it is present in ``vectors`` and appears in
    ``positive_words`` or ``negative_words``. Unlike the averaging variant,
    the sum is not divided by the number of contributing tokens.

    Args:
        text: Raw text; it is split into tokens with ``word_tokenize``.
        vectors: Embedding lookup supporting ``in``, ``[]`` and ``vector_size``.

    Returns:
        An array of length ``vectors.vector_size``; all zeros when no token
        qualifies.
    """
    total = np.zeros(vectors.vector_size)
    sentiment_embeddings = (
        vectors[token]
        for token in word_tokenize(text)
        if token in vectors and (token in positive_words or token in negative_words)
    )
    for embedding in sentiment_embeddings:
        total = total + embedding
    return total

In [None]:
# Rebuild the features from sentiment-lexicon words only (averaged).
# Note: this overwrites the X_train/X_test produced by earlier cells.
X_train, y_train = build_X_y(convert_text_to_vector_senti,train_reviews, glove_vectors)
X_test, y_test = build_X_y(convert_text_to_vector_senti,test_reviews, glove_vectors)

In [None]:
# NOTE(review): the saved output shows two accuracies (0.7525 and 0.5550);
# presumably the second came from a re-run with convert_text_to_vector_sum - confirm.
train_and_test(X_train, y_train, X_test, y_test)

accuracy is 0.7525


LinearSVC()

accuracy is 0.5550




LinearSVC()

其他优化特征表示的方式：


1.使用更高维度的词向量，100，200，300？


2.使用预训练更加充分的词向量，在更大语料库上训练得到的词向量


3.使用其他方式对词向量进行处理，而不只是简单的平均。例如：使用 CNN（多种尺寸的卷积核可以捕捉不同尺度的词汇间信息），再加上 RNN 对词向量序列进行编码以获得句意信息

3.1 再对RNN进行优化？LSTM，BiLSTM?


#### Transformer

参考资料
- http://jalammar.github.io/illustrated-transformer/
- http://nlp.seas.harvard.edu/2018/04/03/attention.html
- https://arxiv.org/abs/1706.03762

下周：


### 1. PyTorch

官网：https://pytorch.org/

- tensor
- 自动求导
- 深度学习流程
    - 加载数据
    - 建立模型
    - 选择优化器
    - 训练
    - 保存和加载模型

参考：
- https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html
- https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html

### 2.NER任务baseline

- 弱baseline：BiLSTM+CRF

- 强baseline：BERT+CRF


