# 内容
- 利用 fasttext 对语料 text_corpus.txt 训练模型，保存 ft_skipgram_ws5_dim64.bin
- 利用 tensorflow 对 questions.csv 进行word to ids 处理

In [8]:
import os
import pandas as pd
from tensorflow.contrib import learn
import fasttext
import numpy as np
from sklearn.utils import shuffle
import re

In [10]:
def preprocess(x):
    """Normalize a raw question into lowercase, ASCII-only text.

    The steps run in this order: lowercase the text, turn a fixed set of
    punctuation characters into spaces, delete apostrophes, drop every
    non-ASCII character, collapse whitespace runs into one space, and replace
    each run of digits with a number placeholder.

    Args:
        x: The raw text. Any value that is not a ``str`` (for example ``None``
            or a float NaN) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``x`` is not a string. The result is
        lowercased as the very last step, so the placeholder comes out as
        ``<number>``.
    """
    # Explicit guard instead of a bare ``except``: non-string input still maps
    # to "", but unrelated errors and KeyboardInterrupt are no longer hidden.
    if not isinstance(x, str):
        return ""

    # Characters replaced by a space; the apostrophe is deleted outright.
    space_replace_chars = '?:,"[]~*;!(){}@$#.-/'
    table = str.maketrans(space_replace_chars, ' ' * len(space_replace_chars), "'")
    tk_x = x.lower().translate(table)

    # Remove non-ASCII characters.
    tk_x = ''.join(c for c in tk_x if ord(c) < 128)

    # Replace all consecutive whitespace with one space.
    tk_x = re.sub(r'\s+', ' ', tk_x).strip()

    # Replace every run of digits (e.g. "123") with a single placeholder.
    tk_x = re.sub(r'\d+', '<NUMBER>', tk_x)

    # Lowercasing here also lowercases the placeholder to "<number>".
    return tk_x.strip().lower()

def build_corpus(filepath, corpus_path='./data_repository/text_corpus.txt', random_state=None):
    """Load the question pairs, clean them, and write the text corpus.

    Reads the CSV at ``filepath``, keeps the ``question1``, ``question2`` and
    ``is_duplicate`` columns, runs ``preprocess`` on both question columns,
    shuffles the rows and drops duplicate rows. Every cleaned question (all of
    ``question1`` followed by all of ``question2``) is then written to
    ``corpus_path``, one question per line.

    Args:
        filepath: Path of the CSV file holding the question pairs.
        corpus_path: Where the plain-text corpus is written. Defaults to the
            location this function has always used.
        random_state: Optional seed forwarded to ``sklearn.utils.shuffle`` so
            the row order can be reproduced. ``None`` keeps it unseeded.

    Returns:
        The shuffled, de-duplicated DataFrame with the three selected columns.
    """
    selected_cols = ['question1', 'question2', 'is_duplicate']
    # Copy the column subset so the assignments below act on an independent
    # frame and do not trigger pandas' chained-assignment warning.
    similar_items = pd.read_csv(filepath)[selected_cols].copy()
    for col in ('question1', 'question2'):
        similar_items[col] = similar_items[col].apply(preprocess)

    similar_items = shuffle(similar_items, random_state=random_state).drop_duplicates()

    question_list = list(similar_items['question1'])
    question_list.extend(list(similar_items['question2']))

    # Write plain lines rather than DataFrame.to_csv: to_csv would add a header
    # row ("0") and quote empty strings, both of which end up in the corpus.
    os.makedirs(os.path.dirname(corpus_path) or '.', exist_ok=True)
    with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
        corpus_file.writelines('{}\n'.format(question) for question in question_list)

    print('Text corpus generated and persisted successfully.')
    return similar_items

In [11]:
# Train skip-gram fastText embeddings on the preprocessed question corpus and
# save the resulting model.
corpus_file = "./data_repository/text_corpus.txt"
model_file = "./model_siamese_network/ft_skipgram_ws5_dim64.bin"

# The corpus is written by build_corpus(); stop with a clear message when it
# has not been generated yet instead of failing inside fastText.
if not os.path.isfile(corpus_file):
    raise FileNotFoundError(
        "{} not found: run build_corpus() before training the embeddings.".format(corpus_file))

# save_model() needs the target directory to exist.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

embeddings_model = fasttext.train_unsupervised(corpus_file, model='skipgram',
                                               lr=0.1, dim=64,
                                               ws=5, epoch=50)
embeddings_model.save_model(model_file)
print('FastText training finished successfully.')

Text corpus generated and persisted successfully.
FastText training finished successfully.


In [None]:
# Build the cleaned question pairs from the raw CSV.
similar_pairs = build_corpus('./data_repository/questions.csv')
current_index = 0  # start position of the next batch

# The two sides of every pair, as plain Python lists.
input_X, input_Y = (list(similar_pairs[column]) for column in ('question1', 'question2'))

# Word count per question (split on single spaces): first side, second side,
# then both concatenated.
wc_list_x = [len(question.split(' ')) for question in input_X]
wc_list_y = [len(question.split(' ')) for question in input_Y]
wc_list = wc_list_x + wc_list_y

number_of_elements = len(input_X)

In [None]:
def write_metadata(filename, labels):
    """Write labels to a tab-separated metadata file.

    The file starts with an "Index<TAB>Label" header row, followed by one row
    per label holding its zero-based position and the label itself.

    Args:
        filename: Path of the file to create or overwrite.
        labels: Iterable of labels; each one is formatted with ``str.format``.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("Index\tLabel\n")
        f.writelines("{}\t{}\n".format(index, label) for index, label in enumerate(labels))

    print('Metadata file saved in {}'.format(filename))

In [12]:
# Written against tensorflow 1.8. The recorded output of this cell is
# "ModuleNotFoundError: No module named 'tensorflow.contrib'", so it has to run
# in an environment whose TensorFlow still ships tensorflow.contrib.
from tensorflow.contrib import learn

# Build the word -> index mapping:
# 1. Build a vocabulary from the words in the list.
# 2. Number every word in order of appearance (starting at 1; <UNK> gets 0).
# 3. Replace every word of the original list, in order, with its index.
# 4. Documents longer than the maximum length are truncated, shorter ones are padded.
# 5. Convert the result to a list and then to an array.

# These indices are what the embedding layer consumes, which puts the data into
# the format the neural network needs.

# or use a constant like 16, select this parameter based on your understanding of what could be a good choice
max_document_length = 16
# Create the VocabularyProcessor object, setting the max length of the documents.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
full_corpus = []  # every question: all of input_X followed by all of input_Y
full_corpus.extend(input_X)
full_corpus.extend(input_Y)
# Transform the documents using the vocabulary.
full_data = np.asarray(list(vocab_processor.fit_transform(full_corpus)))

# Extract the vocabulary words from the processor's word:id mapping.
vocabulary_words = list(vocab_processor.vocabulary_._mapping)

# One embedding row per vocabulary word, in vocabulary order.
# NOTE(review): downstream code presumably indexes this matrix by word id, so
# no row may be skipped -- confirm against the model code.
embeddings_lookup = []
missing_words = []
for word in vocabulary_words:
    try:
        vector = embeddings_model[str(word)]
    except Exception:
        # Keep the row (as zeros) instead of dropping it, so later rows stay
        # aligned with the vocabulary and with metadata.tsv.
        vector = np.zeros(embeddings_model.get_dimension(), dtype=np.float32)
        missing_words.append(word)
    embeddings_lookup.append(vector)
embeddings_lookup = np.asarray(embeddings_lookup)
if missing_words:
    print('No embedding found for {} word(s); zero vectors used: {}'.format(
        len(missing_words), missing_words[:10]))

vocab_processor.save('./model_siamese_network/vocab')
write_metadata(os.path.join('model_siamese_network', 'metadata.tsv'), vocabulary_words)
print('Vocab processor executed and saved successfully.')

ModuleNotFoundError: No module named 'tensorflow.contrib'

In [None]:
# full_data holds the id rows of input_X first and input_Y second; slice it
# back into the two sides of each pair.
X, Y = full_data[:number_of_elements], full_data[number_of_elements:number_of_elements * 2]
# Duplicate flags, in the same row order as the pairs.
label = [flag for flag in similar_pairs['is_duplicate']]

In [None]:
# get_siamese_batch
# Advance the cursor by one batch of topN rows; each run of this cell moves on
# to the next batch.
topN = 10
last_index, current_index = current_index, current_index + topN
# Rows of the current batch for both sides of the pair.
X[last_index:current_index, :]
Y[last_index:current_index, :]
# Matching labels as a column (one row per pair).
np.expand_dims(label[last_index:current_index], axis=1)
