# Train a word2vec model on a given corpus

In [1]:
import os
import sys

# Set the working directory to the code's root directory.
# The change is guarded so that re-running this cell does not climb one
# directory further each time: the root is recognised by its `src` folder,
# which the module search paths below rely on.
if not os.path.isdir("src"):
    os.chdir("../")

# Make the project modules importable. Each path is inserted at the front of
# sys.path (same order as before) and skipped if it is already present, so
# repeated execution of this cell does not pile up duplicates.
for modulePath in ('src/train/word2vec', 'src/preprocess', 'src/test'):
    if modulePath not in sys.path:
        sys.path.insert(0, modulePath)

import trainWord2Vec
import preprocess
import biastest
import gensim

Using TensorFlow backend.


In [3]:
# Parameters for training the model

# -- Vectors and vocabulary --
size = 300     # Dimensionality of the resulting vectors
window = 5     # Size of the context window
minCount = 20  # Minimal number of occurrences in the corpus required for a word to be included in the model

# -- Training objective --
# NOTE(review): 1 presumably selects Skip-Gram and 0 CBOW (gensim's `sg` convention) -- confirm in trainWord2Vec
skipGram = 1
# NOTE(review): 1 presumably enables hierarchical softmax -- confirm in trainWord2Vec
hierarchical = 1
negative = 20  # Number of negative samples to use

# -- Parallelism --
threads = 3    # Number of threads, used for both preprocessing and training

In [6]:
# The corpus file is expected in data/raw/ and must contain one sentence per line.
corpusPath = "news.2007.de.shuffled"  # Change this line to use a different corpus

# Every pipeline stage reuses the corpus file name inside its own directory.
raw = 'data/raw/{}'.format(corpusPath)                 # input: raw corpus
preprocessed = 'data/processed/{}'.format(corpusPath)  # output of preprocessing
modelPath = 'models/word2vec/{}'.format(corpusPath)    # word2vec model location
resultPath = 'results/word2vec/{}'.format(corpusPath)  # bias test results

In [11]:
# Preprocess the corpus (replacing Umlauts, removing special characters etc.).
# The raw file is read from `raw` and the result goes to `preprocessed`;
# work is done in batches of 100000 using `threads` workers.
preprocess.chunkwisePreprocessing(
    raw,
    preprocessed,
    batchsize=100000,
    workers=threads,
)

Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..
Starting preprocessing of new batch..


In [14]:
# Train word2vec model
# The preprocessed corpus file (one sentence per line) is wrapped in gensim's
# LineSentence and handed to the project's training helper together with the
# parameters configured above. `modelPath` is the second positional argument
# (presumably where the trained model is stored -- confirm in trainWord2Vec).
sentences = gensim.models.word2vec.LineSentence(preprocessed)
trainWord2Vec.trainGivenCorpus(
    sentences,
    modelPath,
    vectorDim=size,
    windowSize=window,
    mincount=minCount,
    nWorkers=threads,
    skipgram=skipGram,
    hierarchicalSampling=hierarchical,
    negativeSampling=negative,
)

2017-07-18 18:22:15,310 : INFO : collecting all words and their counts
2017-07-18 18:22:15,312 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-07-18 18:22:15,407 : INFO : PROGRESS: at sentence #10000, processed 152413 words, keeping 31119 word types
2017-07-18 18:22:15,503 : INFO : PROGRESS: at sentence #20000, processed 306128 words, keeping 49236 word types
2017-07-18 18:22:15,586 : INFO : PROGRESS: at sentence #30000, processed 459412 words, keeping 64211 word types
2017-07-18 18:22:15,665 : INFO : PROGRESS: at sentence #40000, processed 613688 words, keeping 77483 word types
2017-07-18 18:22:15,755 : INFO : PROGRESS: at sentence #50000, processed 768084 words, keeping 89454 word types
2017-07-18 18:22:15,842 : INFO : PROGRESS: at sentence #60000, processed 920948 words, keeping 100059 word types
2017-07-18 18:22:15,922 : INFO : PROGRESS: at sentence #70000, processed 1074267 words, keeping 110145 word types
2017-07-18 18:22:16,003 : INFO : PROGRESS: 

In [7]:
# Run bias tests
# test = "base" runs our own German tests
# test = "par" runs our German translation of the tests Caliskan et al. used
# test = "en" runs Caliskan et al.'s English tests

# Create the result directory before starting the tests: the p-value
# computation runs through up to 1000000 steps, and a missing directory would
# otherwise only surface as an error when the result is written afterwards.
resultDir = os.path.dirname(resultPath)
if resultDir and not os.path.isdir(resultDir):
    os.makedirs(resultDir)

biasTestResult = biastest.runBiasTests(modelPath, test = "base")

# Append (mode 'a') so results of earlier runs in the same file are kept.
with open(resultPath, 'a') as outFile:
    outFile.write(str(biasTestResult))
    outFile.write('\n')

2017-07-19 00:44:35,257 : INFO : loading projection weights from models/word2vec/news.2007.de.shuffled
2017-07-19 00:44:50,793 : INFO : loaded (62702, 300) matrix from models/word2vec/news.2007.de.shuffled
starting testing for data/external/iats/de/flowers-insects.txt with coverage [0.12, 0.36, 1.0, 1.0]
Starting p-value computation 0 / 1000000
Starting p-value computation 10000 / 1000000
Starting p-value computation 20000 / 1000000
Starting p-value computation 30000 / 1000000
Starting p-value computation 40000 / 1000000
Starting p-value computation 50000 / 1000000
Starting p-value computation 60000 / 1000000
Starting p-value computation 70000 / 1000000
Starting p-value computation 80000 / 1000000
Starting p-value computation 90000 / 1000000
Starting p-value computation 100000 / 1000000
Starting p-value computation 110000 / 1000000
Starting p-value computation 120000 / 1000000
Starting p-value computation 130000 / 1000000
Starting p-value computation 140000 / 1000000
Starting p-value c