# 本程式需要在Google Colab才能執行

## 下載GloVe

In [1]:
!git clone https://github.com/stanfordnlp/GloVe.git

Cloning into 'GloVe'...
remote: Enumerating objects: 436, done.[K
remote: Total 436 (delta 0), reused 0 (delta 0), pack-reused 436[K
Receiving objects: 100% (436/436), 179.83 KiB | 6.42 MiB/s, done.
Resolving deltas: 100% (239/239), done.


## 下載IMDB資料集

In [0]:
import urllib.request
import os
import tarfile

In [3]:
if not os.path.exists('data/'):
    os.makedirs('data/')

url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

downloaded: ('data/aclImdb_v1.tar.gz', <http.client.HTTPMessage object at 0x7f5b8dfe0588>)


In [0]:
if not os.path.exists("data/aclImdb"):
    tfile = tarfile.open("data/aclImdb_v1.tar.gz", 'r:gz')
    result=tfile.extractall('data/')

## 讀取檔案

In [0]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [0]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list=[]

    positive_path=path + filetype+"/pos/"
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]
    
    negative_path=path + filetype+"/neg/"
    for f in os.listdir(negative_path):
        file_list+=[negative_path+f]
        
    print('read',filetype, 'files:',len(file_list))
       
    all_labels = ([1] * 12500 + [0] * 12500) 
    
    all_texts  = []
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels,all_texts

In [7]:
y_train,train_text=read_files("train")

read train files: 25000


In [8]:
y_test,test_text=read_files("test")

read test files: 25000


## 將資料集文字移除標點符號，合併成一個list

In [0]:
import string
table = str.maketrans('', '', string.punctuation)
word_list = []

for line in train_text + test_text:
    line = line.lower()
    line = re.sub(r"([a-z]+)'[a-z]\b", '', line)
    line = line.translate(table)
    line = re.sub(r'  +', ' ', line)
    line = line.strip().split(' ')
    word_list.extend(line)

In [10]:
print(word_list[:100])

['this', 'is', 'a', 'ripsnorting', 'oldfashioned', 'adventure', 'yarn', 'i', 'understand', 'that', 'by', 'political', 'standards', 'the', 'treatment', 'of', 'the', 'indians', 'was', 'unacceptable', 'but', 'this', 'moving', 'about', 'politics', 'about', 'action', 'dialogue', 'comradery', 'acting', 'direction', 'music', 'and', 'photography', 'and', 'marvelous', 'on', 'all', 'these', 'factors', 'grant', 'fairbanks', 'and', 'mclaglen', 'are', 'electric', 'together', 'and', 'jaffe', 'is', 'superb', 'this', 'is', 'the', 'ultimate', 'buddy', 'movie', 'great', 'little', 'thriller', 'i', 'was', 'expecting', 'some', 'type', 'of', 'silly', 'horror', 'movie', 'but', 'what', 'i', 'got', 'was', 'tight', 'short', 'thriller', 'that', 'waste', 'none', 'of', 'our', 'time', 'mostof', 'these', 'movies', 'we', 'have', 'to', 'get', 'into', 'the', 'back', 'characters', 'stories', 'so', 'we', 'will', 'either', 'feel']


In [0]:
with open('GloVe/imdb', 'w', encoding='utf8') as f:
    f.write(' '.join(word_list))

## 寫入自訂demo.sh到GloVe資料夾
https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

In [0]:
demosh = """
#!/bin/bash
set -e

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

make

CORPUS=imdb
VOCAB_FILE=imdb.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors-imdb
VERBOSE=2
MEMORY=4.0
VOCAB_MIN_COUNT=5
VECTOR_SIZE=50
MAX_ITER=15
WINDOW_SIZE=15
BINARY=2
NUM_THREADS=8
X_MAX=10

echo
echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
$BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
$BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
if [ "$CORPUS" = 'imdb' ]; then
   if [ "$1" = 'matlab' ]; then
       matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 
   elif [ "$1" = 'octave' ]; then
       octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
   else
       echo "$ python eval/python/evaluate.py --vocab_file $VOCAB_FILE --vectors_file $SAVE_FILE.txt"
       python eval/python/evaluate.py --vocab_file $VOCAB_FILE --vectors_file $SAVE_FILE.txt
   fi
fi
"""
with open('GloVe/demo-imdb.sh', 'w', encoding='utf8') as f:
    f.write(demosh)
os.chmod("GloVe/demo-imdb.sh", 0o777)

## 訓練

In [13]:
!cd GloVe && ./demo-imdb.sh

mkdir -p build
gcc src/glove.c -o build/glove -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic
[01m[Ksrc/glove.c:[m[K In function ‘[01m[Kglove_thread[m[K’:
         [01;35m[Kfread(&cr, sizeof(CREC), 1, fin)[m[K;
         [01;35m[K^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[m[K
gcc src/shuffle.c -o build/shuffle -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic
[01m[Ksrc/shuffle.c:[m[K In function ‘[01m[Kshuffle_merge[m[K’:
                 [01;35m[Kfread(&array[i], sizeof(CREC), 1, fid[j])[m[K;
                 [01;35m[K^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[m[K
[01m[Ksrc/shuffle.c:[m[K In function ‘[01m[Kshuffle_by_chunks[m[K’:
         [01;35m[Kfread(&array[i], sizeof(CREC), 1, fin)[m[K;
         [01;35m[K^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[m[K
gcc src/cooccur.c -o build/cooccur -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic
[01m[Ksrc/cooccur.c:[m[K In function ‘