In [14]:
import tensorflow as tf
import numpy as np
import os
import re
import pickle
import html 

# Get Data for Language Model

In [3]:
# Get Wikitext 103
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip

--2019-08-17 17:29:35--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.86.149
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.86.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190229076 (181M) [application/zip]
Saving to: ‘wikitext-103-v1.zip’


2019-08-17 17:29:42 (31.8 MB/s) - ‘wikitext-103-v1.zip’ saved [190229076/190229076]



In [4]:
!unzip wikitext-103-v1.zip

Archive:  wikitext-103-v1.zip
   creating: wikitext-103/
  inflating: wikitext-103/wiki.test.tokens  
  inflating: wikitext-103/wiki.valid.tokens  
  inflating: wikitext-103/wiki.train.tokens  


# Get IMDB Dataset 

In [6]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2019-08-17 17:30:10--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2019-08-17 17:30:18 (11.3 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
!tar xzf aclImdb_v1.tar.gz

In [2]:
!ls

aclImdb  aclImdb_v1.tar.gz  sample_data  wikitext-103  wikitext-103-v1.zip


# Preprocess Data Wikitext


In [2]:
train_path = "wikitext-103/wiki.train.tokens"
valid_path = "wikitext-103/wiki.valid.tokens"
test_path = "wikitext-103/wiki.test.tokens"

In [3]:
def istitle(line):
    return len(re.findall(r'^ = [^=]* = $', line)) != 0
    
UNK = "unk"
def read_wiki(filename):
    articles = []
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    current_article = ''
    for i,line in enumerate(lines):
        current_article += line
        if i < len(lines)-2 and lines[i+1] == ' \n' and istitle(lines[i+2]):
            current_article = current_article.replace('<unk>', UNK)
            articles.append(current_article)
            current_article = ''
    current_article = current_article.replace('<unk>', UNK)
    articles.append(current_article)
    return articles

In [4]:
def preprocess(x):
  x = x.strip().lower()
  
  # fix html 
  re1 = re.compile(r'  +')
  x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace(' @,@ ',',').replace('\\', ' \\ ')
  x=re1.sub(' ', html.unescape(x))
  
  "Add spaces around / and # in `t`. \n" 
  x=re.sub(r'([/#\n])', r' \1 ', x)
  
  "Remove multiple spaces in `t`."
  
  x=re.sub(' {2,}', ' ', x)
  
  return '<S> '+x+' <E>'

In [None]:
wiki_articles = read_wiki(train_path)
wiki_valid_articles = read_wiki(valid_path)

In [None]:
wiki_articles = [preprocess(article) for article in wiki_articles]
wiki_valid_articles = [preprocess(article) for article in wiki_valid_articles]

In [27]:
print(wiki_articles[3])

<S> = gambia women 's national football team = 
 
 the gambia women 's national football team represents the gambia in international football competition . the team , however , has not competed in a match recognised by fifa , the sport 's international governing body , despite that organised women 's football has been played in the country since 1998 . the gambia has two youth teams , an under-17 side that has competed in fifa u-17 women 's world cup qualifiers , and an under-19 side that withdrew from regional qualifiers for an under-19 world cup . the development of a national team faces challenges similar to those across africa , although the national football association has four staff members focusing on women 's football . 
 
 = = the team = = 
 
 in 1985 , few countries had women 's national football teams . while the sport gained popularity worldwide in later decades , the gambia 's national team only played its first game in 2007 . that game was not fifa-recognised . as of mar

In [None]:
with open('data/wiki_articles.pkl', 'wb') as f:
    pickle.dump(wiki_articles, f)

In [None]:
with open('data/wiki_valid_articles.pkl', 'wb') as f:
    pickle.dump(wiki_valid_articles, f)

### Build Vocabulary from training data

In [None]:
from collections import Counter
def build_vocab(data,max_words,min_freq):
    counter = Counter([word for para in data for word in para.split()])
    vocab = [word[0] for word in counter.most_common() if word[1] >= min_freq]
    return vocab[:max_words]
    

In [None]:
train_vocab = build_vocab(wiki_articles,max_words=60000,min_freq=3)

In [30]:
len(train_vocab)

60000

# Preprocess IMDB Dataset

labels :  0 for positive, 1 for negative.

In [8]:
dir_names = ['pos','neg']

imdb_file_paths = []
imdb_labels = []
for i, dir in enumerate(dir_names):
  file_names = [os.path.join("aclImdb/train",dir,name) for name in os.listdir("aclImdb/train/"+dir)]
  imdb_file_paths += file_names
  imdb_labels += [i]*len(os.listdir("aclImdb/train/"+dir))
  


In [9]:
np.random.seed(42)
permutation = np.random.permutation(len(imdb_file_paths))

imdb_file_paths = np.array(imdb_file_paths)[permutation]
imdb_labels = np.array(imdb_labels)[permutation]

In [10]:
print(len(imdb_file_paths),len(imdb_labels))

25000 25000


In [11]:
imdb_reviews = []
for file in imdb_file_paths:
  with open(file,encoding='utf-8') as f:
    data = f.read()
  data = preprocess(data)
  imdb_reviews.append(data)

In [12]:
print(imdb_reviews[0][:300],imdb_labels[0])

<S> an end of an era was released here in the states in spring 2002 with "the rookie," a disney live action film that seemed to be the "best for last!!!!!" it took place right here in texas! actually, the story began in west texas, as evidenced by an area code found on a sign over there. it was abou 0


In [15]:
with open('data/imdb_reviews.pkl', 'wb') as f:
    pickle.dump(imdb_reviews, f)
    
with open('data/imdb_labels.pkl', 'wb') as f:
    pickle.dump(imdb_labels, f)

# Get IMDB Validation Data 

In [16]:
imdb_valid_file_paths = []
imdb_valid_labels = []
for i, dir in enumerate(dir_names):
  file_names = [os.path.join("aclImdb/test",dir,name) for name in os.listdir("aclImdb/test/"+dir)]
  imdb_valid_file_paths += file_names
  imdb_valid_labels += [i]*len(os.listdir("aclImdb/test/"+dir))

In [17]:
np.random.seed(42)
permutation = np.random.permutation(len(imdb_valid_file_paths))

imdb_valid_file_paths = np.array(imdb_valid_file_paths)[permutation]
imdb_valid_labels = np.array(imdb_valid_labels)[permutation]

In [18]:
imdb_valid_reviews = []
for file in imdb_valid_file_paths:
  with open(file,encoding='utf-8') as f:
    data = f.read()
  data = preprocess(data)
  imdb_valid_reviews.append(data)

In [19]:
print(imdb_valid_reviews[0][0:300],imdb_valid_labels[0])

<S> i rank opera as one of the better argento films. plot holes and inconsistencies? sure, but i don't think they impair this film as much as many other reviewers seem to. a lot of elements that are in many of argento's films are kinda "off-the-wall", but that's part of the draw of his films... 
 
  0


In [21]:
with open('data/imdb_valid_reviews.pkl', 'wb') as f:
    pickle.dump(imdb_valid_reviews, f)
    
with open('data/imdb_valid_labels.pkl', 'wb') as f:
    pickle.dump(imdb_valid_labels, f)