In [1]:
!pip install -q tensorflow-gpu==2.0.0-beta1

[K     |████████████████████████████████| 348.9MB 74kB/s 
[K     |████████████████████████████████| 501kB 48.8MB/s 
[K     |████████████████████████████████| 3.1MB 33.7MB/s 
[?25h

In [0]:
import tensorflow as tf
import numpy as np
import os
import re
import html 

# Get Data for Language Model

In [3]:
# Get Wikitext 103
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip

--2019-08-17 17:29:35--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.86.149
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.86.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190229076 (181M) [application/zip]
Saving to: ‘wikitext-103-v1.zip’


2019-08-17 17:29:42 (31.8 MB/s) - ‘wikitext-103-v1.zip’ saved [190229076/190229076]



In [4]:
!unzip wikitext-103-v1.zip

Archive:  wikitext-103-v1.zip
   creating: wikitext-103/
  inflating: wikitext-103/wiki.test.tokens  
  inflating: wikitext-103/wiki.valid.tokens  
  inflating: wikitext-103/wiki.train.tokens  


# Get IMDB Dataset 

In [6]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2019-08-17 17:30:10--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2019-08-17 17:30:18 (11.3 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
!tar xzf aclImdb_v1.tar.gz

In [2]:
!ls

aclImdb  aclImdb_v1.tar.gz  sample_data  wikitext-103  wikitext-103-v1.zip


# Preprocess Data Wikitext


In [0]:
train_path = "wikitext-103/wiki.train.tokens"
valid_path = "wikitext-103/wiki.valid.tokens"
test_path = "wikitext-103/wiki.test.tokens"

In [0]:
def istitle(line):
    return len(re.findall(r'^ = [^=]* = $', line)) != 0
    
UNK = "unk"
def read_wiki(filename):
    articles = []
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    current_article = ''
    for i,line in enumerate(lines):
        current_article += line
        if i < len(lines)-2 and lines[i+1] == ' \n' and istitle(lines[i+2]):
            current_article = current_article.replace('<unk>', UNK)
            articles.append(current_article)
            current_article = ''
    current_article = current_article.replace('<unk>', UNK)
    articles.append(current_article)
    return articles

In [0]:
def preprocess(x):
  x = x.strip().lower()
  
  # fix html 
  re1 = re.compile(r'  +')
  x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace(' @,@ ',',').replace('\\', ' \\ ')
  x=re1.sub(' ', html.unescape(x))
  
  "Add spaces around / and # in `t`. \n" 
  x=re.sub(r'([/#\n])', r' \1 ', x)
  
  "Remove multiple spaces in `t`."
  
  x=re.sub(' {2,}', ' ', x)
  
  return '<S> '+x+' <E>'

In [0]:
wiki_articles = read_wiki(train_path)
wiki_valid_articles = read_wiki(valid_path)

In [0]:
wiki_articles = [preprocess(article) for article in wiki_articles]
wiki_valid_articles = [preprocess(article) for article in wiki_valid_articles]

In [27]:
print(wiki_articles[3])

<S> = gambia women 's national football team = 
 
 the gambia women 's national football team represents the gambia in international football competition . the team , however , has not competed in a match recognised by fifa , the sport 's international governing body , despite that organised women 's football has been played in the country since 1998 . the gambia has two youth teams , an under-17 side that has competed in fifa u-17 women 's world cup qualifiers , and an under-19 side that withdrew from regional qualifiers for an under-19 world cup . the development of a national team faces challenges similar to those across africa , although the national football association has four staff members focusing on women 's football . 
 
 = = the team = = 
 
 in 1985 , few countries had women 's national football teams . while the sport gained popularity worldwide in later decades , the gambia 's national team only played its first game in 2007 . that game was not fifa-recognised . as of mar

### Build Vocabulary from training data

In [0]:
from collections import Counter
def build_vocab(data,max_words,min_freq):
    counter = Counter([word for para in data for word in para.split()])
    vocab = [word[0] for word in counter.most_common() if word[1] >= min_freq]
    return vocab[:max_words]
    

In [0]:
train_vocab = build_vocab(wiki_articles,max_words=60000,min_freq=3)

In [30]:
len(train_vocab)

60000

# Preprocess IMDB Dataset

labels :  0 for positive, 1 for negative.

In [0]:
dir_names = ['pos','neg']

imdb_file_paths = []
imdb_labels = []
for i, dir in enumerate(dir_names):
  file_names = [os.path.join("aclImdb/train",dir,name) for name in os.listdir("aclImdb/train/"+dir)]
  imdb_file_paths += file_names
  imdb_labels += [i]*len(os.listdir("aclImdb/train/"+dir))
  


In [0]:
np.random.seed(42)
permutation = np.random.permutation(len(file_paths))

imdb_file_paths = np.array(file_paths)[permutation]
imdb_labels = np.array(labels)[permutation]

In [55]:
print(len(imdb_file_paths),len(imdb_labels))

25000 25000


In [0]:
imdb_reviews = []
for file in imdb_file_paths:
  with open(file,encoding='utf-8') as f:
    data = f.read()
  data = preprocess(data)
  imdb_reviews.append(data)

In [57]:
imdb_reviews[0]

"<S> jimmy stewart and anthony mann teamed to do some of the best westerns ever made and this is one of the best. \n \n the real star of the film however is the spectacular canadian rockies that serve as a backdrop for the story. some of the best cinematography ever done in the history of film. \n \n in all five of the westerns that stewart and mann did together the supporting roles were perfectly cast. no exception here, right down to parts that might only have a few lines, the characters are firmly etched with those lines. \n \n stewart is a cynical hard-bitten loner in this film whose only real friend is his sidekick walter brennan. it's brennan's death at the hands of the villains that makes him want to finally free the gold settlement from the bad guys and incidentally redeem himself in the process. \n \n john mcintire is the head villain of the piece and he was an under-appreciated actor with a vast range. he could play delightful old codgers, authority figures and in this case a

In [58]:
labels[0]

0