In [4]:
# (Re)load IPython's autoreload extension and set it to mode 2, which reloads
# imported modules before each cell runs, so edits to imported modules are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# display_markdown is used by later cells to emit markdown headings from code.
from IPython.display import display_markdown

In [None]:
# Load the object exposed by the `en_vectors_web_lg` package; `nlp` is later
# passed as the first argument to KerasModel.
# NOTE(review): presumably this is the spaCy word-vector model package of the
# same name — confirm it is installed in the kernel's environment.
import en_vectors_web_lg
nlp = en_vectors_web_lg.load()

In [None]:
# Forwarded to KerasModel as `max_words_in_sentence`.
MAX_WORDS_IN_SENTENCE = 300

# Forwarded to load_imdb as `limit`; values that are not strictly positive
# are labelled "all" in the dataset id below.
LIMIT = -1

# Identifier handed to KerasModel: "imdb_<LIMIT>" for a positive limit,
# otherwise "imdb_all".
dataset_id = 'imdb_' + (str(LIMIT) if LIMIT > 0 else 'all')

In [12]:
# NOTE(review): train_test_split is not used in any cell visible here.
from sklearn.model_selection import train_test_split
from shared.data import load_imdb

# Training split: LIMIT is forwarded unchanged.
print("Loading training data...")
X_train, y_train = load_imdb('../data/aclImdb/train', limit=LIMIT)

print("Loading test data...")
# The test split is always capped, because loading all of it ran into memory
# problems. Only a strictly positive LIMIT counts as an explicit limit (the
# same rule dataset_id applies), so LIMIT == 0 now receives the cap like any
# other "no limit" value instead of being forwarded as limit=0.
TEST_LIMIT_CAP = 1000
test_limit = LIMIT if 0 < LIMIT <= TEST_LIMIT_CAP else TEST_LIMIT_CAP
X_test, y_test = load_imdb('../data/aclImdb/test', limit=test_limit)

Loading training data...
Loading test data...


In [13]:
from machine_learning.plot_helpers import describe_data

# Render the summary of the training split (data sample, text, word-length
# and label statistics, as seen in this cell's output).
describe_data(X_train, y_train)

# Report the size of each split as a markdown heading.
for heading, split in (
    ('### Train count: {}', X_train),
    ('### Test count:  {}', X_test),
):
    display_markdown(heading.format(len(split)), raw=True)

### Data sample

Unnamed: 0,X,y
0,"b""Dan Katzir has produced a wonderful film tha...",1
1,b'If you want Scream or anything like the big-...,1
2,"b""Outlandish premise that rates low on plausib...",0
3,b'Let\'s face it-- if you rented a STDVD seque...,0
4,b'Bizarre Tobe Hooper exercise regarding an un...,0
5,"b'Well, maybe the PC version of this game was ...",0
6,"b'Watching ""Kroko"" I would have liked to leave...",0
7,"b""First of all, I have to start this comment b...",1
8,"b""Very much a film from the times -- extremely...",0
9,"b'""The Invisible Ray"" is part science fiction ...",1


#### Text stats

count                                                 25000
unique                                                24904
top       b"This show comes up with interesting location...
freq                                                      3
Name: X, dtype: object

#### Words length stats

count    25000.000000
mean       233.776720
std        173.715418
min         10.000000
25%        127.000000
50%        174.000000
75%        284.000000
max       2470.000000
Name: X, dtype: float64

#### Labels stats

count    25000.00000
mean         0.50000
std          0.50001
min          0.00000
25%          0.00000
50%          0.50000
75%          1.00000
max          1.00000
Name: y, dtype: float64

#### Labels counts

1    12500
0    12500
Name: y, dtype: int64

1    0.5
0    0.5
Name: y, dtype: float64

### Train count: 25000

### Test count:  1000

In [7]:
from shared.models import KerasModel
from machine_learning.plot_helpers import plot_training

display_markdown('#### Training or loading model', raw=True)

# Keyword options are collected first so the constructor call stays short.
model_options = dict(max_words_in_sentence=MAX_WORDS_IN_SENTENCE, epochs=50)
model = KerasModel(nlp, dataset_id, **model_options)

# Prefer a previously saved model; an IOError from the load attempt triggers
# training from scratch, after which the model is saved and the training
# history plotted.
try:
    print("Loading model...")
    model.load()
    loaded_message = "Model '{}' loaded".format(model.filename)
    print(loaded_message)
except IOError:
    print("Unable to load model, training...")
    history = model.train(X_train, y_train)
    model.save()
    saved_message = "Model '{}' saved".format(model.filename)
    print(saved_message)
    plot_training(history)

Using TensorFlow backend.


#### Training or loading model

Loading model...
Unable to load model, training...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model 'keras_imdb_all_999bd479.h5' saved


In [49]:
display_markdown("#### Overview of created convolutional network architecture", raw=True)

# summary() prints the layer table itself and returns None (the recorded
# output ended with a stray "None"), so it is called directly instead of
# being wrapped in display().
model.summary()

#### Overview of created convolutional network architecture

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 296, 32)           48032     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 59, 32)            0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 59, 32)            128       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 55, 64)            10304     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 64)                256       
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
__________

None

In [43]:
import numpy as np
# pandas is used below as `pd` but was not imported by any earlier cell, so
# the cell failed with a NameError on a fresh kernel.
import pandas as pd

from sklearn import pipeline
from shared import transformers

# Hand-written example reviews used to sanity-check the trained model.
raw_samples = np.array([
    "This movie is, in my opinion, very worth watching!",
    "I kinda liked that movie. Maybe it's not as good as other, but still watchable",
    "I have to warn everybody, this movie is really bad. Actors don't know how to play. It su!*!",
], dtype='object')

display_markdown("#### Raw text", raw=True)

for text in raw_samples:
    display(text)

display_markdown("#### Cleaned text", raw=True)

# Show what the text-cleaning transformer does to the raw samples.
clear_pipeline = pipeline.Pipeline([
    ('clear', transformers.ClearTextTransformer()),
])

for text in clear_pipeline.transform(raw_samples):
    display(text)

display_markdown("#### Predicted scores", raw=True)
# None removes the column-width limit so the full review text is shown; the
# former value -1 is the deprecated spelling of "no limit".
with pd.option_context("display.max_colwidth", None):
    display(pd.DataFrame({
        "text": pd.Series(raw_samples),
        # Column 1 of predict_proba is reported as the score.
        # NOTE(review): presumably the probability of the positive label — confirm.
        "score": pd.Series(model.predict_proba(raw_samples)[:, 1].reshape(len(raw_samples))),
    }))


#### Raw text

'This movie is, in my opinion, very worth watching!'

"I kinda liked that movie. Maybe it's not as good as other, but still watchable"

"I have to warn everybody, this movie is really bad. Actors don't know how to play. It su!*!"

#### Cleaned text

'this movie is in my opinion very worth watching!'

"i kinda liked that movie maybe it's not as good as other but still watchable"

"i have to warn everybody this movie is really bad actors don't know how to play it su! !"

#### Predicted scores

Unnamed: 0,score,text
0,0.95733,"This movie is, in my opinion, very worth watching!"
1,0.718341,"I kinda liked that movie. Maybe it's not as good as other, but still watchable"
2,3.8e-05,"I have to warn everybody, this movie is really bad. Actors don't know how to play. It su!*!"


In [None]:
# Run the project's evaluation helper on the trained model with the test split
# loaded earlier.
# NOTE(review): what the report contains is defined in
# machine_learning.evaluation and is not visible here; this cell has no
# recorded output.
from machine_learning.evaluation import evaluate_and_report
evaluate_and_report(model, X_test, y_test)