In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
import numpy as np

from shared.data import load_fine_food_reviews
from IPython.display import display_markdown

In [8]:
# Load the pre-trained `en_vectors_web_lg` package (presumably spaCy's large
# English word-vector model -- TODO confirm against the environment).
# The resulting `nlp` object is handed to KerasModel as its first argument
# further down in this notebook.
import en_vectors_web_lg
nlp = en_vectors_web_lg.load()

In [9]:
# Notebook-wide settings.
MAX_WORDS_IN_SENTENCE = 200  # forwarded to KerasModel as max_words_in_sentence
TEXT_COLUMN = "Text"         # forwarded to load_fine_food_reviews as text_column
LIMIT = -1                   # forwarded to load_fine_food_reviews as limit

# Identifier handed to KerasModel. A non-positive LIMIT is tagged as 'all',
# so the settings above produce 'food_text_all'.
if LIMIT > 0:
    limit_tag = LIMIT
else:
    limit_tag = 'all'
dataset_id = 'food_{}_{}'.format(TEXT_COLUMN.lower(), limit_tag)

In [10]:
from sklearn.model_selection import train_test_split

# Load the review texts and their labels; LIMIT and TEXT_COLUMN come from the
# settings cell above.
X, y = load_fine_food_reviews(text_column=TEXT_COLUMN, limit=LIMIT)

# The split is done on positions rather than on the data itself, so X and y
# can both be indexed later with idx_train / idx_test.
indices = np.arange(len(X))

# Hold out a quarter of the samples for testing, capped at 2000 because
# larger test sets caused memory problems.
test_count = min(int(0.25 * len(X)), 2000)
idx_train, idx_test = train_test_split(
    indices,
    test_size=test_count,
    random_state=42,  # fixed seed keeps the split reproducible between runs
)

In [11]:
import pandas as pd
# Throwaway frame used only for the summaries rendered by this cell.
# Only the first column of y is shown as the label.
data = pd.DataFrame({'X': pd.Series(X), 'y': pd.Series(y[:, 0])})

display_markdown("### Data sample", raw=True)
display(data.head(10))

display_markdown('#### Text stats', raw=True)
display(data['X'].describe())

# Number of whitespace-separated tokens per review.
display_markdown('#### Words length stats', raw=True)
display(data['X'].map(lambda text: len(text.split())).describe())

display_markdown('#### Labels stats', raw=True)
display(data['y'].describe())

# Absolute counts first, then the same counts as fractions of the total.
display_markdown('#### Labels counts', raw=True)
display(
    data['y'].value_counts(),
    data['y'].value_counts(normalize=True),
)

display_markdown('### Train count: {}'.format(len(idx_train)), raw=True)
display_markdown('### Test count:  {}'.format(len(idx_test)), raw=True)

# The frame existed purely for presentation; drop the reference once the
# summaries have been rendered.
del data

### Data sample

Unnamed: 0,X,y
0,i have bought several of the vitality canned d...,1
1,product arrived labeled as jumbo salted peanut...,0
2,this is a confection that has been around a fe...,1
3,if you are looking for the secret ingredient i...,0
4,great taffy at a great price. there was a wid...,1
5,i got a wild hair for taffy and ordered this f...,1
6,this saltwater taffy had great flavors and was...,1
7,this taffy is so good. it is very soft and ch...,1
8,right now i'm mostly just sprouting this so my...,1
9,this is a very healthy dog food. good for thei...,1


#### Text stats

count                                                525814
unique                                               363827
top       this review will make me sound really stupid, ...
freq                                                    199
Name: X, dtype: object

#### Words length stats

count    525814.000000
mean         79.016667
std          78.447587
min           3.000000
25%          33.000000
50%          55.000000
75%          96.000000
max        2520.000000
Name: X, dtype: float64

#### Labels stats

count    525814.000000
mean          0.843981
std           0.362874
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: y, dtype: float64

#### Labels counts

1    443777
0     82037
Name: y, dtype: int64

1    0.843981
0    0.156019
Name: y, dtype: float64

### Train count: 523814

### Test count:  2000

In [12]:
from shared.models import KerasModel
from machine_learning.plot_helpers import plot_training

display_markdown('#### Training or loading model', raw=True)

# Classifier built from the loaded word vectors (`nlp`) and the dataset
# identifier computed in the settings cell.
model = KerasModel(
    nlp,
    dataset_id,
    epochs=5,
    max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
)

# Reuse a previously saved model when load() succeeds. Only an IOError raised
# inside the try block falls back to training, saving and plotting a new one.
try:
    print("Loading model...")
    model.load()
    print("Model '{}' loaded".format(model.filename))
except IOError:
    print("Unable to load model, training...")
    # X / y are passed whole together with the train and test positions.
    history = model.train(X, y, idx_train, idx_test)
    model.save()
    print("Model '{}' saved".format(model.filename))
    # The training curves are only available when training ran in this session.
    plot_training(history)

#### Training or loading model

Loading model...
Model 'keras_food_text_all_b3f214f0.h5' loaded


In [13]:
import numpy as np

# Spot-check the classifier on a few hand-written phrases.
# NOTE(review): the "expectations met" / "expectations not met" pair presumably
# probes how the model handles negation -- confirm the intent.
sample_phrases = [
    "it was very good",
    "Dinner was awful! impossible to eat. Never visit",
    "expectations met",
    "expectations not met",
    "tasty",
]

# One row per phrase; presumably the columns are
# [P(negative), P(positive)] -- TODO confirm against KerasModel.predict_proba.
model.predict_proba(np.array(sample_phrases, dtype='object'))

array([[ 0.27826458,  0.72173542],
       [ 0.64885449,  0.35114554],
       [ 0.31527561,  0.68472439],
       [ 0.31725574,  0.68274426],
       [ 0.29618645,  0.70381355]], dtype=float32)

In [1]:
from machine_learning.plot_helpers import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report


def check_model(X_test, y_test, model):
    """Print a classification report and plot the confusion matrix for ``model``.

    Parameters
    ----------
    X_test
        Samples passed unchanged to ``model.predict``.
    y_test
        True labels that the predictions are compared against.
    model
        Object exposing ``predict(X)`` and a ``NAME`` attribute; ``NAME`` is
        used in the report header.

    Returns
    -------
    None
        The report is printed and the confusion matrix is drawn through
        ``plot_confusion_matrix``.
    """
    # Display names for the two classes; presumably label 0 is the negative
    # class and label 1 the positive one -- confirm against the data loader.
    label_names = ['Negative', 'Positive']
    predictions = model.predict(X_test)

    header = "Classification report for {}\n".format(model.NAME)
    print(header)
    report = classification_report(y_test, predictions, target_names=label_names)
    print(report)

    matrix = confusion_matrix(y_test, predictions)
    plot_confusion_matrix(matrix, label_names)

In [2]:
# Evaluate on the held-out positions only. X, y, idx_test and model must
# already exist in the current kernel, so the data-loading, split and model
# cells have to be run first (e.g. via Restart & Run All).
check_model(X_test=X[idx_test], y_test=y[idx_test], model=model)

NameError: name 'X' is not defined