In [9]:
!wget https://s3-us-west-1.amazonaws.com/restaurant-review-data/kaggle/whats-cooking/train.json.zip
!unzip train.json.zip

--2018-04-24 02:54:57--  https://s3-us-west-1.amazonaws.com/restaurant-review-data/kaggle/whats-cooking/train.json.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.20.105
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.20.105|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1845320 (1.8M) [application/zip]
Saving to: ‘train.json.zip’


2018-04-24 02:54:57 (3.76 MB/s) - ‘train.json.zip’ saved [1845320/1845320]



The code below is for tagging docs with ingredients

In [1]:
import re
from functools import partial
import json
with open('train.json') as f:
    whatscooking = json.load(f)
    ingredients = [i['ingredients'] for i in whatscooking]
    
ingredients = [ingredient for ingredientlist in ingredients for ingredient in ingredientlist]
ingredients = map(str.lower, ingredients)

clean_up_strings = [
    '\(.+oz\.\)'
]
clean_up_patterns = [re.compile(p) for p in clean_up_strings]

def clean(patterns, string):
    for pattern in patterns:
        string = pattern.sub("", string)
    return string.strip()

clean = partial(clean, clean_up_patterns)

ingredients = map(clean, ingredients)
ingredients = sorted(list(set(ingredients)))


In [2]:
def yield_subchars(string):
    for i in range(len(string)):
        yield string[:i+1]
        
def to_string_list(doc_or_span):
    return [i.norm_ + i.whitespace_ for i in doc_or_span]

def to_string(doc_or_span):
    return "".join(to_string_list(doc_or_span))


def yield_subtokens(doc):
    for i in range(len(doc)):
        yield doc[:i+1]
        
def yield_substring_lists(doc_or_span):
    for i in range(len(doc)):
        yield to_string_list(doc[:i+1])        

In [3]:
import logging 
import spacy
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token
#Token.set_extension('ingredient', default=None, force=True)
from spacy.tokens.span import Span
from spacy.matcher import PhraseMatcher
# try:
#     Token.set_extension('ingredient', default=None)
# except ValueError:
#     # make idempotent
#     pass
    
nlp = spacy.load('en')
class Parser:
    
    def __init__(self, nlp, token_bank):
        self.nlp = nlp
        self.token_bank = token_bank
        #self.token_graph = self.build_token_graph()
        self.matcher = PhraseMatcher(nlp.vocab)
        self.nlp.entity.add_label('INGREDIENT') 
        self.add_patterns_to_matcher()
        self.nlp.add_pipe(self.match, last=True)
        
    def __call__(self, text, *args, **kwargs):
        return self.nlp(text, *args, **kwargs)
    
    def pipe(self, text, *args, **kwargs):
        return self.nlp.pipe(text, *args, **kwargs)    
        
        
    def match(self, doc):
        self.matcher(doc)
        return doc
    
    def on_match(self, matcher, doc, i, matches):
        
        match_id, start, end = matches[i]
        span = doc[start : end]
        INGREDIENT = self.nlp.vocab.strings['INGREDIENT'] # get hash value of entity label
        ingredient_ent = Span(doc, start, end, label=INGREDIENT) # create a Span for the new entity
        doc.ents = doc.ents + (ingredient_ent, )

    def add_patterns_to_matcher(self):
        n_patterns, n_nonpatterns = 0, 0
        for doc in self.nlp.pipe(self.token_bank, disable=['parser','tagger']):
            rep = to_string(doc)
            if len(doc) < 10:
                n_patterns +=1
                self.matcher.add(rep, self.on_match, doc)
            else:
                n_nonpatterns +=1     
        logging.warning(f"{n_nonpatterns} patterns were too long to call to matcher")
              

    @staticmethod
    def get_ingredient(token):
        return token.doc.user_data.get((token.i, u'ingredient'))
    @staticmethod
    def set_user_id(token, value):
        token.doc.ingredient[(token.i, u'ingredient')] = value
        
    def build_token_graph(self):
        """
        refactor to pass over token bank just once you idiot.
        """
        g = nx.DiGraph()
        for doc in nlp.pipe(self.token_bank, disable=['ner','parser','tagger']):
            doclist = to_string_list(doc)
            docstring = ''.join(doclist)
            g.add_node(docstring)
        for doc in nlp.pipe(self.token_bank, disable=['ner','parser','tagger']):
            doclist = to_string_list(doc)
            docstring = ''.join(doclist)            
            for subdoc in yield_substring_lists(doc):
                substring = "".join(subdoc)
                if substring in g:
                    g.add_edge(substring, docstring)
        return g
              
        
        

In [4]:
p = Parser(nlp, ingredients)



In [5]:
text = "i like to order two eggs in my ramen"

i = 1
doc = p(text)
    
print(doc)

for ent in filter(lambda x: x.label_ == 'INGREDIENT', doc.ents):
    print(ent, ent.label_)

i like to order two eggs in my ramen
eggs INGREDIENT
ramen INGREDIENT


In [6]:
from pandas import read_csv

df = read_csv("../dataset/review.csv", nrows=100000)

i = 1
doc = p(df.text.iloc[i])
    
print(doc)

for ent in filter(lambda x: x.label_ == 'INGREDIENT', doc.ents):
    print(ent, ent.label_)

b"Super simple place but amazing nonetheless. It's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. \n\nStaff was very helpful and friendly."
bologna INGREDIENT
salami INGREDIENT
mustard INGREDIENT


### Create an ingredient matrix:
$f(ingredients) \rightarrow rating$?

In [7]:
class DefaultDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(self, *args, **kwargs)
    def __missing__(self, key):
        self[key] = len(self)
        return self[key]
        
        
d = DefaultDict()


In [8]:
from tqdm import tqdm_notebook, tqdm
from scipy.sparse import lil_matrix

vecs = []
missing_ingredients = []
labels = DefaultDict()
labels2 = DefaultDict()

N= len(df)
for doc in tqdm_notebook(p.pipe(df.text.values, disable=['parser','tagger', 'ner'], n_threads=4), total = N):
    codes = list()
    for ingredient in map(lambda x: x.string, filter(lambda x: x[0].ent_type_ == 'INGREDIENT', doc.ents)):
        code = labels[ingredient]
        codes.append(code)
    vecs.append(codes)
import numpy as np
    
X = np.zeros((N, len(labels)))

for i, vec in tqdm_notebook(enumerate(vecs), total=len(vecs)):
    for j in vec:
        X[i, j] = 1    
        
        








In [10]:
%load_ext Cython

In [17]:
%%cython_pyximport
import random
from cymem.cymem cimport Pool

from libc.math cimport sqrt

cimport cython

cdef struct Point:
    double x
    double y

cdef class World:
    cdef Pool mem
    cdef int N
    cdef double* m
    cdef Point* r
    cdef Point* v
    cdef Point* F
    cdef readonly double dt
    def __init__(self, N, threads=1, m_min=1, m_max=30.0, r_max=50.0, v_max=4.0, dt=1e-3):
        self.mem = Pool()
        self.N = N
        self.m = <double*>self.mem.alloc(N, sizeof(double))
        self.r = <Point*>self.mem.alloc(N, sizeof(Point))
        self.v = <Point*>self.mem.alloc(N, sizeof(Point))
        self.F = <Point*>self.mem.alloc(N, sizeof(Point))
        for i in range(N):
            self.m[i] = random.uniform(m_min, m_max)
            self.r[i].x = random.uniform(-r_max, r_max)
            self.r[i].y = random.uniform(-r_max, r_max)
            self.v[i].x = random.uniform(-v_max, v_max)
            self.v[i].y = random.uniform(-v_max, v_max)
            self.F[i].x = 0
            self.F[i].y = 0
        self.dt = dt


@cython.cdivision(True)
def compute_F(World w):
    """Compute the force on each body in the world, w."""
    cdef int i, j
    cdef double s3, tmp
    cdef Point s
    cdef Point F
    for i in range(w.N):
        # Set all forces to zero.
        w.F[i].x = 0
        w.F[i].y = 0
        for j in range(i+1, w.N):
            s.x = w.r[j].x - w.r[i].x
            s.y = w.r[j].y - w.r[i].y

            s3 = sqrt(s.x * s.x + s.y * s.y)
            s3 *= s3 * s3;

            tmp = w.m[i] * w.m[j] / s3
            F.x = tmp * s.x
            F.y = tmp * s.y

            w.F[i].x += F.x
            w.F[i].y += F.y

            w.F[j].x -= F.x
            w.F[j].y -= F.y


@cython.cdivision(True)
def evolve(World w, int steps):
    """Evolve the world, w, through the given number of steps."""
    cdef int _, i
    for _ in range(steps):
        compute_F(w)
        for i in range(w.N):
            w.v[i].x += w.F[i].x * w.dt / w.m[i]
            w.v[i].y += w.F[i].y * w.dt / w.m[i]
            w.r[i].x += w.v[i].x * w.dt
            w.r[i].y += w.v[i].y * w.dt

ValueError: module name must be given

In [14]:
%%cython

cdef g():
    return 1

%%ti

In [15]:
g

NameError: name 'g' is not defined

In [9]:
from sklearn.preprocessing import LabelBinarizer
#y = LabelBinarizer().fit_transform(df['stars'])
y = df['stars'].values

In [114]:
from keras.optimizers import Adagrad, Adam
from keras.losses import categorical_crossentropy, binary_crossentropy, mean_absolute_error
from keras.layers import Dense
from keras.models import Sequential
model = Sequential()
model.add(Dense(12, input_shape = (X.shape[1], ), activation='relu'))
model.add(Dense(1))
#model.add(Dense(y.shape[1]))
model.compile(optimizer='adam', loss=mean_absolute_error, metrics=['mean_absolute_error'])
model.fit(X, y, epochs=25, validation_split=.2, batch_size=1000)

Train on 80000 samples, validate on 20000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f66c98bb198>

In [134]:
import pandas as pd 
rinds = [r_labels[i] for i in range(len(weights))]
r_labels = {j: i for i, j in labels.items()}
weights = pd.DataFrame(model.layers[0].get_weights()[0])
weights.index = rinds

from sklearn.metrics.pairwise import cosine_similarity
sims = pd.DataFrame(cosine_similarity(weights), columns=rinds, index = rinds)


In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X, y)

In [1]:
lr

NameError: name 'lr' is not defined

In [69]:
model.get_weights()

[array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], dtype=float32),
 array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan], dtype=float32),
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., 

In [34]:
weights = model.weights[0]
import tensorflow as tf
tf.global_variables_initializer()
with tf.Session() as session:
    w = weights.eval(session)

FailedPreconditionError: Attempting to use uninitialized value dense_1/kernel
	 [[Node: _retval_dense_1/kernel_0_0 = _Retval[T=DT_FLOAT, index=0, _device="/job:localhost/replica:0/task:0/device:CPU:0"](dense_1/kernel)]]

[array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], dtype=float32),
 array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan], dtype=float32)]

In [27]:
w

<tf.Tensor 'dense_1/kernel/read:0' shape=(2722, 100) dtype=float32>

In [1]:
from dask.dataframe import read_csv
df = read_csv("../dataset/review.csv")

In [None]:
print(len(df))

In [1]:
from pandas import read_csv
import spacy
nlp = spacy.load('en')
df = read_csv("../dataset/review.csv", nrows=100000)

In [7]:
doc = nlp(df.text.iloc[0])
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
ents

[('around lunch or dinner hours', 73, 101, 'TIME'),
 ('half', 234, 238, 'CARDINAL'),
 ('Hand', 269, 273, 'GPE')]

In [229]:

bus = read_csv("../dataset/business.csv")

In [231]:
!ls ../dataset/

business.csv				 review.json
business.json				 tip.csv
checkin.csv				 tip.json
checkin.json				 user.csv
Dataset_Challenge_Dataset_Agreement.pdf  user.json
photos.json				 Yelp_Dataset_Challenge_Round_11.pdf
review.csv


In [389]:


class DocVisualizer:
    def __init__(self, text, nlp):
        self.doc = nlp(self.preprocess(text))
        
    @staticmethod
    def preprocess(text):
        return text.replace("\\n", "")
        
    def _build_token_tables(self):
        r = {}
        for i, token in enumerate(self.doc):
            r[i] = self._build_token_view(token)
        return r
    
    def _make_js_pop(self):
        d = self._build_token_tables()
        s = ""
        for key, value in d.items():
            s+= f'mydict[{key}] = `{value}`\n'
        return s
        
    def _build_token_view(self, token):
        tablestr = f"""
        <table style="width:100%">
          <tr>
            <th>Token</th>
            <th>POS</th> 
            <th>DEP</th>
            <th>head</th>            
          </tr>
          <tr>
            <td>{token.lemma_}</td>
            <td>{token.pos_}</td> 
            <td>{token.dep_}</td>
            <td>{token.head}</td>            
          </tr>"""
        return tablestr
        
    def _build_doc_view(self):
        
        raw = "".join(map(lambda token: f"<span id={token.i} onmouseover='highlight(this);' onmouseout='unhighlight(this)'>{token.string + token.whitespace_}</span>", self.doc))
        raw += f"""
        <div id='rdiv'>
        
        </div>
        <script>
        
        function unhighlight(x) {{
          x.style.backgroundColor = 'white'
        }}        
    

        function highlight(x) {{
          x.style.backgroundColor = 'red'
          var inner = mydict[x.id]
          div = document.getElementById("rdiv")
          div.innerHTML = inner
        }}
        
        var mydict = {{}}
        {self._make_js_pop()}
        </script>
        """
        return raw
        
        

In [388]:
text = df.text.iloc[2]
d = DocVisualizer(text, nlp)
from IPython.display import HTML, display
display(HTML(d._build_doc_view()))