#### 0. Prepare Pkgs

In [1]:
# it seems to have some dependency issue with Torch using CPU only
# so I uninstalled Torch on pip
# then it is back by Tensorflow
# Better idea to create a new environment
import spacy
from spacy.matcher import PhraseMatcher
from spacy.util import minibatch
from spacy.training.example import Example
import random

import pandas as pd
import numpy as np
from collections import defaultdict

from IPython.display import Markdown, Latex, display

Using TensorFlow backend.


In [None]:
# spacy relies on language-specific models
# nlp = spacy.load('en_core_web_sm')

#### 1. Basic Text Processing with Spacy

In [2]:
# reviews on menu items
# prepare data
data = pd.read_json('Documents/restaurant.json')
data.head()
menu = ["Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo", "Tiramisu", "Cannoli",
        "Chicken Salad", "Chicken Spinach Salad", "Meatball", "Pizza", "Pizzas", "Spaghetti",
        "Bruchetta", "Eggplant", "Italian Beef", "Purista", "Pasta", "Calzones",  "Calzone",
        "Italian Sausage", "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
        "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti", "Portobello", "Reuben",
        "Mozzarella Caprese",  "Corned Beef", "Garlic Bread", "Pastrami", "Roast Beef",
        "Tuna Salad", "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
        "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese", "Macaroni",
        "Prosciutto", "Salami"]

# phrase matcher
nlp = spacy.blank('en')  # use empty model?
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
menu_token_list = [nlp(item) for item in menu]
matcher.add('MENU', menu_token_list)

# prepare a defaultdict to store ratings of menu items
item_ratings = defaultdict(list)

In [3]:
# tokenize - matcher - append
for idx, review in data.iterrows():
    doc = nlp(review.text)
    matches = matcher(doc)
    # every match has three comps: [0] = id, [1] = start, [2] = end
    found_items = set([doc[match[1]:match[2]].text.lower()
                       for match in matches])  # lower to lemmatize
    for f in found_items:
        item_ratings[f].append(review.stars)

In [4]:
# mean ratings
mean_ratings = {item: sum(val)/len(val) for item, val in item_ratings.items()}

# but count matters
counts = {item: len(val) for item, val in item_ratings.items()}
item_counts = sorted(counts, key=counts.get, reverse=True)
for item in item_counts:
    print(f"{item:>25}{counts[item]:>5}")  # indent {item:>25}

                    pizza  265
                    pasta  206
                 meatball  128
              cheesesteak   97
             cheese steak   76
                  cannoli   72
                  calzone   72
                 eggplant   69
                  purista   63
                  lasagna   59
          italian sausage   53
               prosciutto   50
             chicken parm   50
             garlic bread   39
                  gnocchi   37
                spaghetti   36
                 calzones   35
                   pizzas   32
                   salami   28
            chicken pesto   27
             italian beef   25
                 tiramisu   21
            italian combo   21
                     ziti   21
         chicken parmesan   19
       chicken parmigiana   17
               portobello   14
           mac and cheese   11
           chicken cutlet   10
         steak and cheese    9
                 pastrami    9
               roast beef    7
       f

In [5]:
# combine ratings and counts
sorted_ratings = sorted(mean_ratings, key=mean_ratings.get)
display(Markdown('**Worst rated menu items**\n'))
for item in sorted_ratings[:10]:
    print(
        f"{item:20} Ave rating:{mean_ratings[item]:.2f} \tcount:{counts[item]}")

display(Markdown('\n\n**Best rated menu items**\n'))
for item in sorted_ratings[:-11:-1]:
    print(
        f'{item:20} Ave ratings:{mean_ratings[item]:.2f} \tcount:{counts[item]}')

**Worst rated menu items**


chicken cutlet       Ave rating:3.40 	count:10
turkey sandwich      Ave rating:3.80 	count:5
spaghetti            Ave rating:3.89 	count:36
italian beef         Ave rating:3.92 	count:25
tuna salad           Ave rating:4.00 	count:5
macaroni             Ave rating:4.00 	count:5
italian combo        Ave rating:4.05 	count:21
garlic bread         Ave rating:4.13 	count:39
roast beef           Ave rating:4.14 	count:7
eggplant             Ave rating:4.16 	count:69




**Best rated menu items**


corned beef          Ave ratings:5.00 	count:2
turkey breast        Ave ratings:5.00 	count:1
fettuccini alfredo   Ave ratings:5.00 	count:6
artichoke salad      Ave ratings:5.00 	count:5
steak and cheese     Ave ratings:4.89 	count:9
reuben               Ave ratings:4.75 	count:4
prosciutto           Ave ratings:4.68 	count:50
purista              Ave ratings:4.67 	count:63
chicken salad        Ave ratings:4.60 	count:5
chicken pesto        Ave ratings:4.56 	count:27


#### 2. Text Classification

In [6]:
# Spam detection
spam = pd.read_csv('Documents/spam.csv')
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# create a pipe
nlp = spacy.blank('en')
textcat = nlp.add_pipe('textcat')

# add labels to text classifier
textcat.add_label('ham')
textcat.add_label('spam')

# convert labels in the data to the form TextCategorizer requires
# (1/2) aka. dictionary of boolean values for each class
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                         'spam': label == 'spam'}}
                for label in spam['label']]

# (2/2) combine text and labels into a single list
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [8]:
# train the model
# (1/3) create an optimizer to update the model
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# (2/3) create minibatches since more efficient 
batches = minibatch(train_data, size = 8)

# (3/3) split batch into text & labels, update model
for batch in batches:
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd = optimizer)

In [4]:
# more epoches
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()  # only use when training from scratch

losses = {}
for epoch in range(5):
    random.shuffle(train_data)
    batches = minibatch(train_data, size=8)
    for batch in batches:
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 293.82070328966967}
{'textcat': 406.4270003403468}
{'textcat': 488.70290066040826}
{'textcat': 544.8554215435448}
{'textcat': 575.8179430301309}


In [9]:
# Make predictions
# tokenize texts
texts = ["Are you ready for the tea party??? It's gonna be wild",
        "URGENT reply to this message for GUARANTEED FREE TEA"]
docs = [nlp.tokenizer(text) for text in texts]

# use textcat to get scores for each doc
textcat = nlp.get_pipe('textcat') # trained 
scores = textcat.predict(docs)

# use highest prob score to get pred. label
predicted_labels = scores.argmax(axis = 1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']


#### 3. Word Vectors

In [10]:
nlp = spacy.load('en_core_web_lg')

###### <font color ='7f7f7f'>classification</font>

In [11]:
# make doc vecs
spam = pd.read_csv('Documents/spam.csv')

with nlp.disable_pipes():  # for efficiency
    doc_vectors = np.array([nlp(text).vector for text in spam.text])

In [12]:
# classification model with SVM
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors, spam.label, test_size=.1, random_state=1)

# dual = False to save time
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train, y_train)
print(f"Accuracy: {svc.score(X_test,y_test)*100:.3f}%")

Accuracy: 97.849%


###### <font color ='7f7f7f'>doc similarity</font>

In [13]:
# measure doc similarity with cosine similarity
def cosine_similarity(a, b):
    return a.dot(b)/np.sqrt(a.dot(a)*b.dot(b))


a = nlp('REPLY NOW FOR FREE TEA').vector
b = nlp('According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water.').vector
cosine_similarity(a, b)

0.7030031

#### Appendices

###### <font color = '7f7f7f'>text classification with train/test split & functions</font>

In [14]:
def load_data(csv_file, split=.9):
    data = pd.read_csv(csv_file)
    # shuffle data
    train_data = data.sample(frac=1, random_state=7)
    texts = train_data.text.values
    labels = [{'ham': label == 'ham', 'spam': label == 'spam'}
              for label in train_data['label']]
    split = int(len(train_data)*split)

    train_labels = [{'cats': labels} for labels in labels[:split]]
    test_labels = [{'cats': labels} for labels in labels[split:]]
    return texts[:split], train_labels, texts[split:], test_labels


train_texts, train_labels, test_texts, test_labels = load_data(
    'Documents/spam.csv')

In [22]:
print('Texts from training data\n------')
print(train_texts[:2])
print('\nLabels from training data\n------')
print(train_labels[:2])

Texts from training data
------
['You will be in the place of that man'
 '\\Si.como no?!listened2the plaid album-quite gd&the new air1 which is hilarious-also boughtåÓbraindanceåÓa comp.ofstuff on aphexåÕs ;abel']

Labels from training data
------
[{'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}]


In [16]:
# create the model
nlp = spacy.blank('en')
textcat = nlp.add_pipe('textcat')

textcat.add_label('ham')
textcat.add_label('spam')

1

In [23]:
# train function
def train(model, train_data, optimizer, batch_size=8):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    for batch in minibatch(train_data, size=batch_size):
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            model.update([example], sgd=optimizer, losses=losses)
    return losses


spacy.util.fix_random_seed(1)
random.seed(1)
optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

253.89666720109636


In [24]:
# make predictions
def predict(nlp, texts):
    docs = [nlp.tokenizer(text) for text in texts]
    textcat = nlp.get_pipe('textcat')
    scores = textcat.predict(docs)
    predicted_class = scores.argmax(axis=1)
    return predicted_class

In [25]:
# evaluate the model by accuracy
def evaluate(model, texts, labels):
    predicted_class = predict(model, texts)
    true_class = [int(label['cats']['spam']) for label in labels]
    correct_predictions = predicted_class == true_class
    return correct_predictions.mean()


accuracy = evaluate(nlp, test_texts, test_labels)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9839


###### <font color = 'maroon'>small project about Su & Lu</font>

In [1]:
# pkgs for this project per se
import pandas as pd
import numpy as np
import spacy
from spacy.util import minibatch
from spacy.training.example import Example
import random

# below are for word2vec
from opencc import OpenCC
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

cc = OpenCC('t2s')

Using TensorFlow backend.


In [2]:
# prepare data https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master/json/
# Su @ 14: 44000, 45000, 46000
# Lu @ 39-41: 127000 - 136000
def load_song_shi(nums):
    dfs = pd.concat([pd.read_json(
        'https://raw.githubusercontent.com/chinese-poetry/chinese-poetry/master/json/poet.song.{}.json'.format(num)) for num in nums])
    return dfs


Su = load_song_shi([44000, 45000, 46000])
Lu = load_song_shi(list(range(127000, 137000, 1000)))

In [3]:
spacy.util.fix_random_seed(1)
random.seed(1)

# to same length
Su = Su[Su.author == '蘇軾']
Lu = Lu[Lu.author == '陸游'].sample(n=len(Su))

# turn list object to string
Su['paragraphs'] = Su['paragraphs'].apply(
    lambda x: ''.join(line for line in x))

Lu['paragraphs'] = Lu['paragraphs'].apply(
    lambda x: ''.join(line for line in x))
SuLu = pd.concat([Su, Lu]).sample(frac=1, random_state=7).reset_index().drop(columns = ['index'])
authors = SuLu.author.unique()

######    <font color = 'teal'>simplify to use language model</font>

In [6]:
SuLu['paragraphs'] = SuLu['paragraphs'].apply(lambda x: cc.convert(x))

model = spacy.load('zh_core_web_lg')
with model.disable_pipes():
    doc_vectors = np.array([model(text).vector for text in SuLu.paragraphs])

X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors, SuLu.author, test_size=.1, random_state=1)

# dual = False to save time
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train, y_train)
print(f"Accuracy: {svc.score(X_test,y_test)*100:.3f}%")  # OK

Accuracy: 72.566%


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors, SuLu.author, test_size=.1, random_state=1)

# dual = False to save time
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train, y_train)
print(f"Accuracy: {xgb.score(X_test,y_test)*100:.3f}%")  # OK



Accuracy: 72.035%


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors, SuLu.author, test_size=.1, random_state=1)

rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train, y_train)
print(f"Accuracy: {rfc.score(X_test,y_test)*100:.3f}%")  # Not good

Accuracy: 67.611%


In [7]:
# doc similarity
def cosine_similarity(a, b):
    return a.dot(b)/np.sqrt(a.dot(a)*b.dot(b))
# center 
vec_mean = doc_vectors.mean(axis = 0)
centered = doc_vectors - vec_mean

sel = random.randint(0,len(doc_vectors))
line = doc_vectors[sel]
val = {}
range_ = list(range(sel)) + list(range(sel+1, len(doc_vectors)))
for i in range_:
    val[i] = cosine_similarity(doc_vectors[i], line)

print(SuLu.iloc[sel][['author','paragraphs']],SuLu.iloc[sorted(val, key = val.get, reverse = True)[0]][['author','paragraphs']])

author                                                      陸游
paragraphs    前年蒙趣召，渡江当六月。顾惭衰病躯，触热朝行阙。君恩虽屡下，恐惧乞骸骨。飘然返柴荆，所愧已黔突。
Name: 1100, dtype: object author                                                       蘇軾
paragraphs    任公镇西南，尝赠绕朝策。当时若尽用，善阵无赫赫。凄凉十年后，邪正久已白。却留封德彝，天意眇难...
Name: 3918, dtype: object


In [19]:
# which poem is most "most similar" to others in the df
sims = defaultdict(int)
for idx in range(len(doc_vectors)):
    range_ = list(range(idx)) + list(range(idx+1, len(doc_vectors)))
    sim = np.array([cosine_similarity(doc_vectors[idx], c) for c in doc_vectors[range_]])
    most_similar = sim.argmax()
    sims[most_similar] = sims[most_similar] + 1

KeyboardInterrupt: 

In [20]:
# most "most similar"
SuLu.iloc[sorted(sims, key = sims.get, reverse = True)[0]] # 5596

AttributeError: 'list' object has no attribute 'get'

###### <font color = '7f7f7f'>end of simplification</font>

In [None]:
# create train/test
train_texts = SuLu.paragraphs.values
authors_label = [{authors[0]: author == authors[0], authors[1]: author == authors[1]}
                 for author in SuLu['author']]
split = .9
split = int(len(SuLu)*split)

train_authors = [{'cats': author} for author in authors_label[:split]]
test_authors = [{'cats': author} for author in authors_label[split:]]
train_texts = SuLu.paragraphs[:split]
test_texts = SuLu.paragraphs[split:]

In [12]:
# create model
nlp = spacy.blank('zh')
textcat = nlp.add_pipe('textcat')
textcat.add_label(authors[0])  # Su
textcat.add_label(authors[1])  # Lu

# train
optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_authors))
for epoch in range(10):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    for batches in minibatch(train_data, size=8):
        for text, labels in batches:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            nlp.update([example], sgd=optimizer, losses=losses)
    print(losses['textcat'])

2174.827909047634
1178.7452880896647
811.5390418922483
595.4171474424583
390.694593819601
336.3565853438324
296.30636926735184
191.57716236540728
184.13119991794198
171.50943364577017


In [13]:
# predict
def predict(nlp, texts):
    docs = [nlp.tokenizer(text) for text in texts]
    textcat = nlp.get_pipe('textcat')
    scores = textcat.predict(docs)
    predicted_class = scores.argmax(axis=1)
    return predicted_class

def evaluate(model, texts, labels):
    predicted_class = predict(model, texts)
    true_class = [int(label == authors[1]) for label in labels]
    correct_predictions = predicted_class == true_class
    return correct_predictions.mean()

accuracy = evaluate(nlp, test_texts, test_authors)
print(f'Accuracy: {accuracy:.4f}') # rather poor accuracy

Accuracy: 0.4301
