In [4]:
# import the movie review data and split into train/test sets
import sklearn
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.datasets import load_files
path = './movie_reviews/'
# we will consider only the most 1000 common words
max_tokens = 1000
# load files -- there are 2000 files
movie_reviews = load_files(path)
# the names of the categories (the labels) are automatically generated from the names of the folders in path
# 'pos' and 'neg'
labels = movie_reviews.target_names

# Split data into training and test sets
# since this is just an example, we will omit the dev test set
# 'movie_reviews.data' is the movie reviews
# 'movie_reviews.target' is the categories assigned to each review
# 'test_size = .20' is the proportion of the data that should be reserved for testing
# 'random_state = 42' is an integer that controls the randomization of the
# data so that the results are reproducible
from sklearn.model_selection import train_test_split

movies_train, movies_test, sentiment_train, sentiment_test = train_test_split(movie_reviews.data,
movie_reviews.target,test_size = 0.20,random_state = 42)

In [6]:
# initialize TfidfVectorizer to create the tfIdf representation of the corpus
# the parameters are: min_df -- the percentage of documents that the word has
# to occur in to be considered, the tokenizer to use, and the maximum
# number of words to consider (max_features)
vectorizer = TfidfVectorizer(min_df = .1,tokenizer = nltk.word_tokenize,max_features = max_tokens)

# fit and transform the text into tfidf format, using training text
# here is where we build the tfidf representation of the training data
movies_train_tfidf = vectorizer.fit_transform(movies_train)

In [7]:
# train the naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
# Initialize the classifier and train it
classifier = MultinomialNB()
classifier.fit(movies_train_tfidf, sentiment_train)

In [14]:
# find accuracy based on test set
movies_test_tfidf = vectorizer.fit_transform(movies_test)
# for each document in the test data, use the classifier to predict whether its sentiment is positive or negative
sentiment_pred = classifier.predict(movies_test_tfidf)
sklearn.metrics.accuracy_score(sentiment_test,sentiment_pred)



0.64

In [15]:
# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(sentiment_test,sentiment_pred,normalize=None)
print(conf_matrix)

[[132  58]
 [ 86 124]]


In [20]:
# SVM classification
import numpy as np
from sklearn.datasets import load_files
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# the directory root will be wherever the movie review data is located
# modify the path as appropriate for your system
directory_root = "./movie_reviews/"
movie_reviews = load_files(directory_root,
encoding='utf-8',decode_error="replace")
# count the number of reviews in each category
labels, counts = np.unique(movie_reviews.target,
return_counts=True)
# convert review_data.target_names to np array
labels_str = np.array(movie_reviews.target_names)[labels]
print(dict(zip(labels_str, counts)))

from sklearn.model_selection import train_test_split
movies_train, movies_test, sentiment_train, sentiment_test = train_test_split(movie_reviews.data,
movie_reviews.target, test_size = 0.20, random_state = 42)

{'neg': 1000, 'pos': 1000}


In [23]:
# We will work with a TF_IDF representation, as before
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Use the Pipeline function to construct a sequence of two processes
# to run, one after the other -- the vectorizer and the classifier
svc_tfidf = Pipeline([
("tfidf_vectorizer", TfidfVectorizer(
stop_words = "english", max_features=1000)),
("linear svc", SVC(kernel="linear"))
])

model = svc_tfidf
model.fit(movies_train, sentiment_train)
sentiment_pred = model.predict(movies_test)
accuracy_result = accuracy_score( sentiment_test,
sentiment_pred)
print(accuracy_result)
0.8125
# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(sentiment_test,
sentiment_pred,normalize=None)
print(conf_matrix)

0.8125
[[153  37]
 [ 38 172]]


In [24]:
# for multiclass, use the "one vs rest" approach
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(SVC())

In [27]:
# take a look at the data
from IPython.display import display, Markdown
with open("examples/restaurant_search.md", "r") as f:
    display(Markdown(f.read()))

## restaurant_search

- I want to get some [lunch](meal)
- I am searching for a [dinner](meal) spot
- i'm looking for a place in the [north](location) of town
- show me some [good](quality) [chinese](cuisine) restaurants in the [north](location)
- how about a [mexican](cuisine) restaurant [downtown](location)
- Are there any [indian](cuisine) spots near here
- Italian restaurants on the [west side] (location)
- looking for [German](cuisine) places in the [south](location)
- what [Greek](cuisine) places are near [12345](location)
- help me fine a [casual](atmosphere)[asian f_usion](cuisine) place
- I am looking a [french](cuisine) restaurant [nearby](location)
- I am looking for a [nice] (quality) [mexican](cuisine) or [thai](cuisine) place that's [not too expensive](price)
- [cozy](atmosphere) [barbecue](cuisine) restaurant

In [30]:
# prepare to train CRF
import sklearn_crfsuite
from spacy_crfsuite import read_file
train_data = read_file("examples/restaurant_search.md")

import spacy
from spacy_crfsuite.tokenizer import SpacyTokenizer
from spacy_crfsuite.train import gold_example_to_crf_tokens

nlp = spacy.load("en_core_web_sm", disable=["ner"])
tokenizer = SpacyTokenizer(nlp)
train_dataset = [gold_example_to_crf_tokens(ex, tokenizer = tokenizer)
for ex in train_data
]
train_dataset[0]

[CRFToken(text='I', tag='PRP', entity='O', shape='X', pattern={}, dense_features=[]),
 CRFToken(text='want', tag='VBP', entity='O', shape='xxxx', pattern={}, dense_features=[]),
 CRFToken(text='to', tag='TO', entity='O', shape='xx', pattern={}, dense_features=[]),
 CRFToken(text='get', tag='VB', entity='O', shape='xxx', pattern={}, dense_features=[]),
 CRFToken(text='some', tag='DT', entity='O', shape='xxxx', pattern={}, dense_features=[]),
 CRFToken(text='lunch', tag='NN', entity='U-meal', shape='xxxx', pattern={}, dense_features=[])]

In [31]:
import srsly

component_config = srsly.read_json("examples/default-config.json")
component_config

{'features': [['low', 'title', 'upper'],
  ['low',
   'bias',
   'prefix5',
   'prefix2',
   'suffix5',
   'suffix3',
   'suffix2',
   'upper',
   'title',
   'digit'],
  ['low', 'title', 'upper']],
 'c1': 0.003,
 'c2': 0.03}

In [50]:
import sklearn
print(sklearn.__version__)

1.2.1


In [49]:
from spacy_crfsuite import CRFExtractor
crf_extractor = CRFExtractor(
component_config = component_config)

rs =  crf_extractor.fine_tune(train_dataset, cv = 5,n_iter=50, random_state=42)

print("best_params:", rs.best_params_, ", score:",rs.best_score_)
crf_extractor.train(train_dataset)
classification_report = crf_extractor.eval(train_dataset)
print(classification_report[1])

<class 'AttributeError'>: 'CRF' object has no attribute 'keep_tempfiles'

In [None]:
example = {"text": "show some good chinese restaurants near me"}
tokenizer.tokenize(example, attribute="text")
crf_extractor.process(example)