## Machine Learning with Text Using spaCy

In [1]:
# ENGLISH_STOP_WORDS is imported from `sklearn.feature_extraction.text`: the
# `sklearn.feature_extraction.stop_words` module was deprecated in scikit-learn
# 0.22 and removed in 0.24, so importing from it fails on current releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# ASCII punctuation characters, consulted by spacy_tokenizer to discard
# punctuation tokens. NOTE: this is a plain str, so the `tok not in punctuations`
# check in spacy_tokenizer is a substring test, not a per-character set lookup.
import string 
punctuations = string.punctuation

In [4]:
# Load the "en_core_web_sm" spaCy pipeline once at module level; spacy_tokenizer
# calls `parser(sentence)` for every document it tokenizes. The model package
# must be installed separately (e.g. `python -m spacy download en_core_web_sm`).
import spacy
parser = spacy.load("en_core_web_sm")

In [5]:
# Custom pipeline step that normalizes raw text before it is vectorized
class predictors(TransformerMixin):
    """Stateless transformer that applies `clean_text` to every document.

    Nothing is learned from the data: `fit` is a no-op and `get_params`
    reports no parameters.
    """

    def fit(self, X, y = None, **fit_params):
        """Do nothing and return this instance so calls can be chained."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding `clean_text(text)` for each item of `X`, in order."""
        cleaned = []
        for document in X:
            cleaned.append(clean_text(document))
        return cleaned

    def get_params(self, deep=True):
        """Return an empty dict: this step exposes no parameters."""
        return dict()

In [6]:
# Text normalization helper applied to every raw document
def clean_text(text):
    """Return `text` with surrounding whitespace removed and letters lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [8]:
# spaCy-backed tokenizer: parses a sentence and returns its normalized tokens

def spacy_tokenizer(sentence):
    """Tokenize `sentence` with the module-level spaCy `parser`.

    Each token is replaced by its lower-cased, whitespace-stripped lemma; a token
    whose lemma is the "-PRON-" placeholder keeps its lower-cased surface form
    instead. Entries found in `stopwords` or in `punctuations` are then dropped.

    Returns a list of str.
    """
    normalized = []
    for tok in parser(sentence):
        if tok.lemma_ == "-PRON-":
            # Placeholder lemma; presumably only produced by spaCy 2.x for
            # pronouns -- TODO confirm against the installed spaCy version.
            normalized.append(tok.lower_)
        else:
            normalized.append(tok.lemma_.lower().strip())

    # `punctuations` is a str, so `in` is a substring test: besides single
    # punctuation characters it also rejects empty strings.
    return [word for word in normalized if not (word in stopwords or word in punctuations)]

In [9]:
# Bag-of-words features: unigram counts over the tokens produced by the
# custom spaCy tokenizer.
vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
)

# Linear support-vector classifier trained on those counts.
classifier = LinearSVC()

In [10]:
# End-to-end pipeline: clean the raw text, turn it into token counts
# (tokenizing with spaCy), then classify.
pipe = Pipeline(
    [
        ("cleaner", predictors()),
        ('vectorizer', vectorizer),
        ('classifier', classifier),
    ]
)

In [11]:
# Sample training data: a list of (sentence, label) pairs, positives first.
train = [
    (sentence, label)
    for label, sentences in (
        ('pos', (
            'I love this sandwich.',
            'this is an amazing place!',
            'I feel very good about these beers.',
            'this is my best work.',
            "what an awesome view",
        )),
        ('neg', (
            "I do not like ehis restaurant",
            'I am tired of this stuff',
            "I an't deal with this",
            'he is my sworn enemy!',
            'my boss is horrible.',
        )),
    )
    for sentence in sentences
]

In [12]:
# Held-out evaluation data as (sentence, label) pairs. Written as a
# sentence -> label mapping and flattened with .items(), which keeps the
# insertion order shown here.
test = list({
    'the beer was good': 'pos',
    'I do not emjoy my job': 'neg',
    "I ain't feeling dandy today.": 'neg',
    "I feel amazing!": 'pos',
    'Gary is a good friend of mine': 'pos',
    "I can't believe I'm doing this": 'neg',
}.items())

In [17]:
# Fit the pipeline on the training pairs, then report per-sample predictions
# and overall accuracy on the held-out pairs.

train_texts = [pair[0] for pair in train]
train_labels = [pair[1] for pair in train]
test_texts = [pair[0] for pair in test]
test_labels = [pair[1] for pair in test]

pipe.fit(train_texts, train_labels)
pred_data = pipe.predict(test_texts)

# Each line shows the (sentence, true label) pair followed by the predicted label.
for sample, pred in zip(test, pred_data):
    print(sample, pred)

print("Accuracy:", accuracy_score(test_labels, pred_data))

('the beer was good', 'pos') pos
('I do not emjoy my job', 'neg') neg
("I ain't feeling dandy today.", 'neg') pos
('I feel amazing!', 'pos') pos
('Gary is a good friend of mine', 'pos') pos
("I can't believe I'm doing this", 'neg') neg
Accuracy: 0.8333333333333334
