## Spacy

## Retrieve Data

In [1]:
from datasets import load_dataset
dataset = load_dataset("imdb")

Found cached dataset imdb (/home/nicolas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
from datasets import get_dataset_split_names

get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Stemming and Lemmatization

### Lemmatization


In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
# Setup spacy
import spacy
nlp = spacy.load('en_core_web_sm')

# Take a 20 characters sentence example from the test dataset
test_list = dataset['train']['text'][0].split()[:20]
test_sentence = ' '.join(test_list)

# Lemmatize the sentence
doc = nlp(test_sentence)

# Get all token
tokens = [token.text for token in doc]

print('Original Sentence: {test_sentence}')
print()
for token in doc:
    if token.text != token.lemma_:
        print(f'Original : {token.text}, New: {token.lemma_}')

Original Sentence: {test_sentence}

Original : rented, New: rent
Original : AM, New: be
Original : CURIOUS, New: curious
Original : surrounded, New: surround
Original : was, New: be


In [8]:
from typing import List

In [6]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def lemma_preprocessor(x_list: List[str]) -> List[str]:
    """
    Preprocessing function to lowercase and remove punctuation
    of a list of string and lemmatize each string.
    
    Args:
        x_list: List of strings
    
    Returns:
        List of preprocessed strings.
    """
    no_punc_lower = [x.lower().translate(str.maketrans("", "", punctuation)) for x in x_list]
    spacy_nlp = spacy.load('en_core_web_sm')
    res = []
    for sentence in no_punc_lower:
        doc = spacy_nlp(sentence)
        s = []
        for word in doc:
            s.append(word.lemma_)
        s = ' '.join(s)
        res.append(s)
    return res

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Create pipeline
pipeline = Pipeline([
    ("preprocess", FunctionTransformer(lemma_preprocessor)),
    ("vectorizer", CountVectorizer(lowercase=True)),
    ("nb", MultinomialNB()),
])

# Fit the pipeline with train dataset
pipeline.fit(np.array(dataset["train"]["text"][:100]), np.array(dataset["train"]["label"][:100]))

### Stem

In [33]:
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

# Initialize Python porter stemmer
ps = PorterStemmer()

# Example inflections to reduce
example_words = ["program","programming","programer","programs","programmed"]

# Perform stemming
print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in example_words:
    print ("{0:20}{1:20}".format(word, ps.stem(word)))

--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


[nltk_data] Downloading package punkt to /home/nicolas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
def stem_preprocessor(x_list: List[str]) -> List[str]:
    """
    Preprocessing function to lowercase and remove punctuation
    of a list of string and lemmatize each string.
    
    Args:
        x_list: List of strings
    
    Returns:
        List of preprocessed strings.
    """
    spacy_nlp = spacy.load('en_core_web_sm')
    res = []
    ps = PorterStemmer()
    for sentence in x_list:
        doc = spacy_nlp(sentence)
        s = []
        for word in doc:
            s.append(ps.stem(str(word)))
        s = ' '.join(s)
        res.append(s)
    return res

'toto'

In [48]:
example_words = ["program","programming","programer","programs","programmed"]
stem_preprocessor(example_words)

['program', 'program', 'program', 'program', 'program']