In [1]:
#pip install spacy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Download and install the small English spaCy model (en_core_web_sm).
# The leading `!` runs the command in a shell; once it succeeds the model can be
# loaded with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     -------------------------------------- 12.8/12.8 MB 601.9 kB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')




## **Methods:**

### **1. Modifying vocab.is_stop attribute:**

#### Importing necessary libraries

In [2]:
# The en_core_web_sm model must already be installed; if it is not, run:
#   !python -m spacy download en_core_web_sm
import spacy
from spacy.lang.en import STOP_WORDS  # NOTE(review): not referenced in the cells shown here

# Load the small English pipeline; the cells below call it as `nlp`
nlp = spacy.load("en_core_web_sm")

#### Adding custom words to the list of stopwords

In [3]:
# Extra words to treat as stop words on top of spaCy's built-in list
custom_stopwords = {'NIL', 'JUNK'}

# Look up each word's entry in the pipeline vocabulary and switch on its stop-word flag.
# NOTE(review): the flag is set for the spelling given here; other casings such as
# 'nil' are presumably separate vocabulary entries - confirm if they should match too.
for word in custom_stopwords:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

#### Python code for removing custom stopwords after tokenisation

In [4]:
def remove_stopwords(text):
    """Tokenise ``text`` with the loaded pipeline and drop the stop words.

    Parameters
    ----------
    text : str
        Raw text to run through ``nlp``.

    Returns
    -------
    list of str
        The text of every token whose ``is_stop`` flag is not set, in
        document order.
    """
    kept = []
    for tok in nlp(text):
        # Skip anything the pipeline has flagged as a stop word
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept

In [5]:
# Sample sentence containing ordinary English stop words plus the custom words NIL and JUNK
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
filtered_tokens = remove_stopwords(text)

# The extra "\n" argument leaves a blank line between the two printed lines
print("Original text:", text, "\n")
print("Text after stop word removal:", filtered_tokens)

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: ['example', 'sentence', 'demonstrating', 'stop', 'word', 'removal', '.', 'custom', 'stopwords', '.']


### **2. Modifying Defaults.stop_words:**

#### Importing necessary models and adding custom stopwords

In [6]:
# Start again from a freshly loaded English pipeline
nlp = spacy.load("en_core_web_sm")

# Lower-case words to treat as stop words in addition to spaCy's defaults
custom_stopwords = ['nil', 'junk']

# Extend the default stop-word set in place with all custom words at once.
# Because the set is mutated rather than copied, every later read of
# nlp.Defaults.stop_words sees these words as well.
nlp.Defaults.stop_words.update(custom_stopwords)

#### Python code for removing custom stopwords after tokenisation

In [7]:
def remove_stopwords(text):
    """Tokenise ``text`` and drop tokens found in the default stop-word set.

    Parameters
    ----------
    text : str
        Raw text to run through ``nlp``.

    Returns
    -------
    list of str
        The text of every token whose lower-cased form is not in
        ``nlp.Defaults.stop_words``, in document order.
    """
    def _is_kept(tok):
        # Compare case-insensitively by lower-casing the token text first
        return tok.text.lower() not in nlp.Defaults.stop_words

    return [tok.text for tok in nlp(text) if _is_kept(tok)]

In [8]:
# Same sample sentence; the custom words appear in upper case here
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
filtered_text = remove_stopwords(text)  # a list of token strings, despite the name
print("Original text:", text, '\n')  # the extra '\n' leaves a blank line before the result
print("Text after stop word removal:", filtered_text)

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: ['example', 'sentence', 'demonstrating', 'stop', 'word', 'removal', '.', 'custom', 'stopwords', '.']


### **3. Extending a copy of Language.Defaults.stop_words:**

#### Adding custom stopwords

In [9]:
# Custom stop words, written as they appear in the text (upper case here)
new_stopwords = {'NIL', 'JUNK'}

# Copy spaCy's default stop words so the pipeline's own set is left untouched
default_stopwords = set(nlp.Defaults.stop_words)

# Combine the defaults with the custom words. The filter in this section compares
# lower-cased token text against this set, so the custom words are lower-cased
# before being added; left in upper case, 'NIL' and 'JUNK' could never match and
# the result would depend on whatever was added to the defaults earlier.
updated_stopwords = default_stopwords.union(word.lower() for word in new_stopwords)

#### Python code for removing custom stopwords after tokenisation

In [10]:
def remove_stopwords(text):
    """Tokenise ``text`` and drop tokens found in ``updated_stopwords``.

    Parameters
    ----------
    text : str
        Raw text to run through ``nlp``.

    Returns
    -------
    list of str
        The text of every token whose lower-cased form is not in
        ``updated_stopwords``, in document order.
    """
    # Pair each token's original text with its lower-cased form for the lookup
    lowered = ((tok.text, tok.text.lower()) for tok in nlp(text))
    return [original for original, lower in lowered if lower not in updated_stopwords]

In [11]:
# Same sample sentence; the custom words appear in upper case here
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
filtered_text = remove_stopwords(text)  # a list of token strings, despite the name
print("Original text:", text, '\n')  # the extra '\n' leaves a blank line before the result
# Join the surviving tokens with spaces so the result prints as one string rather than a list
print("Text after stop word removal:", ' '.join(filtered_text))

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: example sentence demonstrating stop word removal . custom stopwords .
