In [2]:
#Import pandas as pd
import pandas as pd

#Import spacy
import spacy

#Load the English model
nlp = spacy.load("en_core_web_sm") #Here, spacy.load, loads the English model and assigns it to variable nlp

#When you execute nlp = spacy.load('en'), spaCy downloads and loads the pre-trained English language model
#into memory and assigns it to the variable nlp.
#This pre-trained model contains information about word vectors, part-of-speech tags, syntactic dependencies, and other linguistic features necessary for various NLP tasks.

In [4]:
# Example text
text = "A customer in New York City wants to give a review."

# Process the text using spaCy
doc = nlp(text) #Recall that the Doc object represents the our sentence.


# Tokenization
print("Tokenization:")
for token in doc:
    # Iterate through each token in the processed text and print the token text
    print(token.text)

Tokenization:
A
customer
in
New
York
City
wants
to
give
a
review
.


In [5]:
# Filter out stop words
print("Filtered Tokens (without stop words):")
filtered_tokens = [token.text for token in doc if not token.is_stop]
# Create a list of tokens excluding stop words using list comprehension
print(filtered_tokens)

Filtered Tokens (without stop words):
['customer', 'New', 'York', 'City', 'wants', 'review', '.']


In [6]:
# Part-of-Speech Tagging (POS)
print("Part-of-Speech Tagging (POS):")
for token in doc:
    # Iterate through each token and print the token text and its POS tag
    print(token.text, token.pos_)

Part-of-Speech Tagging (POS):
A DET
customer NOUN
in ADP
New PROPN
York PROPN
City PROPN
wants VERB
to PART
give VERB
a DET
review NOUN
. PUNCT


In [7]:
print("Named Entity Recognition (NER):")
for ent in doc.ents:
    # Iterate through each named entity in the processed text and print its text and label
    print(ent.text, ent.label_)

Named Entity Recognition (NER):
New York City GPE


In [8]:
# Lemmatization
print("Lemmatization:")
lemmatized_tokens = [token.lemma_ for token in doc if not token.is_punct]
# Create a list of lemmatized tokens excluding punctuation using list comprehension
print(lemmatized_tokens)

Lemmatization:
['a', 'customer', 'in', 'New', 'York', 'City', 'want', 'to', 'give', 'a', 'review']


In [10]:

file_path = 'sentiment_examples.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    sentiment_texts = file.readlines()

In [12]:
#Initialize empty lists to store the results
token_lists = []  # List to store tokens for each sentiment example
filtered_token_lists = []  # List to store filtered tokens (after stop word removal) for each sentiment example
pos_tag_lists = []  # List to store POS tags for each sentiment example
ner_lists = []  # List to store named entities for each sentiment example

# Process each sentiment example using spaCy and store the results
for sentiment_text in sentiment_texts:
    doc = nlp(sentiment_text.strip())  # Strip any leading/trailing whitespace
    #The .strip() method is used to clean up the sentiment_text
    #before passing it to the spaCy nlp pipeline for processing.
    #This ensures that there are no unwanted spaces or newline characters
    #that could affect the processing of the text by spaCy.

In [16]:
tokens = [token.text for token in doc]  # Extract tokens from the processed text
token_lists.append(tokens)  # Append tokens list to token_lists
print(token_lists)

[['"', 'The', 'delivery', 'was', 'prompt', ',', 'and', 'the', 'packaging', 'was', 'secure', '.', 'Everything', 'arrived', 'in', 'perfect', 'condition', '.', '"'], ['"', 'The', 'delivery', 'was', 'prompt', ',', 'and', 'the', 'packaging', 'was', 'secure', '.', 'Everything', 'arrived', 'in', 'perfect', 'condition', '.', '"'], ['"', 'The', 'delivery', 'was', 'prompt', ',', 'and', 'the', 'packaging', 'was', 'secure', '.', 'Everything', 'arrived', 'in', 'perfect', 'condition', '.', '"']]


In [17]:
# Stop Word Removal filter
filtered_tokens = [token.text for token in doc if not token.is_stop]  # Filter out stop words
filtered_token_lists.append(filtered_tokens)  # Append filtered tokens list to filtered_token_lists
print(filtered_token_lists)

[['"', 'delivery', 'prompt', ',', 'packaging', 'secure', '.', 'arrived', 'perfect', 'condition', '.', '"'], ['"', 'delivery', 'prompt', ',', 'packaging', 'secure', '.', 'arrived', 'perfect', 'condition', '.', '"'], ['"', 'delivery', 'prompt', ',', 'packaging', 'secure', '.', 'arrived', 'perfect', 'condition', '.', '"']]


In [18]:
# Named Entity Recognition (NER)
ner_entities = [(ent.text, ent.label_) for ent in doc.ents]  # Extract named entities and their labels
ner_lists.append(ner_entities)  # Append named entities list to ner_lists
print(ner_lists)

[[], []]
