In [6]:
# Step 8: The Scrying Spell – Named Entity Recognition (NER)

import pandas as pd
import spacy
# Make sure to run this line in your environment if you haven't installed spaCy yet:
# !pip install spacy
# Make sure to run this line once to download the model:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
def categorize_sentiment(rating):
    """Translate a numeric star rating into a coarse sentiment label.

    Args:
        rating: Numeric rating to classify.

    Returns:
        'positive' when the rating is 4 or higher, 'negative' when it is
        2 or lower, and 'neutral' for anything else.
    """
    # Handle the high end first so the remaining choice is a simple two-way split.
    if rating >= 4:
        return 'positive'
    return 'negative' if rating <= 2 else 'neutral'

# Load the reviews; on_bad_lines='skip' drops rows the parser cannot read
# instead of aborting the load. Then rename the columns to snake_case, discard
# rows missing any field used below, and remove exact duplicate rows.
# The steps are chained (no inplace mutation) and end in an explicit copy so
# the column assignments below write to an independent frame.
df = (
    pd.read_csv("Amazon_Reviews.csv", on_bad_lines='skip', engine='python')
    .rename(columns={'Review Text': 'review_text', 'Rating': 'rating', 'Date of Experience': 'date_of_experience'})
    .dropna(subset=['review_text', 'rating', 'date_of_experience'])
    .drop_duplicates()
    .copy()
)

# Keep the first run of digits from each rating value. expand=False returns a
# Series rather than a one-column DataFrame. Values containing no digit come
# back as NaN, which astype(int) cannot convert, so drop those rows before
# the cast instead of letting the whole cell fail.
df['rating'] = df['rating'].str.extract(r'(\d+)', expand=False)
df = df.dropna(subset=['rating']).copy()
df['rating'] = df['rating'].astype(int)

# Unparseable dates are coerced to NaT and the affected rows removed.
df['date_of_experience'] = pd.to_datetime(df['date_of_experience'], errors='coerce')
df = df.dropna(subset=['date_of_experience']).copy()

# Simple text features: character count and number of exclamation marks.
df['text_length'] = df['review_text'].str.len()
df['exclamation_count'] = df['review_text'].str.count('!')

# Review age is measured against the moment this cell runs, so the values
# change from one execution to the next.
current_date = pd.Timestamp.now()
df['review_age_days'] = (current_date - df['date_of_experience']).dt.days

# Derive the sentiment label from the integer rating.
df['sentiment'] = df['rating'].apply(categorize_sentiment)

In [None]:
# 2. Load the spaCy language model.
# The 'en_core_web_sm' model is a small English model that is fast and effective.

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # OSError from spacy.load is treated as "model not installed":
    # announce the download, fetch the model, then load it again.
    print("Downloading spaCy model 'en_core_web_sm'...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
else:
    # Only reached when the first load attempt succeeded.
    print("spaCy model loaded successfully.")

spaCy model loaded successfully.


In [None]:
# 3. Define a function to extract entities from a given text.
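# The function returns a dictionary mapping each entity label to one matching entity text
# (when a label occurs more than once in a text, only the last occurrence is kept).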
def extract_entities(text):
    """Run the loaded spaCy pipeline over ``text`` and collect its named entities.

    Args:
        text: The review text to analyse. ``None`` and float NaN are treated
            as "no text".

    Returns:
        dict: Maps each entity label (``ent.label_``) to an entity text
        (``ent.text``). The dict is keyed by label, so when several entities
        in the same text share a label only the last one is kept. An empty
        dict is returned for missing input.
    """
    # Missing cells reach this function as None or float NaN (NaN is the only
    # float that is not equal to itself). There is nothing to analyse, so
    # return early instead of handing a non-text value to the pipeline.
    if text is None or (isinstance(text, float) and text != text):
        return {}

    doc = nlp(text)
    # NOTE: keyed by label, so a later entity with the same label replaces an
    # earlier one. Kept as-is so the existing {label: text} shape is unchanged.
    return {ent.label_: ent.text for ent in doc.ents}

In [None]:
# 4. Apply the entity extraction function to the 'review_text' column.
# We'll apply this to a sample of the data to keep it fast.

# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
sample_size = min(10, len(df))
# Fixed random_state keeps the same rows selected on every run; the copy makes
# the sample independent of df before a new column is added to it.
df_sample = df.sample(sample_size, random_state=42).copy()
df_sample['entities'] = df_sample['review_text'].apply(extract_entities)


In [None]:
# 5. Display the results.

print("\n--- The Scrying Glass Reveals Entities ---")
# Walk the two columns in lockstep; the row index is not needed for display.
for review, found in zip(df_sample['review_text'], df_sample['entities']):
    print(f"Review Text: {review}")
    print(f"Extracted Entities: {found}")
    print("-" * 50)


--- The Scrying Glass Reveals Entities ---
Review Text: I always use Amazon! I think they are great. Never had a problem with them.
Extracted Entities: {'ORG': 'Amazon'}
--------------------------------------------------
Review Text: Shalu displayed rather incompetent customer service today by constantly saying "sorry for the inconvenience" rather than trying to sort the issue. On top of that, they asked me what the issue was despite having already explained it to them which makes me think they didn't actually bother reading my message. The issue wasn't even a complicated one, simply just that I needed a code to pick up my parcel from the shop like you do with any Amazon order that goes to a pick-up point, just that I hadn't been provided one this time. And then on top of all of the above, they've just decided to stop responding to my messages - how professional.
Extracted Entities: {'DATE': 'today', 'ORG': 'Amazon'}
--------------------------------------------------
Review Text: Afte