# NLP with spaCy

# Task 1: Perform named entity recognition (NER) to extract product names and brands.

In [3]:
# Import Libraries
import spacy
import pandas as pd
import re

In [6]:
# Load the reviews CSV into a DataFrame. The path is relative, so it is resolved
# against the notebook's working directory.
reviews_df = pd.read_csv('../data/Reviews.csv')

In [7]:
# Display the first few rows to verify the structure of the loaded data
print(reviews_df.head())  # head() returns the first 5 rows by default


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [8]:
# Count missing (NaN/None) values per column before any cleaning
print(reviews_df.isnull().sum())

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [9]:
# Drop every row that has a missing value in any column (dropna() defaults).
# Per the counts printed above, only ProfileName and Summary contain missing values.
# NOTE(review): the later NER/sentiment steps read only the "Text" column, so this
# also discards reviews whose text is usable - confirm that is intended.
reviews_df = reviews_df.dropna()

In [10]:
# Re-count missing values per column to confirm the drop left none behind
print(reviews_df.isnull().sum())

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


In [13]:
# Normalise column names by stripping leading/trailing whitespace, so later
# lookups such as reviews_df["Text"] cannot fail on a stray space in the header
reviews_df.columns = reviews_df.columns.str.strip()
# Bare last expression: the notebook displays the resulting column Index
reviews_df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [15]:
# Load the spaCy pipeline "en_core_web_sm"; extract_entities below calls it and
# reads the named entities from doc.ents.
# spacy.load expects the model package to be installed already
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

In [16]:
# Named Entity Recognition (NER): pull organisation and product mentions out of a review
def extract_entities(text):
    """
    Run the spaCy pipeline on ``text`` and collect brand/product entities.

    The input is converted with ``str()`` before it is parsed. The result is a
    list of ``(entity_text, entity_label)`` tuples, in the order spaCy reports
    them, restricted to entities labelled ``ORG`` or ``PRODUCT``.
    """
    wanted_labels = ("ORG", "PRODUCT")
    parsed = nlp(str(text))

    matches = []
    for span in parsed.ents:
        # Keep only the labels used here as a proxy for brands and products
        if span.label_ in wanted_labels:
            matches.append((span.text, span.label_))
    return matches


# Task 2: Rule-Based Sentiment Analysis

In [19]:
# Define positive and negative keywords (lowercase; matched as whole words)
positive_keywords = ["good", "happy", "awesome", "great", "excellent"]
negative_keywords = ["bad", "sad", "terrible", "awful", "poor"]


def _count_keyword_hits(text_lower, keywords):
    """Count whole-word occurrences of each keyword in already-lowercased text."""
    # The \b anchors prevent substring hits such as "bad" inside "badge" or
    # "good" inside "goods". re.escape keeps a keyword literal should one ever
    # contain a regex metacharacter.
    return sum(
        len(re.findall(r"\b" + re.escape(keyword) + r"\b", text_lower))
        for keyword in keywords
    )


# Define a function to perform sentiment analysis using rules
def rule_based_sentiment_analysis(text):
    """
    Classify ``text`` by comparing counts of positive and negative keywords.

    Parameters
    ----------
    text : str or any
        The review text. Non-string values (e.g. NaN from a missing cell) are
        converted with ``str()``, as ``extract_entities`` does, instead of
        raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        ``"Positive"`` if more positive than negative keywords occur,
        ``"Negative"`` if fewer, and ``"Neutral"`` on a tie (including when
        no keyword occurs at all).

    Notes
    -----
    Matching is case-insensitive and whole-word only. Negation is not
    handled: "not good" still counts as one positive hit.
    """
    # Convert text to lowercase for case-insensitive matching
    text_lower = str(text).lower()

    # Count occurrences of positive and negative keywords
    positive_count = _count_keyword_hits(text_lower, positive_keywords)
    negative_count = _count_keyword_hits(text_lower, negative_keywords)

    # Determine sentiment based on keyword counts
    if positive_count > negative_count:
        return "Positive"
    elif positive_count < negative_count:
        return "Negative"
    else:
        return "Neutral"

# Task 3: Apply NER and Sentiment Analysis on the "Text" column

In [24]:
# Work on a 300-row random sample of the cleaned reviews; random_state=42 fixes
# the seed so the same rows are drawn on every run.
# NOTE(review): presumably sampled to keep the per-row spaCy pass fast - confirm.
reviews_sample = reviews_df.sample(300, random_state=42)

In [25]:
# Add one column per analysis, each computed row by row from the review text:
#   Entities  -> list of (text, label) tuples from extract_entities
#   Sentiment -> "Positive" / "Negative" / "Neutral" from rule_based_sentiment_analysis
reviews_sample["Entities"] = reviews_sample["Text"].apply(extract_entities)
reviews_sample["Sentiment"] = reviews_sample["Text"].apply(rule_based_sentiment_analysis)

In [26]:
# Display a summary of results: the review text alongside its extracted entities
# and rule-based sentiment, for the first 10 rows of the sample
print("\n✅ Sample Output:")
print(reviews_sample[["Text", "Entities", "Sentiment"]].head(10))


✅ Sample Output:
                                                     Text  \
18828   Avocado oil beats olive oil for dressings, any...   
363857  New years resolution season and all, bought pr...   
342609  I was pretty sad when I got these peaches (whi...   
62213   I find that most (not all) people in North Ame...   
467133  Love that this is recycled, works just as well...   
518641  Good steaks, Delivery was a little delayed tho...   
539725  I love the 'mites'. Vegemite, marmite and prom...   
91433   Product arrived timely and in good shape, have...   
377144  I was unfamiliar on how much peppercorns I wou...   
139209  I just discovered Coconut Oil this year.  Coco...   

                                                 Entities Sentiment  
18828                                                  []  Positive  
363857                                                 []   Neutral  
342609                             [(Native Forest, ORG)]   Neutral  
62213                         