<a href="https://colab.research.google.com/github/Sangamithra546/DataScienceLabManual231801147/blob/main/147exp5b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install spacy nltk pandas

import pandas as pd
import nltk
import spacy
import string

# Download every NLTK resource the notebook uses, in one place, so that a
# fresh kernel can run all cells top to bottom.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resources the
# tokenizing and POS-tagging cells further down otherwise have to fetch
# on demand; the older 'punkt' / 'averaged_perceptron_tagger' names are kept
# as they were.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# Read the Amazon Fine Food Reviews CSV (adjust the path for your environment).
# Rows the parser cannot handle are skipped rather than aborting the load.
reviews_dataset = pd.read_csv(
    "/content/Reviews.csv",
    sep=',',
    quotechar='"',
    on_bad_lines='skip',
    engine='python',
)

# Keep only the review text, discard rows whose text is missing, and cap the
# working set at the first 10,000 remaining reviews.
reviews_text = reviews_dataset[['Text']].dropna().head(10000)

reviews_text.head()

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...


In [8]:
def preprocess_review(text):
    """Normalize one review: lowercase it and strip ASCII punctuation.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            a missing review.

    Returns:
        The lowercased text with every character of ``string.punctuation``
        removed, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        # Mapping a character to None in a translation table deletes it.
        drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
        return text.lower().translate(drop_punctuation)
    return ""

# Build a new frame that carries the original text plus its cleaned version;
# `assign` returns a copy, so `reviews_text` itself is left untouched.
preprocessed_reviews = reviews_text.assign(
    Cleaned_Text=reviews_text['Text'].apply(preprocess_review)
)

preprocessed_reviews.head()


Unnamed: 0,Text,Cleaned_Text
0,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...
2,This is a confection that has been around a fe...,this is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...,great taffy at a great price there was a wide...


In [12]:
from nltk.corpus import stopwords
# English stop-word list from the NLTK corpus, converted to a set so the
# per-token `word not in stop_words` check in tokenize_and_clean is a hash lookup.
stop_words = set(stopwords.words('english'))

def tokenize_and_clean(text):
    """Split a cleaned review into alphabetic, non-stop-word tokens.

    Args:
        text: The cleaned review text. Anything that is not a ``str``
            yields an empty token list.

    Returns:
        A list of tokens from ``nltk.word_tokenize(text)`` restricted to
        those that are purely alphabetic (``str.isalpha``) and not present
        in the module-level ``stop_words`` set.
    """
    if not isinstance(text, str):
        return []

    # Make sure the 'punkt_tab' tokenizer data is available before tokenizing.
    # nltk.data.find signals a missing resource with LookupError, so that is
    # the only exception handled here. The previous extra clause referenced
    # nltk.downloader.DownloadError, which NLTK does not define; evaluating it
    # raised AttributeError and prevented the download from ever running.
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')

    tokens = nltk.word_tokenize(text)
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Defensive normalisation: replace any NaN in the cleaned text with an empty
# string and force the column to str before tokenizing.
preprocessed_reviews['Cleaned_Text'] = (
    preprocessed_reviews['Cleaned_Text']
    .fillna("")
    .astype(str)
)

# New frame with one token list per review; `assign` works on a copy.
tokenized_reviews = preprocessed_reviews.assign(
    Tokens=preprocessed_reviews['Cleaned_Text'].apply(tokenize_and_clean)
)

tokenized_reviews.head()

Unnamed: 0,Text,Cleaned_Text,Tokens
0,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,"[bought, several, vitality, canned, dog, food,..."
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,"[product, arrived, labeled, jumbo, salted, pea..."
2,This is a confection that has been around a fe...,this is a confection that has been around a fe...,"[confection, around, centuries, light, pillowy..."
3,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,"[looking, secret, ingredient, robitussin, beli..."
4,Great taffy at a great price. There was a wid...,great taffy at a great price there was a wide...,"[great, taffy, great, price, wide, assortment,..."


In [16]:
# Fetch the English perceptron tagger data before tagging.
nltk.download('averaged_perceptron_tagger_eng')

# Tag each review's token list with part-of-speech labels, storing the
# (token, tag) pairs in a new 'POS' column on a copy of the tokenized frame.
pos_tagged_reviews = tokenized_reviews.assign(
    POS=tokenized_reviews['Tokens'].apply(nltk.pos_tag)
)

pos_tagged_reviews.loc[:, ['Text', 'POS']].head()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Unnamed: 0,Text,POS
0,I have bought several of the Vitality canned d...,"[(bought, VBD), (several, JJ), (vitality, NN),..."
1,Product arrived labeled as Jumbo Salted Peanut...,"[(product, NN), (arrived, VBD), (labeled, JJ),..."
2,This is a confection that has been around a fe...,"[(confection, NN), (around, IN), (centuries, N..."
3,If you are looking for the secret ingredient i...,"[(looking, VBG), (secret, JJ), (ingredient, NN..."
4,Great taffy at a great price. There was a wid...,"[(great, JJ), (taffy, JJ), (great, JJ), (price..."


In [17]:
# Load spaCy model
# The loaded pipeline is kept in the global `nlp`, which extract_entities()
# calls for every review.
# NOTE(review): no install/download step for "en_core_web_sm" is visible in
# this notebook — confirm the model is already present in the environment.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Run the spaCy pipeline on one review and collect its named entities.

    Args:
        text: The review text. Non-string values and strings that are empty
            or whitespace-only are not sent to the pipeline.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples taken from
        ``doc.ents``, or ``[]`` when there is nothing to analyse.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        return []

    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# Entities are extracted from the original (uncleaned) 'Text' column and
# stored in a new 'NER' column on a copy of the POS-tagged frame.
ner_reviews = pos_tagged_reviews.assign(
    NER=pos_tagged_reviews['Text'].apply(extract_entities)
)

ner_reviews.loc[:, ['Text', 'NER']].head(10)


Unnamed: 0,Text,NER
0,I have bought several of the Vitality canned d...,"[(Vitality, ORG), (My Labrador, PERSON)]"
1,Product arrived labeled as Jumbo Salted Peanut...,"[(Jumbo Salted, ORG), (Jumbo, WORK_OF_ART)]"
2,This is a confection that has been around a fe...,"[(around a few centuries, DATE), (pillowy citr..."
3,If you are looking for the secret ingredient i...,"[(Robitussin, GPE), (the Root Beer Extract I, ..."
4,Great taffy at a great price. There was a wid...,"[(Delivery, PERSON)]"
5,I got a wild hair for taffy and ordered this f...,"[(five pound, QUANTITY), (only two weeks, DATE)]"
6,This saltwater taffy had great flavors and was...,"[(Fralinger's, ORG)]"
7,This taffy is so good. It is very soft and ch...,[]
8,Right now I'm mostly just sprouting this so my...,"[(Wheatgrass, PERSON)]"
9,This is a very healthy dog food. Good for thei...,[]
