In [2]:
!pip install pandas nltk



In [3]:
import pandas as pd, string, nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

for pkg in ["punkt","stopwords","averaged_perceptron_tagger","maxent_ne_chunker","words"]:
    nltk.download(pkg, quiet=True)


In [4]:
df = pd.read_csv("Reviews.csv")
reviews = df['Text'].dropna().sample(10000, random_state=42).reset_index(drop=True)


In [5]:
tbl = str.maketrans('', '', string.punctuation)
def preprocess(s): return s.lower().translate(tbl)
reviews_clean = reviews.apply(preprocess)


In [6]:
stop = set(stopwords.words('english'))
def tokenize(text):
    return [w for w in word_tokenize(text) if w.isalpha() and w not in stop]
tokens = reviews_clean.apply(tokenize)


In [7]:
pos_tags = tokens.apply(pos_tag)
print("POS sample:", pos_tags.iloc[0][:20])


POS sample: [('tried', 'JJ'), ('couple', 'NN'), ('brands', 'NNS'), ('glutenfree', 'VBP'), ('sandwich', 'JJ'), ('cookies', 'NNS'), ('best', 'JJS'), ('bunch', 'NN'), ('theyre', 'NN'), ('crunchy', 'NN'), ('true', 'JJ'), ('texture', 'NN'), ('real', 'JJ'), ('cookies', 'NNS'), ('arent', 'VBP'), ('glutenfree', 'JJ'), ('might', 'MD'), ('think', 'VB'), ('filling', 'VBG'), ('makes', 'VBZ')]


In [9]:
nltk.download('maxent_ne_chunker_tab')
def extract_entities_from_pos(tagged):
    tree = ne_chunk(tagged, binary=False)
    ents = []
    for node in tree:
        if isinstance(node, Tree):
            label = node.label()
            text = " ".join(w for w,_ in node.leaves())
            ents.append((text, label))
    return ents

N = 200
ner_per_review = [extract_entities_from_pos(pos_tags.iloc[i]) for i in range(min(N, len(pos_tags)))]
flat_entities = [e for sub in ner_per_review for e in sub]
entities_df = pd.DataFrame(flat_entities, columns=["entity","label"])
print(entities_df.head())


[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


Empty DataFrame
Columns: [entity, label]
Index: []
