In [28]:
data = {
    'Review': [
        'At McDonald\'s the food was ok and the service was bad.',
        'I would not recommend this Japanese restaurant to anyone.',
        'I loved this restaurant when I traveled to Thailand last summer.',
        'The menu of Loving has a wide variety of options.',
        'The staff was friendly and helpful at Google\'s employees restaurant.',
        'The ambiance at Bella Italia is amazing, and the pasta dishes are delicious.',
        'I had a terrible experience at Pizza Hut. The pizza was burnt, and the service was slow.',
        'The sushi at Sushi Express is always fresh and flavorful.',
        'The steakhouse on Main Street has a cozy atmosphere and excellent steaks.',
        'The dessert selection at Sweet Treats is to die for!'
    ]
}


In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [30]:
for review in data['Review']:
    preprocessed_text= preprocess_text(review)
    print(preprocessed_text)

at mcdonald the food wa ok and the service wa bad
i would not recommend this japanese restaurant to anyone
i loved this restaurant when i traveled to thailand last summer
the menu of loving ha a wide variety of option
the staff wa friendly and helpful at google employee restaurant
the ambiance at bella italia is amazing and the pasta dish are delicious
i had a terrible experience at pizza hut the pizza wa burnt and the service wa slow
the sushi at sushi express is always fresh and flavorful
the steakhouse on main street ha a cozy atmosphere and excellent steak
the dessert selection at sweet treat is to die for


In [31]:
import pandas as pd

In [32]:
df = pd.DataFrame(data)

In [33]:
df

Unnamed: 0,Review
0,At McDonald's the food was ok and the service ...
1,I would not recommend this Japanese restaurant...
2,I loved this restaurant when I traveled to Tha...
3,The menu of Loving has a wide variety of options.
4,The staff was friendly and helpful at Google's...
5,"The ambiance at Bella Italia is amazing, and t..."
6,I had a terrible experience at Pizza Hut. The ...
7,The sushi at Sushi Express is always fresh and...
8,The steakhouse on Main Street has a cozy atmos...
9,The dessert selection at Sweet Treats is to di...


In [34]:
df['preprocessed_text'] = df['Review'].apply(preprocess_text)

In [35]:
df

Unnamed: 0,Review,preprocessed_text
0,At McDonald's the food was ok and the service ...,at mcdonald the food wa ok and the service wa bad
1,I would not recommend this Japanese restaurant...,i would not recommend this japanese restaurant...
2,I loved this restaurant when I traveled to Tha...,i loved this restaurant when i traveled to tha...
3,The menu of Loving has a wide variety of options.,the menu of loving ha a wide variety of option
4,The staff was friendly and helpful at Google's...,the staff wa friendly and helpful at google em...
5,"The ambiance at Bella Italia is amazing, and t...",the ambiance at bella italia is amazing and th...
6,I had a terrible experience at Pizza Hut. The ...,i had a terrible experience at pizza hut the p...
7,The sushi at Sushi Express is always fresh and...,the sushi at sushi express is always fresh and...
8,The steakhouse on Main Street has a cozy atmos...,the steakhouse on main street ha a cozy atmosp...
9,The dessert selection at Sweet Treats is to di...,the dessert selection at sweet treat is to die...


In [36]:
df_preprocessed = df['preprocessed_text']
df_raw = df['Review']

In [37]:
import  spacy

nlp = spacy.load("en_core_web_sm")

def perform_ner(text):
    doc = nlp(text)
    
    # Extract entities and their labels
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    return entities

# Example usage
sample_text = "Apple is looking at buying U.K. startup for $1 billion on January 5, 2022."
entities = perform_ner(sample_text)
print(entities)     

[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY'), ('January 5, 2022', 'DATE')]


In [38]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
def perform_pos_tagging(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    return pos_tags

In [39]:
for i in df['Review']:
   for word in i.split():
       print(perform_pos_tagging(word))

[('At', 'IN')]
[('McDonald', 'NNP'), ("'s", 'POS')]
[('the', 'DT')]
[('food', 'NN')]
[('was', 'VBD')]
[('ok', 'NN')]
[('and', 'CC')]
[('the', 'DT')]
[('service', 'NN')]
[('was', 'VBD')]
[('bad', 'JJ'), ('.', '.')]
[('I', 'PRP')]
[('would', 'MD')]
[('not', 'RB')]
[('recommend', 'NN')]
[('this', 'DT')]
[('Japanese', 'JJ')]
[('restaurant', 'NN')]
[('to', 'TO')]
[('anyone', 'NN'), ('.', '.')]
[('I', 'PRP')]
[('loved', 'VBN')]
[('this', 'DT')]
[('restaurant', 'NN')]
[('when', 'WRB')]
[('I', 'PRP')]
[('traveled', 'VBD')]
[('to', 'TO')]
[('Thailand', 'NN')]
[('last', 'JJ')]
[('summer', 'NN'), ('.', '.')]
[('The', 'DT')]
[('menu', 'NN')]
[('of', 'IN')]
[('Loving', 'VBG')]
[('has', 'VBZ')]
[('a', 'DT')]
[('wide', 'JJ')]
[('variety', 'NN')]
[('of', 'IN')]
[('options', 'NNS'), ('.', '.')]
[('The', 'DT')]
[('staff', 'NN')]
[('was', 'VBD')]
[('friendly', 'RB')]
[('and', 'CC')]
[('helpful', 'NN')]
[('at', 'IN')]
[('Google', 'NNP'), ("'s", 'POS')]
[('employees', 'NNS')]
[('restaurant', 'NN'), ('.', '

In [40]:
nltk.download('tagsets')
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\v_gol\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [41]:
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [42]:
df['cleaned_text'] = df['Review'].apply(preprocess_text)
df['ner_raw'] = df['Review'].apply(perform_ner)
df['ner_cleaned'] = df['Review'].apply(perform_ner)
df['pos_raw'] = df['Review'].apply(perform_pos_tagging)
df['pos_cleaned'] = df['Review'].apply(perform_pos_tagging)

In [43]:
df

Unnamed: 0,Review,preprocessed_text,cleaned_text,ner_raw,ner_cleaned,pos_raw,pos_cleaned
0,At McDonald's the food was ok and the service ...,at mcdonald the food wa ok and the service wa bad,at mcdonald the food wa ok and the service wa bad,"[(McDonald's, ORG)]","[(McDonald's, ORG)]","[(At, IN), (McDonald, NNP), ('s, POS), (the, D...","[(At, IN), (McDonald, NNP), ('s, POS), (the, D..."
1,I would not recommend this Japanese restaurant...,i would not recommend this japanese restaurant...,i would not recommend this japanese restaurant...,"[(Japanese, NORP)]","[(Japanese, NORP)]","[(I, PRP), (would, MD), (not, RB), (recommend,...","[(I, PRP), (would, MD), (not, RB), (recommend,..."
2,I loved this restaurant when I traveled to Tha...,i loved this restaurant when i traveled to tha...,i loved this restaurant when i traveled to tha...,"[(Thailand, GPE), (last summer, DATE)]","[(Thailand, GPE), (last summer, DATE)]","[(I, PRP), (loved, VBD), (this, DT), (restaura...","[(I, PRP), (loved, VBD), (this, DT), (restaura..."
3,The menu of Loving has a wide variety of options.,the menu of loving ha a wide variety of option,the menu of loving ha a wide variety of option,"[(Loving, GPE)]","[(Loving, GPE)]","[(The, DT), (menu, NN), (of, IN), (Loving, NNP...","[(The, DT), (menu, NN), (of, IN), (Loving, NNP..."
4,The staff was friendly and helpful at Google's...,the staff wa friendly and helpful at google em...,the staff wa friendly and helpful at google em...,"[(Google, ORG)]","[(Google, ORG)]","[(The, DT), (staff, NN), (was, VBD), (friendly...","[(The, DT), (staff, NN), (was, VBD), (friendly..."
5,"The ambiance at Bella Italia is amazing, and t...",the ambiance at bella italia is amazing and th...,the ambiance at bella italia is amazing and th...,"[(Bella Italia, ORG)]","[(Bella Italia, ORG)]","[(The, DT), (ambiance, NN), (at, IN), (Bella, ...","[(The, DT), (ambiance, NN), (at, IN), (Bella, ..."
6,I had a terrible experience at Pizza Hut. The ...,i had a terrible experience at pizza hut the p...,i had a terrible experience at pizza hut the p...,"[(Pizza Hut, ORG)]","[(Pizza Hut, ORG)]","[(I, PRP), (had, VBD), (a, DT), (terrible, JJ)...","[(I, PRP), (had, VBD), (a, DT), (terrible, JJ)..."
7,The sushi at Sushi Express is always fresh and...,the sushi at sushi express is always fresh and...,the sushi at sushi express is always fresh and...,"[(Sushi Express, ORG)]","[(Sushi Express, ORG)]","[(The, DT), (sushi, NN), (at, IN), (Sushi, NNP...","[(The, DT), (sushi, NN), (at, IN), (Sushi, NNP..."
8,The steakhouse on Main Street has a cozy atmos...,the steakhouse on main street ha a cozy atmosp...,the steakhouse on main street ha a cozy atmosp...,"[(Main Street, FAC)]","[(Main Street, FAC)]","[(The, DT), (steakhouse, NN), (on, IN), (Main,...","[(The, DT), (steakhouse, NN), (on, IN), (Main,..."
9,The dessert selection at Sweet Treats is to di...,the dessert selection at sweet treat is to die...,the dessert selection at sweet treat is to die...,"[(Sweet Treats, FAC)]","[(Sweet Treats, FAC)]","[(The, DT), (dessert, JJ), (selection, NN), (a...","[(The, DT), (dessert, JJ), (selection, NN), (a..."


Ex 2
----


In [44]:
import nltk
import spacy
import string
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import pandas as pd

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    doc = nlp(text)
    lemmatized_text = ' '.join([token.lemma_ for token in doc])
    return lemmatized_text

data = {
    'Review': [
        'At McDonald\'s the food was ok and the service was bad.',
        'I would not recommend this Japanese restaurant to anyone.',
        'I loved this restaurant when I traveled to Thailand last summer.',
        'The menu of Loving has a wide variety of options.',
        'The staff was friendly and helpful at Google\'s employees restaurant.',
        'The ambiance at Bella Italia is amazing, and the pasta dishes are delicious.',
        'I had a terrible experience at Pizza Hut. The pizza was burnt, and the service was slow.',
        'The sushi at Sushi Express is always fresh and flavorful.',
        'The steakhouse on Main Street has a cozy atmosphere and excellent steaks.',
        'The dessert selection at Sweet Treats is to die for!'
    ]
}

df = pd.DataFrame(data)
df['cleaned_text'] = df['text'].apply(preprocess_text)
df['tokenized_text'] = df['cleaned_text'].apply(word_tokenize)

print(df[['cleaned_text', 'tokenized_text']])


ModuleNotFoundError: No module named 'gensim'

Ex3
--