In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string # Import punc

In [2]:
# Load the labelled Yelp review file: tab-separated text read from a local path.
# NOTE(review): the absolute, machine-specific path below means this cell only
# runs on the original author's machine; consider a configurable data directory.
yelp_path = r"D:\Caroline\Documents\Graduate\ISE 540 Text Analysis\Dataset\NLP Projects\NLP Projects\yelp_labelled_3.txt"
Yelp = pd.read_csv(yelp_path, sep='\t')

# Preview the first ten and the last ten rows.
for preview in (Yelp.head(10), Yelp.tail(10)):
    print(preview)

                                             Message  Review
0                           Wow... Loved this place.       1
1                                 Crust is not good.       0
2          Not tasty and the texture was just nasty.       0
3  Stopped by during the late May bank holiday of...       1
4  The selection on the menu was great and so wer...       1
5     Now I am getting angry and I want my damn pho.       0
6              Honeslty it didn't taste THAT fresh.)       0
7  The potatoes were like rubber and you could te...       0
8                          The fries were great too.       1
9                                     A great touch.       1
                                               Message  Review
990  The refried beans that came with my meal were ...       0
991         Spend your money and time some place else.       0
992  A lady at the table next to us found a live gr...       0
993            the presentation of the food was awful.       0
994           

In [3]:
# Sanity checks: number of rows, then the count of missing values per column.
row_count = len(Yelp)
print("Length of the dataset:", row_count)

missing_per_column = Yelp.isnull().sum()
print(missing_per_column)

Length of the dataset: 1000
Message    0
Review     0
dtype: int64


In [4]:
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Select the first column of the frame by position (all rows) and display it.
comment = Yelp.iloc[:, 0]
print(comment)

0                               Wow... Loved this place.
1                                     Crust is not good.
2              Not tasty and the texture was just nasty.
3      Stopped by during the late May bank holiday of...
4      The selection on the menu was great and so wer...
                             ...                        
995    I think food should have flavor and texture an...
996                             Appetite instantly gone.
997    Overall I was not impressed and would not go b...
998    The whole experience was underwhelming, and I ...
999    Then, as if I hadn't wasted enough of my life ...
Name: Message, Length: 1000, dtype: object


In [5]:
# Build a single spaCy document from the whole review column.
# Wrap the selected column in a pandas Series.
Yelp_series = pd.Series(comment)

# Concatenate all reviews into one string, separated by single spaces.
Yelp_text = Yelp_series.str.cat(sep=' ')

# Run the spaCy pipeline once over the combined text.
doc = nlp(Yelp_text)

# Print every token followed by its part-of-speech tag and dependency label.
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_)))

Wow INTJ ROOT
... PUNCT punct
Loved VERB ROOT
this DET det
place NOUN dobj
. PUNCT punct
Crust NOUN nsubj
is AUX ROOT
not PART neg
good ADJ acomp
. PUNCT punct
Not PART neg
tasty ADJ ROOT
and CCONJ cc
the DET det
texture NOUN nsubj
was AUX conj
just ADV advmod
nasty ADJ acomp
. PUNCT punct
Stopped VERB ROOT
by ADP prt
during ADP prep
the DET det
late ADJ amod
May PROPN compound
bank NOUN compound
holiday NOUN pobj
off ADP prep
Rick PROPN compound
Steve PROPN compound
recommendation NOUN pobj
and CCONJ cc
loved VERB conj
it PRON dobj
. PUNCT punct
The DET det
selection NOUN nsubj
on ADP prep
the DET det
menu NOUN pobj
was AUX ROOT
great ADJ acomp
and CCONJ cc
so ADV advmod
were AUX conj
the DET det
prices NOUN nsubj
. PUNCT punct
Now ADV advmod
I PRON nsubj
am AUX aux
getting VERB ROOT
angry ADJ acomp
and CCONJ cc
I PRON nsubj
want VERB conj
my PRON poss
damn ADJ amod
pho NOUN dobj
. PUNCT punct
Honeslty NOUN npadvmod
it PRON nsubj
did AUX aux
n't PART neg
taste VERB ROOT
THAT PRON dobj

In [6]:
# Clean the parsed text: lowercase every token and drop stop words and
# punctuation.
stopwords = STOP_WORDS  # same set object as spacy.lang.en.stop_words.STOP_WORDS

tokens = []
for token in doc:
    lowered = token.text.lower()
    # NOTE(review): `in string.punctuation` is a substring test against one
    # string of ASCII punctuation characters, so a multi-character token such
    # as "..." is NOT removed by this check.
    if lowered in stopwords or token.text in string.punctuation:
        continue
    tokens.append(lowered)

print(tokens)



In [7]:
# Word frequency calculation: count how often each cleaned token occurs.
word_frequency = {}
for token in tokens:
    word_frequency[token] = word_frequency.get(token, 0) + 1

# Normalization: divide every count by the largest count, so the most frequent
# word gets weight 1.0. `default=1` avoids a ValueError from max() when
# `tokens` is empty (the resulting table is then simply empty).
max_frequency = max(word_frequency.values(), default=1)
normalized_frequency = {word: freq / max_frequency for word, freq in word_frequency.items()}

# Sentence tokenization and scoring.
# Each sentence span of the already-parsed `doc` is iterated directly instead
# of re-running the whole spaCy pipeline on the sentence text: this is much
# cheaper and guarantees the words seen here are the same tokens that were
# counted when `tokens` was built.
sentences = []
sentence_scores = {}
for sent in doc.sents:
    sentence = sent.text
    sentences.append(sentence)
    # Same filter as the token-cleaning step: lowercase, drop stop words and
    # tokens found in string.punctuation.
    words = [
        tok.text.lower()
        for tok in sent
        if tok.text.lower() not in stopwords and tok.text not in string.punctuation
    ]
    # .get(..., 0): a word missing from the frequency table contributes nothing
    # rather than raising KeyError.
    score = sum(normalized_frequency.get(word, 0) for word in words)
    # Scores are keyed by sentence text, so identical sentences share one entry.
    sentence_scores[sentence] = score

print("Tokens:", tokens)
print("Word Frequency:", word_frequency)
print("Normalized Frequency:", normalized_frequency)
print("Sentence Scores:", sentence_scores)



In [8]:
# 3. Select the 30% of sentences with the highest scores, using
# heapq.nlargest (arguments: how many to keep, the candidates, a ranking key).
import heapq

percentage = 30  # share of sentences to keep, in percent
num_sentences = int(len(sentences) * percentage / 100)

# The candidates are the keys of `sentence_scores` (the sentence strings);
# each one is ranked by the score stored for it.
top_sentences = heapq.nlargest(
    num_sentences,
    sentence_scores.keys(),
    key=lambda sentence_text: sentence_scores[sentence_text],
)

print("Top Sentences:", top_sentences)

Top Sentences: ['I am far from a sushi connoisseur but I can definitely tell the difference between good food and bad food and this was certainly bad food.', 'Food was good, service was good, Prices were good.', 'Good value, great food, great service.', 'Very good food, great atmosphere.1 Damn good steak.', 'Good food , good service .', 'Great food and great service in a clean and friendly setting.', 'seems like a good quick place to grab a bite of some familiar pub food, but do yourself a favor and look elsewhere.', 'Best service and food ever, Maria our server was so good and friendly she made our day.', 'Took an hour to get our food only 4 tables in restaurant my food was Luke warm, Our sever was running around like he was totally overwhelmed.', 'If you want to wait for mediocre food and downright terrible service, then this is the place for you.', 'The food is very good for your typical bar food.', 'So good I am going to have to review this place twice - once hereas a tribute to th

In [9]:
# 4. Build the final summary: the selected sentences joined into one string,
# with a single space between consecutive sentences.
separator = ' '
summary = separator.join(top_sentences)

# Show the summary text (its length is reported separately).
print("Summary:", summary)


Summary: I am far from a sushi connoisseur but I can definitely tell the difference between good food and bad food and this was certainly bad food. Food was good, service was good, Prices were good. Good value, great food, great service. Very good food, great atmosphere.1 Damn good steak. Good food , good service . Great food and great service in a clean and friendly setting. seems like a good quick place to grab a bite of some familiar pub food, but do yourself a favor and look elsewhere. Best service and food ever, Maria our server was so good and friendly she made our day. Took an hour to get our food only 4 tables in restaurant my food was Luke warm, Our sever was running around like he was totally overwhelmed. If you want to wait for mediocre food and downright terrible service, then this is the place for you. The food is very good for your typical bar food. So good I am going to have to review this place twice - once hereas a tribute to the place and once as a tribute to an event

In [10]:
# len() of the summary string counts characters, not words or sentences.
summary_length = len(summary)
print("Summary Length:", summary_length)

Summary Length: 21345
