In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag

### Import scrapped data and drop unnessary columns

In [2]:
df_10k = pd.read_csv('data_all(10881).xlsx - Sheet1.csv')  
print(type(df_10k))
df_10k.drop(columns=['uid', 'Label #1', 'Label #2', 'Label #3'], inplace=True)
df_10k.columns = ['restaurant name', 'rating', 'content', 'label']
print(df_10k.columns)

<class 'pandas.core.frame.DataFrame'>
Index(['restaurant name', 'rating', 'content', 'label'], dtype='object')


In [3]:
df_10k

Unnamed: 0,restaurant name,rating,content,label
0,Konomi Zen,3,Crunchy tempura esp the vegetables,2.0
1,Vincent Western Food,5,this is one of the best western food i've eate...,1.0
2,Siam Square Mookata - Best Mookata Restaurant ...,5,Many choice of food to select. Love their teri...,1.0
3,Old Chang Kee,1,Buying snacks for customers but system mainten...,0.0
4,Hiang Ji Cantonese Roasts,1,Seriously overprice and rude service. Avoid at...,0.0
...,...,...,...,...
10112,Ichikokudo Hokkaido Ramen,4,Wasn’t crowded during the dinner period.\nDece...,
10113,Kedai Makan Muhajirin,4,"Had the mee rebus, mee siam and nasi lemak wit...",
10114,Shami Banana Leaf,5,"MY FAVOURITE INDIAN RESTAURANT.\nTheir soya, T...",
10115,Rahim Muslim Food,4,The taste is unlike the usual Mee rebus you fi...,


### Cleaning Content with Emoji Removal, Lemmatizer and Non English words

In [5]:
#!pip install flashtext
#!pip install emot
import emot
from emot.emo_unicode import UNICODE_EMOJI, UNICODE_EMOJI_ALIAS, EMOTICONS_EMO
from flashtext import KeywordProcessor

## formatting
all_emoji_emoticons = {**EMOTICONS_EMO,**UNICODE_EMOJI_ALIAS, **UNICODE_EMOJI_ALIAS}
all_emoji_emoticons = {k:v.replace(":","").replace("_"," ").strip() for k,v in all_emoji_emoticons.items()}

kp_all_emoji_emoticons = KeywordProcessor()
for k,v in all_emoji_emoticons.items():
    kp_all_emoji_emoticons.add_keyword(k, v)

In [6]:
# Might take awhile to run if dataset is large as it iterates through every row

#nltk.download('words')
words = set(nltk.corpus.words.words())
#!pip install clean-text
from cleantext import clean

#nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

df_10k['content_clean'] = ""
df_10k['Tokenized'] = ""

for index, row in df_10k.iterrows():
    # removes all emoji
    #row['content_clean'] = clean(row['content'],no_emoji=True)
    
    # Lemmatize the words in sentence
    tokenized_text = word_tokenize(row['content'])
    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokenized_text]

    """ 
    Removes non english words by:
    Joining English words w.lower() in words and joins with symbols/punctation --> w.alpha()
    Limitations :
    Removes some words:
    1. NER nouns (teriyaki chicken becomes chicken)
    2. Mispelled
    3. Split sort forms like can't , i've (i've become i' because ve is not a word)
    """ # Remove Non English word in nltk.corpus
    row['content_clean'] = " ".join(w for w in lemmatized_text if w.lower() in words or not w.isalpha())

    # Replacing emoji with words instead, done after because i want to retain the full text of emoji
    row['content_clean'] = kp_all_emoji_emoticons.replace_keywords(row['content_clean'])
    
    # Tokenize each sentence
    row['Tokenized'] = [ word_tokenize(t) for t in sent_tokenize(row['content_clean']) if t not in words ]
    
    # Pos tagging for each sentence in row['Tokenized']
    for i in range (len(row['Tokenized'])):
        # i refers to each of the tokenized sentence
        row['Tokenized'][i] = pos_tag(row['Tokenized'][i])
    
    df_10k['content_clean'][index] = row['content_clean']
    df_10k['Tokenized'][index] = row['Tokenized']

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_10k['content_clean'][index] = row['content_clean']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_10k['Tokenized'][index] = row['Tokenized']


In [7]:
df_10k

Unnamed: 0,restaurant name,rating,content,label,content_clean,Tokenized
0,Konomi Zen,3,Crunchy tempura esp the vegetables,2.0,Crunchy the vegetable,"[[(Crunchy, NNP), (the, DT), (vegetable, NN)]]"
1,Vincent Western Food,5,this is one of the best western food i've eate...,1.0,this is one of the best western food i 've eat...,"[[(this, DT), (is, VBZ), (one, CD), (of, IN), ..."
2,Siam Square Mookata - Best Mookata Restaurant ...,5,Many choice of food to select. Love their teri...,1.0,Many choice of food to select . Love their por...,"[[(Many, JJ), (choice, NN), (of, IN), (food, N..."
3,Old Chang Kee,1,Buying snacks for customers but system mainten...,0.0,snack for customer but system maintenance cant...,"[[(snack, NN), (for, IN), (customer, NN), (but..."
4,Hiang Ji Cantonese Roasts,1,Seriously overprice and rude service. Avoid at...,0.0,Seriously overprice and rude service . Avoid a...,"[[(Seriously, RB), (overprice, NN), (and, CC),..."
...,...,...,...,...,...,...
10112,Ichikokudo Hokkaido Ramen,4,Wasn’t crowded during the dinner period.\nDece...,,’ t crowded during the dinner period . Decent ...,"[[(’, JJ), (t, NN), (crowded, VBD), (during, I..."
10113,Kedai Makan Muhajirin,4,"Had the mee rebus, mee siam and nasi lemak wit...",,"Had the rebus , and nasi with , all in all an ...","[[(Had, VBD), (the, DT), (rebus, NN), (,, ,), ..."
10114,Shami Banana Leaf,5,"MY FAVOURITE INDIAN RESTAURANT.\nTheir soya, T...",,"MY RESTAURANT . Their soya , sambal & potato a...","[[(MY, PRP$), (RESTAURANT, NNP), (., .)], [(Th..."
10115,Rahim Muslim Food,4,The taste is unlike the usual Mee rebus you fi...,,The taste is unlike the usual rebus you find e...,"[[(The, DT), (taste, NN), (is, VBZ), (unlike, ..."


In [8]:
print(df_10k['content'][1002])
print(df_10k['content_clean'][1002])

Such luxuriously enjoyable Omakase 🤗
Rich sea urchin melts in our mouth 🤤
With friendly staff and chef, complementing with the sweet umeshu, making the meal even more mesmerising 🤠
Such luxuriously enjoyable hugging face Rich sea urchin melt in our mouth drooling face With friendly staff and chef , with the sweet , making the meal even more cowboy hat face


### Splitting Dataset into Labelled and UnLabelled 

In [19]:
df_eval = df_10k.loc[df_10k['label'].notna()]
df_train = df_10k.loc[df_10k['label'].isna()]

In [20]:
# Assigning Positive and Negative to Train data 
# <=2 is Negative, Label = 0
# >=3 is Positive, Label = 1

conditions = [
    (df_train['rating'] < 3),
    (df_train['rating'] >= 3)
    ]
Labels = [0, 1]
df_train['label'] = np.select(conditions,Labels)
print(df_train)

                 restaurant name  rating  \
1000            TungLok Teahouse       5   
1001              Malaysia Boleh       5   
1002               Douraku Sushi       5   
1003            511 Indian Store       5   
1004     The Teochew Kitchenette       4   
...                          ...     ...   
10112  Ichikokudo Hokkaido Ramen       4   
10113      Kedai Makan Muhajirin       4   
10114          Shami Banana Leaf       5   
10115          Rahim Muslim Food       4   
10116    The Teochew Kitchenette       5   

                                                 content  label  \
1000   Lisa is a very good host and made us feel very...      1   
1001   Very nice piping hot claypot with dark sauce a...      1   
1002   Such luxuriously enjoyable Omakase 🤗\nRich sea...      1   
1003                        Nice India ingredients foods      1   
1004   Ordered the Stir-Fry Kang Kong and Marmite Chi...      1   
...                                                  ...    ...   
10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['label'] = np.select(conditions,Labels)


In [21]:
df_train['label'].value_counts()
# if use <= 3 is Negative, ratio is 0.30 : 0.70
# if use <= 2 is Negative ratio is 0.18 : 0.82

1    7434
0    1683
Name: label, dtype: int64

In [22]:
df_eval['label'].value_counts()
# Ratio of Negative : Positive : Neutral is 0.707 : 0.233 : 0.06

1.0    707
0.0    233
2.0     60
Name: label, dtype: int64

In [23]:
eval_rating_3 = df_eval.loc[df_eval['rating'] == 3 ]
print(eval_rating_3['label'].value_counts())
# Ratio of Negative : Positive : Neutral is 0.33 : 0.46 : 0.21

1.0    56
0.0    41
2.0    26
Name: label, dtype: int64


### Subjectivity Detection Using Textblob

#### https://link.springer.com.remotexs.ntu.edu.sg/chapter/10.1007/978-981-15-1884-3_14

In [24]:
#!pip install TextBlob
from textblob import TextBlob
def getSubjectivity(text):
    #"Biased":0, "Neutral":1
    subj = TextBlob(text).sentiment.subjectivity
    # print(f"debug: subjectivity score is {subj}")
    return subj

#### Testing Textblob subjectivity detection with our manual labelled data

In [25]:
df_eval['predicted_subjectivity'] = df_eval['content_clean'].apply(getSubjectivity)
df_train['predicted_subjectivity'] = df_train['content_clean'].apply(getSubjectivity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eval['predicted_subjectivity'] = df_eval['content_clean'].apply(getSubjectivity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['predicted_subjectivity'] = df_train['content_clean'].apply(getSubjectivity)


In [26]:
objective = 0 # Number of Objective/Unbiased Review
neutral_count = 0
for i in range(len(df_eval)):
    
    if df_eval['predicted_subjectivity'][i] == 0:
        p = 1 # Neutral
    else:
        p = 0 # Biased
    
    if df_eval['label'][i] == 2: 
        label = 1 # Neutral
    else:
        label = 0 # Biased
    
    if label != p:
        # If there is a mismatch of objective class, add 1 count
        objective += 1
    #if label == 1 and p == 1:
        #neutral_count += 1

#print(neutral_count)  # Correctly identify 11 out of 60 Neutral labelled reviews...
# Percentage of Objective Review (non opinionated)
objective_score = objective/len(df_eval)
print('Subjectivity Detection Accuracy:', (1 - objective_score)*100)

Subjectivity Detection Accuracy: 93.6


In [27]:
print("Length of Original Labelled Dataset:", len(df_eval))
# Drop Predicted value = 0.0000 ... i.e. Neutral Review by Textblob Subjectivity
df_eval = df_eval[df_eval.predicted_subjectivity != 0]
print("Length of Labelled Dataset after Textblob Subjectivity:", len(df_eval), '\n')

print("Length of Original UnLabelled Dataset:", len(df_train))
# Drop Predicted value = 0.0000 ... i.e. Neutral Review by Textblob Subjectivity
df_train = df_train[df_train.predicted_subjectivity != 0]
print("Length of UnLabelled Dataset after Textblob Subjectivity:", len(df_train))

Length of Original Labelled Dataset: 1000
Length of Labelled Dataset after Textblob Subjectivity: 976 

Length of Original UnLabelled Dataset: 9117
Length of UnLabelled Dataset after Textblob Subjectivity: 8895


### Saving csv files for scrapped 10k data

keeping records if it is opinionated... <br> 
i.e predicted score for subjectivity > 0.000

In [28]:
df_eval.to_csv('labelled_10k.csv',index=False)
df_train.to_csv('unlabelled_10k.csv', index=False)

### Yelp dataset Cleaning

In [29]:
df_yelp = pd.read_csv('yelp_review_processed_3-5pos.csv')  
df_yelp.columns = ['content','label']
df_yelp

Unnamed: 0,content,label
0,Tried to give this place a second chance and w...,0
1,My Mom ordered penne pasta and received taglia...,0
2,The facility is clean and level however the st...,0
3,Absolutely do not bother There is a coffee bar...,0
4,Inga is the only competent employee here The e...,0
...,...,...
49995,Wow Talk about your dichotomy of the absolute...,1
49996,Awesome location right on the water Great beac...,1
49997,We had the Ropa Vieja and Pork Chop Chuletas F...,1
49998,I really enjoyed the place Its small but intim...,1


In [30]:
df_yelp['content'][0]

'Tried to give this place a second chance and will not be going back Used a Groupon for state inspection They told me I needed 5 bulbs replaced and front rotors replaced to pass inspection and it would be 400 Took my car back to Peruzzi where I purchased the vehicle to have them double check Peruzzi informed me only 2 bulbs needed to be replaced and that the rotors passed but need to be replaced soon Appears Meinke thinks women know nothing about cars and just want to lie and say things need to be replaced ASAP so you think you are forced to use them for repairs for your car will fail state inspection'

In [31]:
# Might take awhile to run if dataset is large as it iterates through every row

#nltk.download('words')
words = set(nltk.corpus.words.words())
#!pip install clean-text
from cleantext import clean

#nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

df_yelp['content_clean'] = ""
df_yelp['Tokenized'] = ""

for index, row in df_yelp.iterrows():
    # removes all emoji
    #row['content_clean'] = clean(row['content'],no_emoji=True)
    
    # Lemmatize the words in sentence
    tokenized_text = word_tokenize(row['content'])
    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokenized_text]

    """ 
    Removes non english words by:
    Joining English words w.lower() in words and joins with symbols/punctation --> w.alpha()
    Limitations :
    Removes some words:
    1. NER nouns (teriyaki chicken becomes chicken)
    2. Mispelled
    3. Split sort forms like can't , i've (i've become i' because ve is not a word)
    """ # Remove Non English word in nltk.corpus
    row['content_clean'] = " ".join(w for w in lemmatized_text if w.lower() in words or not w.isalpha())

    # Replacing emoji with words instead, done after because i want to retain the full text of emoji
    row['content_clean'] = kp_all_emoji_emoticons.replace_keywords(row['content_clean'])
    
    # Tokenize each sentence
    row['Tokenized'] = [ word_tokenize(t) for t in sent_tokenize(row['content_clean']) if t not in words ]
    
    # Pos tagging for each sentence in row['Tokenized']
    for i in range (len(row['Tokenized'])):
        # i refers to each of the tokenized sentence
        row['Tokenized'][i] = pos_tag(row['Tokenized'][i])
    
    df_yelp['content_clean'][index] = row['content_clean']
    df_yelp['Tokenized'][index] = row['Tokenized']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_yelp['content_clean'][index] = row['content_clean']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_yelp['Tokenized'][index] = row['Tokenized']


### 50k Yelp Textblob subjectivity classification

In [32]:
df_yelp['predicted_subjectivity'] = df_yelp['content_clean'].apply(getSubjectivity)

In [33]:
df_yelp

Unnamed: 0,content,label,content_clean,Tokenized,predicted_subjectivity
0,Tried to give this place a second chance and w...,0,Tried to give this place a second chance and w...,"[[(Tried, VBN), (to, TO), (give, VB), (this, D...",0.214286
1,My Mom ordered penne pasta and received taglia...,0,My ordered and received instead Delivery perso...,"[[(My, PRP$), (ordered, JJ), (and, CC), (recei...",0.688889
2,The facility is clean and level however the st...,0,The facility is clean and level however the st...,"[[(The, DT), (facility, NN), (is, VBZ), (clean...",0.433333
3,Absolutely do not bother There is a coffee bar...,0,Absolutely do not bother There is a coffee bar...,"[[(Absolutely, RB), (do, VBP), (not, RB), (bot...",0.476389
4,Inga is the only competent employee here The e...,0,is the only competent employee here The evenin...,"[[(is, VBZ), (the, DT), (only, JJ), (competent...",0.594728
...,...,...,...,...,...
49995,Wow Talk about your dichotomy of the absolute...,1,Wow Talk about your dichotomy of the absolutel...,"[[(Wow, NNP), (Talk, VBP), (about, IN), (your,...",0.566667
49996,Awesome location right on the water Great beac...,1,Awesome location right on the water Great beac...,"[[(Awesome, NNP), (location, NN), (right, RB),...",0.672619
49997,We had the Ropa Vieja and Pork Chop Chuletas F...,1,We had the and Pork Chop both were really real...,"[[(We, PRP), (had, VBD), (the, DT), (and, CC),...",0.550000
49998,I really enjoyed the place Its small but intim...,1,I really the place Its small but intimate Grea...,"[[(I, PRP), (really, RB), (the, DT), (place, N...",0.568519


In [34]:
original_length = len(df_yelp)
print("Length of Original 50k yelp Dataset:", original_length)

# Drop Predicted value = 0.0000 ... i.e. Neutral Review by Textblob Subjectivity
df_yelp = df_yelp[df_yelp.predicted_subjectivity != 0]
print("Length of UnLabelled Dataset after Textblob Subjectivity:", len(df_yelp))
print("Number of Non Subjective Review removed:" , original_length - len(df_yelp))


Length of Original 50k yelp Dataset: 50000
Length of UnLabelled Dataset after Textblob Subjectivity: 49639
Number of Non Subjective Review removed: 361


### Saving csv file for 50k Yelp data

In [35]:
df_yelp.to_csv('yelp_review_after_subjectivity_classification.csv',index=False)