In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import contractions
import emoji
import seaborn as sns
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
nltk.download('punkt')
nltk.download('stopwords')
from spellchecker import SpellChecker
from nltk.corpus import wordnet
from textblob import TextBlob
from wordcloud import WordCloud
from collections import Counter


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\limju\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\limju\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Jacintha's pre processing
- Remove duplicates
- Remove empty rows
- Remove non english
- Convert Emojis to english

### Split into train and test


In [2]:
df = pd.read_csv('final_cleaned_reviews_J.csv') #this is entirety of reviews from jacintha, add idx to column names
df = df[['idx','title','stars','text']]

testdf = pd.read_csv('to_annotate.csv') #test reviews only

  df = pd.read_csv('final_cleaned_reviews.csv') #this is entirety of reviews from jacintha


In [3]:
testidx = testdf.idx.tolist()


In [4]:
traindf = df[~df['idx'].isin(testidx)] #train reviews only

### Further cleaning and preprocessing
- This is applied to both train and test sets. It is applied to train set now, and test set in the individual model notebooks

1. Remove emojis
2. Remove stopwords
3. Extra whitespace
4. lemmatize (with POS)
5. Lowercase
6. Change contractions
7. Remove punctuations and numbers

In [8]:
# Remove emojis 
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF" 
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def remove_stopwords(reviews):
    STOPWORDS = set(stopwords.words('english'))
    custom_stopwords = {'not', 'is', 'but'}
    STOPWORDS -= custom_stopwords
    if STOPWORDS is None:
        STOPWORDS = set(stopwords.words('english'))
    # Split the reviews into words and remove stopwords
    words = reviews.split()
    words_filtered = [word for word in words if word not in STOPWORDS]
    
    # Join the filtered words back into a string
    filtered_reviews = ' '.join(words_filtered)
    
    return filtered_reviews

def remove_extra_whitespace(reviews):
    return " ".join(reviews.split())

def get_wordnet_pos(text):
    # Map POS tag to first character lemmatize() accepts
    tags = nltk.pos_tag(text)
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tags = [tag_dict.get(tag[1][0],  wordnet.NOUN) for tag in tags]
    return tags

def lemmaSentence(reviews):
    lemmatizer = WordNetLemmatizer()
    lemma_text = ''
    tok_text = word_tokenize(reviews)
    tags = get_wordnet_pos(tok_text)
    for i in range(len(tok_text)):
        lemma_text = lemma_text + ' ' + lemmatizer.lemmatize(tok_text[i], tags[i])
    return lemma_text[1:]

def lower_case(review):
    return review.lower()

# change contraction words such sa I'm = I am, shouldn't = should not
def change_contractions(review):
    
    expanded_words = [contractions.fix(word) for word in review.split()]

    expanded_review = ' '.join(expanded_words)
    return expanded_review

# Remove Punctuations
def remove_punctuations(review):
    
    new_review = review.translate(str.maketrans('', '', string.punctuation))
    return new_review

# Remove numbers
def remove_numbers(review):
    
    mapping = str.maketrans('', '', string.digits)
    new_review = review.translate(mapping)
    
    return new_review


def clean_text(data):

    data = data.apply(lower_case)
    data = data.apply(change_contractions)
    data = data.apply(remove_emojis)
    data = data.apply(remove_punctuations)
    data = data.apply(remove_numbers)
    data = data.apply(remove_stopwords)
    data = data.apply(remove_extra_whitespace)
    data = data.apply(lemmaSentence)
    
    return data


In [7]:
traindf['cleaned_text'] = clean_text(traindf.text)
traindf = traindf.reset_index(drop=True)
#testdf['cleaned_text'] = clean_text(testdf.text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  traindf['cleaned_text'] = clean_text(traindf.text)


In [39]:
traindf.to_csv("train_data.csv", index=False)

### Preparing test data

In [95]:
test_df=pd.read_csv('test_data.csv')
test_df.head()

Unnamed: 0,text,Polarity_Anno1,Polarity_Anno2,Polarity_Anno3
0,Food is generally good. Serving size on the sm...,POSITIVE,POSITIVE,POSITIVE
1,My friend and I are post duty from work and de...,NEGATIVE,NEGATIVE,POSITIVE
2,Their food is tasty and affordable. We ordered...,POSITIVE,POSITIVE,POSITIVE
3,A Mixed Experience at Tapas Club: Delectable C...,POSITIVE,POSITIVE,POSITIVE
4,Average taste with affordable price. Overall g...,POSITIVE,POSITIVE,POSITIVE


In [96]:
a1 = test_df.Polarity_Anno1
a2 = test_df.Polarity_Anno2
a3 = test_df.Polarity_Anno3

In [97]:
from sklearn.metrics import cohen_kappa_score
(cohen_kappa_score(a1, a2) + cohen_kappa_score(a1, a3) +cohen_kappa_score(a2, a3))/3

0.8579901511389728