In [None]:
# Programmer Name: Ms Lee Wen Xi, APD3F2211CS(IS)
# Program Name: generate_test_data.ipynb
# Description: Randomly samples 50 rows from the full dataset, cleans the text, and exports them as demo test data
# First Written On: 26/05/2023
# Last Edited On:  22/07/2023

In [7]:
import pandas as pd
from nltk.tokenize import word_tokenize
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import os
import random
import string

In [10]:
# Preprocessing configuration
nltk_stop_words = set(stopwords.words('english'))
# ASCII punctuation plus the curly single quotes, which string.punctuation lacks.
punc = list(string.punctuation) + list('‘’')
train_header = 'clean_text'
target = 'category'

# Lexicon of negation cues. References: "Negation Scope Detection for Twitter
# Sentiment Analysis" + manual filter from the nltk stop words.
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# silently concatenated by Python, which previously merged 'darent'+'hardly'
# and 'werent'+'against' into two bogus entries, so 'against' was wrongly
# treated as a removable stop word.
negations = [
    'aint', 'doesnt', 'havent', 'lacks', 'none', 'mightnt', 'shouldnt',
    'cannot', 'dont', 'havnt', 'neither', 'nor', 'mustnt', 'wasnt', 'cant',
    'hadnt', 'isnt', 'never', 'not', 'neednt', 'without', 'darent', 'hardly',
    'lack', 'no', 'nothing', 'oughtnt', 'wouldnt', 'didnt', 'hasnt',
    'lacking', 'nobody', 'nowhere', 'shant', 'ain', 'doesn', 'haven',
    'mightn', 'shouldn', 'havn', 'mustn', 'wasn', 'hadn', 'isn', 'needn',
    'oughtn', 'wouldn', 'didn', 'hasn', 'shan', 'couldn', 'won', 'don',
    'aren', 'arent', 'weren', 'werent', 'against',
]
# Matches contracted negations that still carry their apostrophe, e.g. "don't".
rneg = r"[A-Za-z]{1,}n't\b"

# The stop-word list is the nltk list minus anything that signals negation,
# so that negation cues stay in the text.
stop_words = [
    w for w in nltk_stop_words
    if w not in negations and not re.match(rneg, w)
]
  
def remove_emojis(data):
    """Return ``data`` with emoji and other pictographic characters removed.

    Every run of characters matched by the class below is deleted (replaced
    by the empty string), so text on either side of an emoji is left as-is,
    including any surrounding spaces.

    Be aware that the class is deliberately broad: the ranges
    U+24C2-U+1F251 and U+10000-U+10FFFF also cover CJK, Hangul and every
    supplementary-plane character, so those are stripped as well.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without the matched characters.
    """
    emoj = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
        "\U00002702-\U000027B0"  # dingbats (was listed twice; once is enough)
        "\U000024C2-\U0001F251"  # circled M .. U+1F251 (very wide, incl. CJK)
        "\U0001f926-\U0001f937"  # person gestures (facepalm .. shrug)
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"          # female / male signs
        "\u2600-\u2B55"          # misc symbols .. heavy large circle
        "\u200d"                 # zero-width joiner
        "\u23cf"                 # eject symbol
        "\u23e9"                 # fast-forward symbol
        "\u231a"                 # watch
        "\ufe0f"                 # variation selector-16
        "\u3030"                 # wavy dash
        "]+",
        re.UNICODE,
    )
    return emoj.sub('', data)

def remove_tags(string):
    """Strip Twitter artefacts from a tweet and lower-case what is left.

    Steps, in order: remove ``@`` mentions, remove ``#`` hashtags, remove
    http/https URLs, remove emojis (via ``remove_emojis``), then lower-case.

    Parameters
    ----------
    string : str
        Raw tweet text. The parameter name shadows the stdlib ``string``
        module inside this function only; it is kept so existing callers
        are unaffected.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Handles and hashtags may contain underscores. Without '_' in the class
    # the pattern backtracked on e.g. "@user_name" and removed only the "@".
    # The trailing (?:\s|\b) also swallows one whitespace character after the
    # tag, so no double space is left behind.
    result = re.sub(r'@[A-Za-z0-9_]*(?:\s|\b)', '', string)   # remove @ tags
    result = re.sub(r'#[A-Za-z0-9_]*(?:\s|\b)', '', result)   # remove # tags
    # Same URL character set as before, written without redundant escapes.
    result = re.sub(r'\bhttps?://[-a-zA-Z0-9@:%._\\+~#?&/=]*', '', result)   # remove URLs
    result = remove_emojis(result)    # remove emojis
    return result.lower()

In [9]:
# Twitter_Data records
# Build the relative path with os.path.join so it resolves on any OS, not
# only on Windows (the original hard-coded backslash separators).
data_path = os.path.join('Datasets', 'Twitter Sentiment Dataset', 'Twitter_Data.csv')
sample_size = 50

data = pd.read_csv(data_path).dropna()
# Draw the demo rows directly instead of shuffling the whole frame and taking
# the head. min() keeps the old behaviour of returning every row when the file
# has fewer than `sample_size` rows. The sample differs on every run; pass
# random_state=<int> to sample() if a reproducible file is needed.
data = data.sample(n=min(sample_size, len(data)))

# Set lookups are O(1) per token; the module-level lists are left untouched.
punc_lookup = set(punc)
stop_lookup = set(stop_words)


def _drop_tokens(text, excluded):
    """Tokenise ``text`` and re-join the tokens that are not in ``excluded``."""
    return ' '.join(tok for tok in word_tokenize(text) if tok not in excluded)


text = data[train_header].apply(remove_tags)
text = text.apply(lambda x: x.replace('"', ''))
# Two separate tokenise/filter passes, in the same order as before:
# punctuation tokens first, then stop words.
text = text.apply(lambda x: _drop_tokens(x, punc_lookup))
text = text.apply(lambda x: _drop_tokens(x, stop_lookup))
# Raw string: '\W' in a normal string literal is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). .strip() here already trims both ends, so
# the extra .str.strip() pass was redundant.
text = text.apply(lambda x: re.sub(r'\W+', ' ', x).strip())
data[train_header] = text

# Map the numeric sentiment codes to labels in one pass. Assigning strings
# into the float column through .loc relies on silent upcasting, which pandas
# deprecated in 2.1. Values outside the mapping are left unchanged, exactly as
# the three .loc assignments did.
sentiment_labels = {1.0: 'Positive', 0.0: 'Neutral', -1.0: 'Negative'}
data[target] = data[target].map(lambda v: sentiment_labels.get(v, v))

# Keep only the two exported columns and give them the demo headers.
data = data.loc[:, [train_header, target]].rename(columns={
    train_header: 'What is your opinion about the election?',
    target: 'What is your opinion about the election?_Sentiment',
})

data.to_csv('Demo_Test_Data.csv', index=False)