In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (at most once) so
# modules located one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import os
import re
import json
import pickle
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Capture the working directory at the time this cell runs; the data-loading
# functions further down join their relative data paths onto `cwd`.
cwd = os.getcwd()
print(cwd)

c:\Thesis\classifier\classifier\notebooks


### Count annotated aspects (aspect level)

In [7]:
# Character class used by check_emoji() to detect emoji-like symbols in a token.
# NOTE(review): several ranges below overlap or repeat, and two of them
# (U+24C2-U+1F251 and U+10000-U+10FFFF) are very wide, so this class matches
# far more than emoji (for example every BMP code point above U+24C2, which
# includes CJK text). Confirm that breadth is intended before reusing it.
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing through misc symbols/arrows (not Chinese characters)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # exact repeat of the previous range (no effect)
        u"\U000024C2-\U0001F251"  # very wide range, see note above
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every code point outside the BMP
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
        "]+", re.UNICODE)

def get_list_aspect(data_path="../../data/annotated/7/a2/al_annotator_2.json"):
  '''
    Count tweets, aspects and aspect polarities in an aspect-level annotation file.

    data_path: path of the annotated JSON file, joined onto the notebook-level
      `cwd`. Defaults to the annotator-2 file this notebook was written for.
      The file is read as a list of records; each record's "data" entry is the
      tweet text and its "label" entry is a list of [start, end, polarity]
      spans, where polarity is "pos", "neg" or "neu".

    Returns the tuple
      (tweet_count, pos_count, neg_count, neu_count, count_unique, count_aspects).
    Spans with any other polarity value still count as aspects but are not
    added to a polarity counter.
  '''
  # Context manager so the file handle is closed as soon as the JSON is parsed.
  with open(os.path.join(cwd, data_path), 'r', encoding="utf-8") as annotation_file:
    annotated_file = json.load(annotation_file)

  unique_aspects = set()  # set membership keeps the uniqueness check O(1)
  tweet_count = 0
  count_aspects = 0
  pos_count = 0
  neg_count = 0
  neu_count = 0

  for line in annotated_file:
    tweet = line.get("data") #string
    tweet_aspect_label = line.get("label") #list

    tweet_count += 1

    for aspect_label in tweet_aspect_label:
      start = aspect_label[0]
      end = aspect_label[1]
      label = aspect_label[2]

      # Shift the span left by the number of emoji-bearing tokens that occur
      # before it in the tweet (check_emoji reports 0 when there are none).
      count,has_emoji = check_emoji(tweet[0:start])
      if has_emoji:
        start -= count
        end -= count

      if label == "pos":
        pos_count += 1
      elif label == "neg":
        neg_count += 1
      elif label == "neu":
        neu_count += 1

      # Extend the annotated span when the tweet text continues with a plural
      # suffix. The checks run in sequence, each against the current `aspect`.
      aspect = tweet[start:end].strip(" ").lower()
      if (tweet[start:end+1].strip(" ").lower()) == aspect + "s":
        aspect = aspect + "s"
      if (tweet[start:end+2].strip(" ").lower()) == aspect + "es":
        aspect = aspect + "es"
      if (tweet[start:end+3].strip(" ").lower()) == aspect + "ren":
        aspect = aspect + "ren"

      unique_aspects.add(aspect)
      count_aspects += 1

  count_unique = len(unique_aspects)
  return tweet_count,pos_count,neg_count,neu_count,count_unique,count_aspects

def check_emoji(data):
  '''
    Count the whitespace-separated tokens of `data` that contain at least one
    character matched by the module-level `emoj` pattern.

    Returns (count, has_emoji): the number of such tokens and whether there
    was at least one.
  '''
  # A token contains a match exactly when stripping matches makes it shorter.
  count = sum(
    1
    for token in data.split()
    if len(emoj.sub('', token)) != len(token)
  )
  return count, count > 0

In [8]:
# Run the aspect-level count once; the report cell further down prints these values.
tweet_count,pos_asp_count,neg_asp_count,neu_asp_count,count_unique,count_aspects = get_list_aspect()

In [None]:
# with open('translated.txt', 'w', encoding='utf-8') as f:
#     for line in text_list:
#         f.write(line)
#         f.write('\n')

### Count annotated tweets (sentence level)

In [9]:
def get_sentence_level_list(data_path="../../data/annotated/7/a2/sl_annotator_2.json"):
  '''
    Count sentence-level polarity labels in an annotated JSON file.

    data_path: path of the annotated JSON file, joined onto the notebook-level
      `cwd`. Defaults to the annotator-2 file this notebook was written for.
      The file is read as a list of records; the first element of each
      record's "label" list is taken as that tweet's polarity.

    Returns (pos_count, neg_count, neu_count). Records whose first label is
    not "pos", "neg" or "neu" are not counted.
  '''
  # Context manager so the file handle is closed as soon as the JSON is parsed.
  with open(os.path.join(cwd, data_path), 'r', encoding="utf-8") as annotation_file:
    annotated_file = json.load(annotation_file)

  pos_count = 0
  neg_count = 0
  neu_count = 0

  for line in annotated_file:
    tweet_label = line.get("label")[0]

    if tweet_label == "pos":
      pos_count += 1
    elif tweet_label == "neg":
      neg_count += 1
    elif tweet_label == "neu":
      neu_count += 1

  return pos_count,neg_count,neu_count

In [10]:
# Run the sentence-level count once; the report cell below prints these values.
pos_sent_count,neg_sent_count,neu_sent_count = get_sentence_level_list()

### Data Exploration Report

In [11]:
# Exploration summary: a banner followed by one row per statistic computed by
# the aspect-level and sentence-level counting cells.
report_rule = "=============================================================="
report_rows = [
  ("Number of tweets             : {}", tweet_count),
  ("Number of aspects            : {}", count_aspects),
  ("Number of unique aspects     : {}", count_unique),
  ("Number of positive aspects   : {}", pos_asp_count),
  ("Number of negative aspects   : {}", neg_asp_count),
  ("Number of neutral aspects    : {}", neu_asp_count),
  ("Number of positive sentence  : {}", pos_sent_count),
  ("Number of negative sentence  : {}", neg_sent_count),
  ("Number of neutral sentence   : {}", neu_sent_count),
]

print(report_rule)
print("               ANNOTATED DATA EXPLORATION LOG                 ")
print(report_rule)
for report_template, report_value in report_rows:
  print(report_template.format(report_value))

               ANNOTATED DATA EXPLORATION LOG                 
Number of tweets             : 3163
Number of aspects            : 5000
Number of unique aspects     : 980
Number of positive aspects   : 635
Number of negative aspects   : 750
Number of neutral aspects    : 3615
Number of positive sentence  : 257
Number of negative sentence  : 496
Number of neutral sentence   : 2410


### Remove noise from annotated aspects (do not run)

In [None]:
def get_tweet_preprocessed_aspect(annotated_aspect_dict_list):
  '''
    Apply `preprocessing.noise_removal` to every annotated aspect, tweet by tweet.

    annotated_aspect_dict_list: one entry per tweet, each entry being a list of
      {aspect: tag} dictionaries; only the keys (aspects) are used.

    Returns a list with one inner list per tweet holding the cleaned aspects,
    in the original order.

    NOTE(review): `preprocessing` is not imported in any visible cell; it has
    to be in scope before this function is called.
  '''
  # Flatten each tweet's dictionaries down to their aspect keys.
  all_aspect_list = [
    [key for inner_dict in inner_list for key in inner_dict]
    for inner_list in annotated_aspect_dict_list
  ]

  noise_removed_aspect_list = []
  for aspect_list in tqdm(all_aspect_list):
    noise_removed_aspect_list.append(
      [preprocessing.noise_removal(aspect) for aspect in aspect_list]
    )

  return noise_removed_aspect_list

In [None]:
# NOTE(review): `text_list` and `annotated_aspect_dict_list` are not assigned in
# any visible cell, so this cell raises NameError on a fresh kernel unless they
# are created elsewhere — confirm where they come from.
print(text_list[1])
print(annotated_aspect_dict_list[1])

### Count number of aspects for each polarity

### Count all aspects (unique)

In [None]:
# Per-tweet aspect strings: one inner list per tweet, holding the keys of that
# tweet's {aspect: tag} dictionaries.
aspect_list = [
  [aspect for inner_dict in tweet_aspect_dicts for aspect in inner_dict]
  for tweet_aspect_dicts in annotated_aspect_dict_list
]

# Flat list of every annotated aspect string. The later cells (unique count,
# word cloud) use `annotated_aspect_list` as a flat list of strings, so it is
# built here explicitly rather than being left bound to whatever a loop
# variable last held.
annotated_aspect_list = [
  aspect for tweet_aspects in aspect_list for aspect in tweet_aspects
]

In [None]:
# Deduplicate the annotated aspects, keep them in sorted order, and report how
# many distinct ones there are.
unique_annotated_aspect_list = sorted(set(annotated_aspect_list))
print("There are {} aspects annotated".format(len(unique_annotated_aspect_list)))

In [None]:
# Get aspects based on number of words
def get_multi_word_aspect(unique_annotated_aspect_list):
  '''
    Split aspects into three groups by their whitespace-separated word count.

    Returns (single_word, mwa_two, mwa_greater_than_two,
             num_single_word, num_mwa_two, num_mwa_greater_than_two):
    the aspects with fewer than two words, exactly two words and more than two
    words (each in input order), followed by the size of each group.
  '''
  one_word, two_words, three_plus_words = [], [], []

  for phrase in unique_annotated_aspect_list:
    word_total = len(phrase.split())
    if word_total < 2:
      bucket = one_word
    elif word_total == 2:
      bucket = two_words
    else:
      bucket = three_plus_words
    bucket.append(phrase)

  return (
    one_word,
    two_words,
    three_plus_words,
    len(one_word),
    len(two_words),
    len(three_plus_words),
  )

# Group the unique aspects by word count; the next cell prints each group.
single_word,mwa_two,mwa_greater_than_two,num_single_word,num_mwa_two,num_mwa_greater_than_two = get_multi_word_aspect(unique_annotated_aspect_list)

In [None]:
# Print the unique annotated aspects grouped by word count, each group under
# its own heading (the two- and three-plus-word groups previously reused the
# "Single word" heading).
print("==================================================")
print("Number of single-word aspects: {}".format(num_single_word))
print("Single word annotated aspects:")
for aspect in single_word:
  print(aspect)

print("\n==================================================")

print("Number of double-word aspects: {}".format(num_mwa_two))
print("Double word annotated aspects:")
for aspect in mwa_two:
  print(aspect)

print("\n==================================================")

print("Number of greater than two-word aspects: {}".format(num_mwa_greater_than_two))
print("Greater than two-word annotated aspects:")
for aspect in mwa_greater_than_two:
  print(aspect)

### Plot Top Annotated Aspects

In [None]:
# Plot frequency of annotated aspects: horizontal bar chart of the 50 most
# common aspects.
# NOTE(review): `utils` and `annotated_aspect_dict_list` are not defined in any
# visible cell — confirm they are imported/created before this cell runs.
count = utils.get_aspects_frequency(annotated_aspect_dict_list)  # presumably a collections.Counter (it is used via .most_common) — confirm
sample_df = pd.DataFrame(count.most_common(50), columns=['Word', 'Frequency'])
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
# .head(50) is a no-op here: most_common(50) already caps the frame at 50 rows.
ax = sns.barplot(x="Frequency",y="Word", data=sample_df.head(50))

In [None]:
from wordcloud import WordCloud

def wc(data,bgcolor):
    """Draw a word cloud built from the strings in `data`.

    data: iterable of strings; they are joined with single spaces to form the
        text handed to WordCloud.
    bgcolor: background colour passed to WordCloud.

    The cloud is limited to 100 words with a maximum font size of 50 and is
    shown on a new 80x80 matplotlib figure with the axes hidden.
    """
    plt.figure(figsize = (80,80))
    cloud = WordCloud(background_color = bgcolor, max_words = 100,  max_font_size = 50)
    joined_text = ' '.join(data)
    cloud.generate(joined_text)
    plt.imshow(cloud)
    plt.axis('off')

In [None]:
title = "wordcloud"  # NOTE(review): not used by the call below
# wc() joins its first argument with spaces, so it needs an iterable of strings.
wc(annotated_aspect_list,bgcolor="black")