In [1]:
import os
import json
import pickle
import joblib
import numpy as np
import pandas as pd

# Working directory of the kernel, captured once when this cell runs. The
# loader functions below join their relative paths onto this value, so they
# keep resolving against this directory even after the os.chdir() call made
# further down changes the process working directory.
cwd = os.getcwd()

In [2]:
def get_list_from_data_frame(data_path):
  '''
    Loads a pickled data frame and returns three of its columns as lists.

    Data Structures
    ---------------
    Input:
      data_path   : STRING, joined onto the module-level `cwd`
    Returns:
      tweet_list  : LIST, values of the 'tweets' column
      aspect_list : LIST, values of the 'annotated_aspects' column
      label       : LIST, values of the 'sentence_label' column
  '''
  # SECURITY: unpickling can execute arbitrary code, so only load files that
  # were produced by a trusted source.
  # The context manager closes the file handle even when unpickling fails.
  with open(os.path.join(cwd, data_path), 'rb') as pickle_file:
    data_frame = pickle.load(pickle_file)

  tweet_list = data_frame['tweets'].tolist()
  aspect_list = data_frame['annotated_aspects'].tolist()
  label = data_frame['sentence_label'].tolist()
  return tweet_list,aspect_list,label



In [3]:
def get_sentence_level_list(path):
  '''
    Takes the path of the sentence level annotated json file and
    returns the texts and the first annotated label of every entry

    Data Structures
    ---------------
    Input:
      path        : STRING, joined onto the module-level `cwd`
    Returns:
      tweet_list  : LIST, the "data" value of every entry
      label_list  : LIST, the first element of every entry's "label" list
  '''
  # The context manager closes the file handle once the json is parsed.
  with open(os.path.join(cwd, path), 'r', encoding="utf-8") as json_file:
    annotated_file = json.load(json_file)

  tweet_list = []
  label_list = []

  for line in annotated_file:
    tweet = line.get("data") #string
    # Only the first label is kept; an entry with an empty or missing
    # "label" list raises here instead of being skipped silently.
    tweet_label = line.get("label")[0]

    tweet_list.append(tweet)
    label_list.append(tweet_label)

  return tweet_list,label_list

In [4]:

import re

def get_list_aspect(data_path):
  '''
    Takes the path of aspect level annotated json file and 
    returns the texts as list and the aspects with annotated labels

    Every entry of the json file is read through its "data" (text) and
    "label" (list of [start, end, tag] spans) keys. For each span the
    aspect text is cut out of the tweet, lower-cased and paired with its
    label in a single-item dictionary.

    Data Structures
    ---------------
    Input:
      data_path   : STRING, joined onto the module-level `cwd`
    Returns:
      tweet_list  : LIST of the tweet texts
      aspect_list : LIST with, per tweet, a LIST of {aspect : label} dicts
  '''
  # Short annotation tags and their long names; any other tag is kept as is.
  label_names = {"pos": "positive", "neg": "negative", "neu": "neutral"}
  # Suffixes are tried in this order. A match extends the aspect before the
  # next suffix is tried, exactly like the original chained checks.
  suffixes = ("s", "es", "ren")

  # The context manager closes the file handle once the json is parsed.
  with open(os.path.join(cwd, data_path), 'r', encoding="utf-8") as json_file:
    annotated_file = json.load(json_file)

  tweet_list = []
  aspect_list = []

  for line in annotated_file:
    tweet = line.get("data") #string
    tweet_aspect_label = line.get("label") #list

    aspect_label_inner_list = []

    for aspect_label in tweet_aspect_label:
      # Both offsets are shifted left by the number of emoji-bearing tokens
      # that precede the span; a count of 0 leaves them untouched.
      # NOTE(review): presumably this compensates for how the annotation
      # tool counts emoji. check_emoji counts tokens, not emoji characters,
      # so confirm the shift for tokens holding several emoji.
      count, _ = check_emoji(tweet[0:aspect_label[0]])
      start = aspect_label[0] - count
      end = aspect_label[1] - count

      label = label_names.get(aspect_label[2], aspect_label[2])

      aspect = tweet[start:end].strip(" ").lower()
      for suffix in suffixes:
        # Widen the span by the suffix length and accept the suffix only
        # when the tweet really continues with it.
        widened = tweet[start:end + len(suffix)].strip(" ").lower()
        if widened == aspect + suffix:
          aspect = aspect + suffix

      aspect_label_inner_list.append({aspect : label})

    aspect_list.append(aspect_label_inner_list)
    tweet_list.append(tweet)

  return tweet_list,aspect_list

# Character class used by check_emoji to detect emoji-like characters; one or
# more consecutive matching characters form a single match.
# NOTE(review): some ranges are far wider than emoji. U+24C2-U+1F251 spans
# most of the Basic Multilingual Plane above U+24C2 (CJK and Hangul included)
# and U+10000-U+10FFFF covers every supplementary-plane character, so
# non-emoji text in those ranges matches as well. Confirm this is intended.
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # duplicate of the previous range
        u"\U000024C2-\U0001F251"  # very wide range, see note above
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # all supplementary planes, see note above
        u"\u2640-\u2642"  # female sign through male sign
        u"\u2600-\u2B55"
        u"\u200d"  # zero width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
        "]+", re.UNICODE)

def check_emoji(data):
  '''
    Counts the whitespace separated tokens of `data` that contain at least
    one character matched by the module-level `emoj` pattern.

    Data Structures
    ---------------
    Input:
      data      : STRING
    Returns:
      count     : INT, number of tokens that lose characters when the
                  `emoj` matches are removed
      has_emoji : BOOL, True when count is greater than zero
  '''
  # str.split() never yields empty tokens, so a token that becomes empty
  # after stripping has necessarily changed length as well.
  count = sum(
    1
    for token in data.split()
    if len(emoj.sub('', token)) != len(token)
  )
  return count,count > 0

def remove_excess_space(data):
  '''
    Converts `data` to a string, replaces every literal "//t" sequence by
    a tab character, collapses runs of repeated spaces, newlines, carriage
    returns and tabs into a single occurrence and strips the result.

    Data Structures
    ---------------
    Input:
      data : any object, converted with str()
    Returns:
      STRING
  '''
  collapsed = re.sub(r"//t",r"\t", str(data))
  # Each pattern targets one whitespace character repeated back to back.
  for repeated in (r"( )\1+", r"(\n)\1+", r"(\r)\1+", r"(\t)\1+"):
    collapsed = re.sub(repeated, r"\1", collapsed)
  return collapsed.strip()


In [26]:
def get_al_annotation(al_file_path):
  '''
    Reads an aspect level annotation file through get_list_aspect and
    returns its (texts, aspect dictionaries) pair.
  '''
  tweets, aspect_dicts = get_list_aspect(al_file_path)
  return tweets,aspect_dicts

def get_sl_annotation(sl_file_path):
  '''
    Reads a sentence level annotation file through get_sentence_level_list
    and returns its (texts, labels) pair.
  '''
  tweets, labels = get_sentence_level_list(sl_file_path)
  return tweets,labels

In [27]:
# Sentence level annotations, one file per annotator.
sl_text_list_1, sl_label_1 = get_sl_annotation(
  "../../data/annotated/7/a1/sl_annotator_1.json"
)
sl_text_list_2, sl_label_2 = get_sl_annotation(
  "../../data/annotated/7/a2/sl_annotator_2.json"
)
sl_text_list_3, sl_label_3 = get_sl_annotation(
  "../../data/annotated/7/a3/sl_annotator_3.json"
)

In [28]:
# Aspect level annotations, one file per annotator.
al_text_list_1, al_aspect_dict_list_1 = get_al_annotation(
  "../../data/annotated/7/a1/al_annotator_1.json"
)
al_text_list_2, al_aspect_dict_list_2 = get_al_annotation(
  "../../data/annotated/7/a2/al_annotator_2.json"
)
# NOTE(review): the third file is read from "7/al.json" rather than from an
# "a3" folder like the sentence level file of annotator 3. Confirm the path.
al_text_list_3, al_aspect_dict_list_3 = get_al_annotation(
  "../../data/annotated/7/al.json"
)


In [29]:
# Empty frame; the tweet and annotation columns are attached in the next cell.
data_frame = pd.DataFrame()

In [30]:
# Attach the tweets and every annotator's aspect and sentence level
# annotations as columns, in this order.
# NOTE(review): the tweet texts come from the third aspect level file only;
# this assumes all six annotation files list the same tweets in the same
# order. Confirm, or compare the text lists before combining them.
for _column, _values in (
  ('tweets', al_text_list_3),
  ('annotated_aspects_1', al_aspect_dict_list_1),
  ('annotated_aspects_2', al_aspect_dict_list_2),
  ('annotated_aspects_3', al_aspect_dict_list_3),
  ('sentence_label_1', sl_label_1),
  ('sentence_label_2', sl_label_2),
  ('sentence_label_3', sl_label_3),
):
  data_frame[_column] = _values

In [None]:
# Persist the combined data set next to the annotation files it was built
# from. Every other read and write of the version 7 data made before the
# working directory changes uses the '../../data' prefix, so the same prefix
# is used here rather than '../data', which points at a different directory.
data_frame.to_pickle('../../data/annotated/7/dataset.pkl')

In [31]:
# split dataset
from sklearn.model_selection import train_test_split

# 70/30 split of the rows; the fixed random_state makes the split repeatable.
train, test = train_test_split(
  data_frame,
  test_size=0.3,
  train_size=0.7,
  random_state=1,
)

In [12]:
def _annotator_view(frame, annotator):
  '''
    Returns the 'tweets' column of `frame` together with one annotator's
    aspect and sentence label columns, renamed to the annotator independent
    names 'annotated_aspects' and 'sentence_label'.

    Data Structures
    ---------------
    Input:
      frame     : DATAFRAME holding the 'tweets', 'annotated_aspects_<n>'
                  and 'sentence_label_<n>' columns
      annotator : INT, the annotator number <n>
    Returns:
      DATAFRAME with the columns 'tweets', 'annotated_aspects' and
      'sentence_label'
  '''
  aspects_column = "annotated_aspects_{}".format(annotator)
  label_column = "sentence_label_{}".format(annotator)
  selected = frame[['tweets', aspects_column, label_column]]
  return selected.rename(columns={aspects_column: "annotated_aspects", label_column: "sentence_label"})

# One train and one test frame per annotator, all sharing the same column
# names so that they can be consumed by the same loading code.
train_1 = _annotator_view(train, 1)
train_2 = _annotator_view(train, 2)
train_3 = _annotator_view(train, 3)

test_1 = _annotator_view(test, 1)
test_2 = _annotator_view(test, 2)
test_3 = _annotator_view(test, 3)

In [None]:
# Write every annotator's training split into that annotator's folder.
for _split, _target in (
  (train_1, '../../data/annotated/7/a1/train.pkl'),
  (train_2, '../../data/annotated/7/a2/train.pkl'),
  (train_3, '../../data/annotated/7/a3/train.pkl'),
):
  _split.to_pickle(_target)

In [None]:
# Write every annotator's test split into that annotator's folder.
for _split, _target in (
  (test_1, '../../data/annotated/7/a1/test.pkl'),
  (test_2, '../../data/annotated/7/a2/test.pkl'),
  (test_3, '../../data/annotated/7/a3/test.pkl'),
):
  _split.to_pickle(_target)

In [None]:
# Switch to the parent of the directory captured in `cwd`, presumably so that
# the project's `scripts` package becomes importable. Anchoring the target on
# `cwd` instead of a bare ".." keeps the cell idempotent: re-running it no
# longer climbs one directory further each time.
os.chdir(os.path.join(cwd, os.pardir))
from scripts import preprocessing

In [None]:
# Load a pickled training split, run the project's preprocessing over its
# tweets and store the preprocessed list with joblib.
# NOTE(review): the split is read from 'annotated/6' while the result is
# written to 'annotated/7', and this notebook stores its own splits in the
# a1/a2/a3 folders. Confirm which training file is meant to be read here.
# NOTE: get_list_from_data_frame resolves its path against `cwd` (captured
# before the os.chdir call), whereas joblib.dump resolves its relative path
# against the current process directory.
train_text_list, train_aspect_dict_list,train_label = get_list_from_data_frame('../data/annotated/6/train.pkl')
train_text_list = preprocessing.preprocess(train_text_list)
joblib.dump(train_text_list, 'data/annotated/7/preprocessed_train_list.pkl')

In [None]:
# Load a pickled test split, run the project's preprocessing over its tweets
# and store the preprocessed list with joblib.
# NOTE(review): the split is read from 'annotated/6' while the result is
# written to 'annotated/7', and this notebook stores its own splits in the
# a1/a2/a3 folders. Confirm which test file is meant to be read here.
# NOTE: get_list_from_data_frame resolves its path against `cwd` (captured
# before the os.chdir call), whereas joblib.dump resolves its relative path
# against the current process directory.
test_text_list, test_aspect_dict_list,test_label = get_list_from_data_frame('../data/annotated/6/test.pkl')
test_text_list = preprocessing.preprocess(test_text_list)
joblib.dump(test_text_list, 'data/annotated/7/preprocessed_test_list.pkl')