### Imports

In [4]:
import os
import sys
# Make the parent of the initial working directory importable and switch the
# working directory to it.
# os.chdir("..") is relative, so re-running this cell used to climb one more
# directory level on every execution (and recompute module_path against the
# new location). The sentinel below resolves the root only once per kernel,
# which makes the cell idempotent.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath(os.path.join('..'))
    os.chdir(_PROJECT_ROOT)
module_path = _PROJECT_ROOT
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
import json
import pickle
import joblib
import numpy as np
import pandas as pd
from scripts.utils import *

### Functions

In [6]:
# Capture the current working directory once; get_train_data(), get_test_data()
# and the pickle-loading cells join their data paths onto this value.
cwd = os.getcwd()
print(cwd)

c:\Thesis\classifier


In [7]:
def get_train_data():
  '''
    Load the annotated training split together with its preprocessed texts.

    The annotated split is read through get_list_from_data_frame(); the
    preprocessed texts are unpickled from a file resolved against the
    module-level `cwd`.

    Data Structures
    ---------------
    Input:
      None
    Output (in this order; LIST types as originally documented, not
    verified here):
      original_train_text_list      : LIST
      preprocessed_train_text_list  : LIST
      train_aspect_dict_list        : LIST
      train_label_list              : LIST
  '''
  train_path = 'data/annotated/7/a2/train.pkl'
  (original_train_text_list, train_aspect_dict_list,
    train_label_list) = get_list_from_data_frame(train_path)

  preprocessed_train_list_path = 'data/annotated/7/preprocessed_train_list.pkl'
  # The context manager closes the file handle as soon as the data is read.
  # NOTE: unpickling can execute arbitrary code -- only load trusted files.
  with open(os.path.join(cwd, preprocessed_train_list_path), 'rb') as pickle_file:
    preprocessed_train_text_list = pickle.load(pickle_file)

  return (original_train_text_list, preprocessed_train_text_list,
          train_aspect_dict_list, train_label_list)

def get_test_data():
  '''
    Load the annotated test split together with its preprocessed texts.

    The annotated split is read through get_list_from_data_frame(); the
    preprocessed texts are unpickled from a file resolved against the
    module-level `cwd`.

    Data Structures
    ---------------
    Input:
      None
    Output (in this order; LIST types as originally documented, not
    verified here):
      original_test_text_list      : LIST
      preprocessed_test_text_list  : LIST
      test_aspect_dict_list        : LIST
      test_label_list              : LIST
  '''
  test_path = 'data/annotated/7/a2/test.pkl'
  (original_test_text_list, test_aspect_dict_list,
    test_label_list) = get_list_from_data_frame(test_path)

  preprocessed_test_list_path = 'data/annotated/7/preprocessed_test_list.pkl'
  # The context manager closes the file handle as soon as the data is read.
  # NOTE: unpickling can execute arbitrary code -- only load trusted files.
  with open(os.path.join(cwd, preprocessed_test_list_path), 'rb') as pickle_file:
    preprocessed_test_text_list = pickle.load(pickle_file)

  return (original_test_text_list, preprocessed_test_text_list,
          test_aspect_dict_list, test_label_list)

In [8]:
def get_number(annotation, mode):
  '''
    Count positive, negative and neutral labels in an annotation.

    mode == "al": annotation is an iterable of iterables of dicts; every
                  dict value equal to "positive" / "negative" / "neutral"
                  is counted.
    mode == "sl": annotation is an iterable of labels; entries equal to
                  "pos" / "neg" / "neu" are counted.
    Any other mode, and any label outside those vocabularies, contributes
    nothing to the totals.

    Returns the tuple (positive, negative, neutral).
  '''
  if mode == "al":
    wanted = ("positive", "negative", "neutral")
    # Flatten every dict value found in the nested annotation.
    labels = [entry[key]
              for group in annotation
              for entry in group
              for key in entry]
  elif mode == "sl":
    wanted = ("pos", "neg", "neu")
    labels = list(annotation)
  else:
    # Unrecognised mode: nothing is inspected.
    return 0, 0, 0

  positive, negative, neutral = (
    sum(1 for label in labels if label == name) for name in wanted)
  return positive, negative, neutral


def get_sentence_level_list(da):
  '''
    Count the sentence-level labels stored in the annotated JSON file.

    The file that is read is fixed (see data_path, joined onto the
    module-level `cwd`). The `da` argument is kept so existing calls keep
    working, but it is not used.

    Every record is read with .get("label") and only the first element of
    that value is inspected.

    Returns the tuple (pos_count, neg_count, neu_count): the number of
    records whose first label is "pos", "neg" and "neu" respectively.
    Records carrying any other label are not counted.
  '''
  # NOTE(review): this path climbs two levels above `cwd` -- confirm it
  # still matches the data layout.
  data_path = "../../data/annotated/7/a2/sl_annotator_2.json"
  # The context manager closes the file handle once the JSON is parsed.
  with open(os.path.join(cwd, data_path), 'r', encoding="utf-8") as json_file:
    annotated_file = json.load(json_file)

  pos_count = 0
  neg_count = 0
  neu_count = 0

  for line in annotated_file:
    tweet_label = line.get("label")[0]

    if tweet_label == "pos":
      pos_count += 1
    elif tweet_label == "neg":
      neg_count += 1
    elif tweet_label == "neu":
      neu_count += 1

  return pos_count, neg_count, neu_count

def get_data(data_frame):
  '''
    Pull the three annotation columns out of a data frame as plain lists.

    Returns the tuple (tweet_list, aspect_list, sentence_label_list) taken
    from the 'tweets', 'annotated_aspects' and 'sentence_label' columns.
  '''
  columns = ('tweets', 'annotated_aspects', 'sentence_label')
  tweet_list, aspect_list, sentence_label_list = (
    data_frame[column].tolist() for column in columns)
  return tweet_list, aspect_list, sentence_label_list

### Train Description

In [9]:
# Load the pickled training frame. The context manager closes the file handle
# once the data has been read.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(cwd, 'data/annotated/7/a2/train.pkl'), 'rb') as train_file:
    train_df = pickle.load(train_file)


In [10]:
# Fetch the four training collections returned by get_train_data().
(
    original_train_text_list,
    preprocessed_train_text_list,
    train_aspect_dict_list,
    train_label_list,
) = get_train_data()

In [11]:
# Derive the training-set figures reported in the exploration log.
train_aspect_list = get_all_aspects(train_aspect_dict_list)

tweet_count = len(original_train_text_list)
count_aspects = len(train_aspect_list)
# Number of distinct aspects.
count_unique = len(set(train_aspect_list))

# Aspect-level ("al") and sentence-level ("sl") sentiment tallies.
pos_asp_count, neg_asp_count, neu_asp_count = get_number(
    train_aspect_dict_list, mode="al")
pos_sent_count, neg_sent_count, neu_sent_count = get_number(
    train_label_list, mode="sl")

In [12]:
# Emit the training report as one block of text (one line per figure).
print("\n".join([
    "==============================================================",
    "                   TRAIN DATA EXPLORATION LOG                 ",
    "==============================================================",
    "Number of tweets             : {}".format(tweet_count),
    "Number of aspects            : {}".format(count_aspects),
    "Number of unique aspects     : {}".format(count_unique),
    "Number of positive aspects   : {}".format(pos_asp_count),
    "Number of negative aspects   : {}".format(neg_asp_count),
    "Number of neutral aspects    : {}".format(neu_asp_count),
    "Number of positive sentence  : {}".format(pos_sent_count),
    "Number of negative sentence  : {}".format(neg_sent_count),
    "Number of neutral sentence   : {}".format(neu_sent_count),
]))

                   TRAIN DATA EXPLORATION LOG                 
Number of tweets             : 2214
Number of aspects            : 3522
Number of unique aspects     : 802
Number of positive aspects   : 454
Number of negative aspects   : 519
Number of neutral aspects    : 2549
Number of positive sentence  : 182
Number of negative sentence  : 352
Number of neutral sentence   : 1680


### Test Description

In [13]:
# split dataset
from sklearn.model_selection import train_test_split

# Load the pickled test frame. The context manager closes the file handle
# once the data has been read.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(cwd, 'data/annotated/7/a2/test.pkl'), 'rb') as test_file:
    test_df = pickle.load(test_file)

# 80/20 split; the fixed random_state keeps the partition reproducible.
implement_df, validate_df = train_test_split(
    test_df, test_size=0.2, train_size=0.8, random_state=1)

In [None]:
# Persist both splits. The target paths are anchored on `cwd`, the same way
# train.pkl and test.pkl are read from this directory, so the files land in
# the same place even if the working directory changes between cells.
implement_df.to_pickle(os.path.join(cwd, 'data/annotated/7/a2/implement.pkl'))
validate_df.to_pickle(os.path.join(cwd, 'data/annotated/7/a2/validate.pkl'))

In [None]:
# Gather the validation-split figures, then print them as one report.
val_tweet_list, val_aspect_dict_list, val_sentence_label_list = get_data(validate_df)
val_aspect_list = get_all_aspects(val_aspect_dict_list)

val_tweet_count = len(val_tweet_list)
val_count_aspects = len(val_aspect_list)
# Number of distinct aspects.
val_count_unique = len(set(val_aspect_list))

# Aspect-level ("al") and sentence-level ("sl") sentiment tallies.
val_pos_asp_count, val_neg_asp_count, val_neu_asp_count = get_number(
    val_aspect_dict_list, mode="al")
val_pos_sent_count, val_neg_sent_count, val_neu_sent_count = get_number(
    val_sentence_label_list, mode="sl")

print("\n".join([
    "==============================================================",
    "                VALIDATE DATA EXPLORATION LOG                 ",
    "==============================================================",
    "Number of tweets             : {}".format(val_tweet_count),
    "Number of aspects            : {}".format(val_count_aspects),
    "Number of unique aspects     : {}".format(val_count_unique),
    "Number of positive aspects   : {}".format(val_pos_asp_count),
    "Number of negative aspects   : {}".format(val_neg_asp_count),
    "Number of neutral aspects    : {}".format(val_neu_asp_count),
    "Number of positive sentence  : {}".format(val_pos_sent_count),
    "Number of negative sentence  : {}".format(val_neg_sent_count),
    "Number of neutral sentence   : {}".format(val_neu_sent_count),
]))

In [None]:
# Gather the figures for the implement split (the 80% part of the test
# frame), then print them as one report.
imp_tweet_list, imp_aspect_dict_list, imp_sentence_label_list = get_data(implement_df)
imp_aspect_list = get_all_aspects(imp_aspect_dict_list)

imp_tweet_count = len(imp_tweet_list)
imp_count_aspects = len(imp_aspect_list)
# Number of distinct aspects.
imp_count_unique = len(set(imp_aspect_list))

# Aspect-level ("al") and sentence-level ("sl") sentiment tallies.
imp_pos_asp_count, imp_neg_asp_count, imp_neu_asp_count = get_number(
    imp_aspect_dict_list, mode="al")
imp_pos_sent_count, imp_neg_sent_count, imp_neu_sent_count = get_number(
    imp_sentence_label_list, mode="sl")

print("\n".join([
    "==============================================================",
    "                    TEST DATA EXPLORATION LOG                 ",
    "==============================================================",
    "Number of tweets             : {}".format(imp_tweet_count),
    "Number of aspects            : {}".format(imp_count_aspects),
    "Number of unique aspects     : {}".format(imp_count_unique),
    "Number of positive aspects   : {}".format(imp_pos_asp_count),
    "Number of negative aspects   : {}".format(imp_neg_asp_count),
    "Number of neutral aspects    : {}".format(imp_neu_asp_count),
    "Number of positive sentence  : {}".format(imp_pos_sent_count),
    "Number of negative sentence  : {}".format(imp_neg_sent_count),
    "Number of neutral sentence   : {}".format(imp_neu_sent_count),
]))