<a href="https://colab.research.google.com/github/SORYNICI/ML/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import codecs
import pprint
from sklearn.model_selection import train_test_split
import csv

from torch import equal
# You can import known libraries to write your own code
import sys
import io

# Base directory for this script's data: main() builds every input JSON path
# and every output CSV path by appending a file name to this string.
filePath = "C://Download/HW01/"

def load_json(file_name):
    """
    Load json file

    The file is read as UTF-8 text and parsed with json.load.
    :param file_name: file name
    :return: loaded data from the file
    """
    # The built-in open() decodes UTF-8 directly; codecs.open() is a legacy
    # API that is deprecated as of Python 3.14.
    with open(file_name, 'r', encoding='utf-8') as json_f:
        return json.load(json_f)


def extract_data(dataset):
    """
    Extract sentences and their labels from dataset

    For every record, each non-empty utterance stored under the keys
    HS01, HS02, HS03, SS01, SS02 and SS03 of data['talk']['content'] is
    collected (in that key order), and each one is paired with the emotion
    label of the record it came from.
    NOTE(review): the SS0x keys are presumably system responses rather than
    human utterances - confirm that they are meant to be included.

    - human_sentence: list of the collected utterances
    - emotion_label: list of the emotion labels (as strings), parallel to
      human_sentence
    The label is the second character of data['profile']['emotion']['type']
    read as a digit, minus one (e.g. 'E68' -> '5').
    The range of the emotion label is 0 to 5
    0: anger
    1: sadness
    2: anxiety
    3: hurt
    4: embarrassment
    5: joy
    :param dataset: loaded data from load_json function
    :return: human_sentences and their emotion labels
    """
    # A tuple, not a set: iterating a set of strings can yield a different
    # order on every interpreter run (hash randomisation), which would make
    # the output order - and any seeded split done on it - non-reproducible.
    dialog_keys = ('HS01', 'HS02', 'HS03', 'SS01', 'SS02', 'SS03')

    human_sentence = []
    emotion_label = []

    for data in dataset:
        content = data['talk']['content']
        utterances = [text for text in (content[key] for key in dialog_keys) if text]
        if not utterances:
            # Nothing to label; the emotion code is only read when needed.
            continue

        # e.g. 'E68' -> second character '6' -> label '5'
        label = str(int(data['profile']['emotion']['type'][1]) - 1)
        human_sentence.extend(utterances)
        emotion_label.extend([label] * len(utterances))

    return human_sentence, emotion_label


def save_csv(file_name, sentences, labels):
    """
    Save the sentences and labels to csv file
    Header is needed to identify the column

    The file is written as UTF-8 with a byte-order mark ('utf-8-sig'), with
    a "sentence,label" header row followed by one row per sentence.
    Sentence i is paired with labels[i], so labels must be indexable and at
    least as long as sentences (otherwise IndexError is raised part-way).

    :param file_name: output file name
    :param sentences: human sentences from extract_data function
    :param labels: emotion labels from extract_data function
    :return: None
    """
    # newline='' is what the csv module expects of its file object: the
    # writer emits its own line terminators and they must not be translated.
    with open(file_name, 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(["sentence", "label"])
        writer.writerows(
            [sentence, labels[i]] for i, sentence in enumerate(sentences)
        )


def main():
    """
    Entry of the program

    Loads the two JSON corpora found under filePath, extracts sentences and
    labels from both, splits the training portion into training and
    validation sets, and writes train.csv, valid.csv and test.csv under
    filePath.
    :return: None
    """
    training_data_json_filename = filePath + "감성대화말뭉치(최종데이터)_Training.json"
    # The corpus file named "Validation" is what this script uses as test data.
    test_data_json_filename = filePath + "감성대화말뭉치(최종데이터)_Validation.json"

    training_data_csv_filename = filePath + "train.csv"
    valid_data_csv_filename = filePath + "valid.csv"
    test_data_csv_filename = filePath + "test.csv"

    # Split parameters, passed to train_test_split below.
    # test_size: fraction of the training data held out for validation (20%).
    # random_state: seed for the shuffle, so the split is the same every run.
    # NOTE(review): these variables used to hold 0.1 / 108 but were never
    # passed to the split, which hard-coded 0.2 / 47. They now carry the
    # values the split actually used, so the output is unchanged - confirm
    # which pair is the intended one.
    test_size = 0.2
    random_state = 47

    # 1. Load json files
    training_data = load_json(training_data_json_filename)
    test_data = load_json(test_data_json_filename)
    pprint.pprint(training_data[0])
    print()

    # 2. Extract data
    training_sentences, training_labels = extract_data(training_data)
    test_sentences, test_labels = extract_data(test_data)

    # 3. Split training data to training and valid data
    # shuffle: shuffle the samples before splitting.
    # stratify: keep the label distribution the same in both parts, so no
    # label ends up concentrated on one side of the split.
    training_sentences, valid_sentences, training_labels, valid_labels = train_test_split(
        training_sentences,
        training_labels,
        test_size=test_size,
        shuffle=True,
        stratify=training_labels,
        random_state=random_state,
    )

    print("# training data: ", len(training_sentences))
    print("# validation data: ", len(valid_sentences))
    print("# test data: ", len(test_sentences))

    # 4. Save data to csv file
    save_csv(training_data_csv_filename, training_sentences, training_labels)
    save_csv(valid_data_csv_filename, valid_sentences, valid_labels)
    save_csv(test_data_csv_filename, test_sentences, test_labels)


# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()