In [57]:
# import gensim.downloader as api
# from gensim.models import KeyedVectors

import torch
from torch.utils.data import Dataset
import re
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, classification_report

# fx. used to lowercase each sentence and strip it of punctuation
def cleanText(train_text):
  """Lowercase every string and remove its punctuation.

  Args:
    train_text: any iterable of strings. It does not need to support
      len() or integer indexing (lists, tuples and generators all work).

  Returns:
    A new list with one cleaned string per input item, in input order.
    Each string is lowercased, stripped of every character that is not a
    word character or whitespace, and trimmed of surrounding whitespace.
  """
  # Iterate over the values themselves instead of range(len(...)) so that
  # inputs without positional 0..n-1 indexing are handled as well.
  return [
      re.sub(r'[^\w\s]', '', sentence.lower()).strip()
      for sentence in train_text
  ]

# tokenize each sentence to words ["this", "is", "sample"]
def tokenize(text):
  """Split every sentence into its whitespace-separated words.

  Args:
    text: iterable of sentence strings.

  Returns:
    A list holding, for each sentence, the list of its words; an empty or
    whitespace-only sentence produces an empty word list.
  """
  return [line.split() for line in text]

# create a vocabulary set of word -> index to be used later
def buildVocab(sentence_list, dict):
  """Add every unseen word to a word -> index mapping, in place.

  Args:
    sentence_list: iterable of tokenized sentences (iterables of words).
    dict: mapping of word -> integer index that is extended in place.
      (The name shadows the builtin; it is kept so that existing callers
      passing it by keyword keep working.)

  Returns:
    The same mapping object that was passed in, after the new words have
    been added.
  """
  # New indices never go below 2, as before, but they now also continue
  # after the largest index already stored. Restarting at 2 on a mapping
  # that already holds indices >= 2 (e.g. a second call on the same vocab)
  # would hand out duplicate indices.
  next_index = max(2, max(dict.values(), default=-1) + 1)
  for sentence in sentence_list:
    for word in sentence:
      if word not in dict:
        dict[word] = next_index
        next_index += 1
  return dict

# this fx. converts each sentence in the data set to indices ["this", "is", "sample"] -> [2,4,5]
def numerize(data_set, vocab_set):
  """Replace every word of every sentence with its vocabulary index.

  Args:
    data_set: iterable of tokenized sentences (iterables of words).
    vocab_set: mapping of word -> index. Its "<UNK>" entry is looked up
      only when a word is missing from the mapping.

  Returns:
    A list of index lists, one per sentence, in input order.
  """
  return [
      [
          vocab_set[token] if token in vocab_set else vocab_set["<UNK>"]
          for token in tokens
      ]
      for tokens in data_set
  ]





def main():
  """Download the train/test/dev CSVs and turn their text into index lists.

  Steps: read the three CSV files, print the first rows of the training
  frame, pull the "text" column of each split, read the training label
  columns, clean and tokenize each split, build the vocabulary from the
  training tokens only, and map every split to vocabulary indices.
  Nothing is returned; the results stay local to this function.
  """
  # fetch the three splits (bad lines are skipped for test and dev only)
  train_frame = pd.read_csv('https://raw.githubusercontent.com/QUANHONGLE/CS421-emotion-prediction/main/P1_data/trac2_CONVT_train.csv')
  test_frame = pd.read_csv('https://raw.githubusercontent.com/QUANHONGLE/CS421-emotion-prediction/main/P1_data/trac2_CONVT_test.csv', on_bad_lines='skip')
  dev_frame = pd.read_csv('https://raw.githubusercontent.com/QUANHONGLE/CS421-emotion-prediction/main/P1_data/trac2_CONVT_dev.csv', on_bad_lines='skip')

  print(train_frame.head())

  # raw sentences of every split, as plain Python strings
  raw_train, raw_test, raw_dev = (
      frame["text"].astype(str).tolist()
      for frame in (train_frame, test_frame, dev_frame)
  )

  # training ids and labels (read here but not used further in this function)
  ids_train = train_frame["id"].astype(int).tolist()
  emotion_train = train_frame["Emotion"].astype(float).tolist()
  polarity_train = train_frame["EmotionalPolarity"].astype(int).tolist()
  empathy_train = train_frame["Empathy"].astype(float).tolist()

  # clean + tokenize each split
  tokens_train = tokenize(cleanText(raw_train))
  tokens_test = tokenize(cleanText(raw_test))
  tokens_dev = tokenize(cleanText(raw_dev))

  # the vocabulary is built from the training tokens only; words that occur
  # only in test/dev are mapped to <UNK> by numerize
  vocab = {"<PAD>" : 0 , "<UNK>": 1}
  buildVocab(tokens_train, vocab)

  indexed_train = numerize(tokens_train, vocab)
  indexed_test = numerize(tokens_test, vocab)
  indexed_dev = numerize(tokens_dev, vocab)

# Run the preprocessing pipeline as soon as this code is executed.
main()

   id  article_id  conversation_id  turn_id   speaker  \
0   0          35                1        0  Person 1   
1   1          35                1        1  Person 2   
2   2          35                1        2  Person 1   
3   3          35                1        3  Person 2   
4   4          35                1        4  Person 1   

                                                text person_id_1 person_id_2  \
0              what did you think about this article        p019        p012   
1  It's definitely really sad to read, considerin...        p019        p012   
2  I think it's super sad... they seem to never c...        p019        p012   
3  I can't imagine just living in an area that is...        p019        p012   
4  Me too.. I also can't imagine living in the po...        p019        p012   

   Emotion  EmotionalPolarity  Empathy  SelfDisclosure  
0        1                  1        1          1.0000  
1        3                  2        4          2.0000  
2    