In [1]:
import pandas as pd
import numpy as np
import unicodedata
import contractions
import transformers
from transformers import BertTokenizer
import logging
import torch
import numpy as np
import warnings
from transformers import BertTokenizer, BertModel
import pickle as pkl
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
def to_lower(data: pd.Series):
    return data.str.lower()

def remove_accented_characters(data: pd.Series):
    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))

def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"\d+;", " ", regex=True)

def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\|[^a-zA-Z0-9\s]", " ", regex=True)

def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)

def fix_contractions(data: pd.Series):
    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)

def remove_special_words(data: pd.Series):
    return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)

def get_train_test_split(model_data, x_columns, y_column, stratify_column):
    X_train, X_test, Y_train, Y_test = train_test_split(
        model_data[x_columns],
        model_data[y_column],
        train_size=0.8, 
        test_size=0.2, 
        random_state=42, 
        shuffle=True,
        stratify=model_data[stratify_column])
    
    print("Train: ",X_train.shape, Y_train.shape,
      "Test: ",(X_test.shape, Y_test.shape))
    print(type(X_train))
    return X_train, X_test, Y_train, Y_test

cls = "[CLS]"
sep = "[SEP]"
pad = "[PAD]"
space = " "

In [3]:
data = pd.read_csv("text_data_compiled.csv")
data = data.drop(columns=['key', 'show', 'sarcasm_type'])
data.set_index('scene', inplace = True)

le = preprocessing.LabelEncoder()
data['speaker'] = le.fit_transform(data['speaker'])
data

Unnamed: 0_level_0,context,target,speaker,sarcasm
scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_10004,A few months. How long have you been involved ...,"And of those few months, how long have you bee...",25,0.0
1_10009,Ah-da-da-da-da! What the hell?! Excuse me? Tha...,"Let the dead man talk. So, why do you think that?",15,0.0
1_1001,It's smashed beyond repair. What are you gonna...,"What else? Sell it on eBay as ""slightly used.""",21,0.0
1_1003,I'm gonna go back and try talking to her again...,"Good idea, sit with her. Hold her, comfort her...",7,1.0
1_10190,"Sure. What's up? Leonard, I could use your ass...","Well, now that I've given up string theory, I'...",25,0.0
...,...,...,...,...
3_S06E02_398,"I mean, he really, really likes Pied Piper. He...","Look, we cannot take blood money.",14,0.0
3_S06E03_366,Right. Yeah. -we could just buy Hooli. -(laugh...,The-the same way we can buy America and everyt...,22,1.0
3_S06E05_355,"I was just curious to know, like, what's it li...","Well, maybe some time when you're working on s...",14,1.0
3_S06E06_143,-Were you gonna tell me about this? -No. You g...,I thought that was the company policy-these days.,6,1.0


In [4]:
data_cleaning_pipeline = [
        to_lower,
        remove_special_words,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces]

inputs = ["target", "context"]

def clean_data(data):
    data_copy = data.copy()
    for col in inputs:
        temp_data = data_copy[col].copy()
        for func in data_cleaning_pipeline:
            temp_data = func(temp_data)
        data_copy[col] = temp_data.copy()
    return data_copy

cleaned_data = clean_data(data)

In [5]:
cleaned_data.head()

Unnamed: 0_level_0,context,target,speaker,sarcasm
scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_10004,a few months. how long have you been involved...,"and of those few months, how long have you be...",25,0.0
1_10009,ah-da-da-da-da! what the hell?! excuse me? th...,"let the dead man talk. so, why do you think t...",15,0.0
1_1001,it is smashed beyond repair. what are you goi...,"what else? sell it on ebay as ""slightly used.""",21,0.0
1_1003,i am going to go back and try talking to her ...,"good idea, sit with her. hold her, comfort he...",7,1.0
1_10190,"sure. what is up? leonard, i could use your a...","well, now that i have given up string theory,...",25,0.0


In [6]:
data["target_"] = cls + space + data["target"].astype(str) + space + sep
data["target_context"] = cls + space + data["target"].astype(str) + space + data["context"].astype(str) + space + sep
# data["target_context"] = cls + space + data["target"].astype(str) + space + sep + space + data["context"].astype(str) + space + sep
X_train, X_test, Y_train, Y_test = get_train_test_split(data, ["target_", "target_context", "speaker"], ["sarcasm"], "sarcasm")

Train:  (961, 3) (961, 1) Test:  ((241, 3), (241, 1))
<class 'pandas.core.frame.DataFrame'>


In [7]:
X_train.head(10)

Unnamed: 0_level_0,target_,target_context,speaker
scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_S09E05_291,[CLS] I've been told it's a good way to move o...,[CLS] I've been told it's a good way to move o...,25
1_S12E07_179,"[CLS] Yeah, sure. You slept with your husband....","[CLS] Yeah, sure. You slept with your husband....",1
2_210,[CLS] When are you coming home? [SEP],[CLS] When are you coming home? Okay. Alright....,16
1_S12E02_262,[CLS] Riveting. [SEP],[CLS] Riveting. Bingo. Then I lifted the cushi...,0
2_103,"[CLS] No, this is just part of a daredevil gam...","[CLS] No, this is just part of a daredevil gam...",2
2_267,[CLS] Really!? [SEP],[CLS] Really!? Pa-haa!! I would love to go wit...,20
2_447,[CLS] It was an accident. Not like I was acros...,[CLS] It was an accident. Not like I was acros...,2
1_S11E11_182,"[CLS] Oh, fun. Can I help? [SEP]","[CLS] Oh, fun. Can I help? of all the cool thi...",15
1_S10E12_115,[CLS] Cause at the end I assumed there'd be nu...,[CLS] Cause at the end I assumed there'd be nu...,14
1_S10E07_267,[CLS] I told you Penny was hiding his things?!...,[CLS] I told you Penny was hiding his things?!...,7


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
def create_tensors_BERT(column, text):
    bert_pad_len = 512
    print("Tokenizing text...")
    logging.basicConfig(level = logging.INFO)
    tokenizer = BertTokenizer.from_pretrained("bhadresh-savani/bert-base-uncased-emotion")
    tokenized_text = [tokenizer.tokenize(x) for x in text]
    tokenized_text = [x + ([pad] * (bert_pad_len - len(x))) for x in tokenized_text]
    indexed_text = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
    segment_ids = []
    if column == "target_bsfuhi": 
        for text in tokenized_text:
            septoken_index = [i for i, x in enumerate(text) if x == sep]
            septoken_index.sort()
            first_index = septoken_index[0]
            second_index = septoken_index[1]
            segment_ids_0 = [0] * (first_index + 1)
            segment_ids_1 = [1] * (second_index - first_index)
            segment_ids_pad = [0] * (len(text) - second_index - 1)
            segment_id = segment_ids_0 + segment_ids_1 + segment_ids_pad
            segment_ids.append(segment_id)
    else:
        for text in tokenized_text:
            septoken_index = [i for i, x in enumerate(text) if x == sep]
            septoken_index.sort()
            first_index = septoken_index[0]
            segment_ids_1 = [1] * (first_index + 1)
            segment_ids_pad = [0] * (len(text) - first_index - 1)
            segment_id = segment_ids_1 + segment_ids_pad
            segment_ids.append(segment_id)

    torch_idx_text = torch.LongTensor(indexed_text)
    torch_seg_ids = torch.LongTensor(segment_ids)
    return tokenized_text, torch_idx_text, torch_seg_ids 

def get_embeddings(torch_idx_text, torch_seg_ids):
    print("Getting Embeddings...")
    model = BertModel.from_pretrained('bhadresh-savani/bert-base-uncased-emotion', output_hidden_states = True)
    model.eval()

    torch_idx_text, torch_seg_ids = torch_idx_text.to("cpu"), torch_seg_ids.to("cpu")
    model.to(device)
    with torch.no_grad():
        bert_embeddings = []
        for i in range(len(torch_idx_text)):
            print(i, end = "\r")
            text_temp = torch.unsqueeze(torch_idx_text[i], dim = 0).to(device)
            sgmt_temp = torch.unsqueeze(torch_seg_ids[i], dim = 0).to(device)
            output = model(text_temp, sgmt_temp)
            bert_embeddings.append(output[2])
            del text_temp, sgmt_temp
    del model
  
    return bert_embeddings

In [10]:
def create_word_embeddings(bert_embeddings):
    final_embeds = []
    for embed in bert_embeddings:
        token_embeddings = torch.stack(embed, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
    
        token_vecs_sum = []
        for token in token_embeddings:
            sum_vec = torch.mean(token[-4:], dim=0)
            token_vecs_sum.append(sum_vec)
        final_embeds.append(token_vecs_sum)
    
    print(len(final_embeds))
    print('Shape is: %d x %d' % (len(final_embeds[0]), len(final_embeds[0][0])))
    return final_embeds
    
def save_embeddings(embeddings_file_path, embeddings, tokenized_text):
    with open(embeddings_file_path, mode="wb") as file:
        pkl.dump({"embeddings": embeddings, "tokenized_txt": tokenized_text}, file, protocol=pkl.HIGHEST_PROTOCOL)
        
def create_embeddings(train_cleaned_data, test_cleaned_data, column):
    train_tokenized_text, train_torch_idx_text, train_torch_seg_ids = create_tensors_BERT(column, train_cleaned_data[column])
    train_bert_embeddings = get_embeddings(train_torch_idx_text, train_torch_seg_ids)
    train_bert_embeddings = create_word_embeddings(train_bert_embeddings)
    test_tokenized_text, test_torch_idx_text, test_torch_seg_ids = create_tensors_BERT(column, test_cleaned_data[column])
    test_bert_embeddings = get_embeddings(test_torch_idx_text, test_torch_seg_ids)
    test_bert_embeddings = create_word_embeddings(test_bert_embeddings)
    
    train_embeddings_file_path = "bert_embeddings/train_bert_emo_embeddings_" + column + ".pkl"
    test_embeddings_file_path = "bert_embeddings/test_bert_emo_embeddings_" + column + ".pkl"

    save_embeddings(train_embeddings_file_path, train_bert_embeddings, train_tokenized_text)
    save_embeddings(test_embeddings_file_path, test_bert_embeddings, test_tokenized_text)


inputs = ["target_", "target_context"]
for col in inputs:
    create_embeddings(X_train, X_test, col)

Tokenizing text...


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

Getting Embeddings...


Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at bhadresh-savani/bert-base-uncased-emotion were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


961
Shape is: 512 x 768
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bhadresh-savani/bert-base-uncased-emotion were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


241
Shape is: 512 x 768
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bhadresh-savani/bert-base-uncased-emotion were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


961
Shape is: 512 x 768
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bhadresh-savani/bert-base-uncased-emotion were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


241
Shape is: 512 x 768


In [11]:
Y_train.to_csv("bert_embeddings/train_labels_bert_emo.csv", index = False)
Y_test.to_csv("bert_embeddings/test_labels_bert_emo.csv", index = False)

In [12]:
X_train.to_csv("bert_embeddings/train_data_bert_emo.csv", index = False)
X_test.to_csv("bert_embeddings/test_data_bert_emo.csv", index = False)