In [1]:
import pandas as pd
import numpy as np
import unicodedata
import contractions
from transformers import BertTokenizer
import logging
import torch
import numpy as np
import warnings
from transformers import BertTokenizer, BertModel
import pickle as pkl
from sklearn.model_selection import train_test_split

In [2]:
def to_lower(data: pd.Series):
    return data.str.lower()

def remove_accented_characters(data: pd.Series):
    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))

def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"\d+;", " ", regex=True)

def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\|[^a-zA-Z0-9\s]", " ", regex=True)

def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)

def fix_contractions(data: pd.Series):
    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)

def remove_special_words(data: pd.Series):
    return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)

def get_train_test_split(model_data, x_columns, y_column, stratify_column):
    X_train, X_test, Y_train, Y_test = train_test_split(
        model_data[x_columns],
        model_data[y_column],
        train_size=0.8, 
        test_size=0.2, 
        random_state=42, 
        shuffle=True,
        stratify=model_data[stratify_column])
    
    print("Train: ",X_train.shape, Y_train.shape,
      "Test: ",(X_test.shape, Y_test.shape))
    print(type(X_train))
    return X_train, X_test, Y_train, Y_test

cls = "[CLS]"
sep = "[SEP]"
pad = "[PAD]"
space = " "
bert_pad_len = 512

In [3]:
data = pd.read_csv("text_data_compiled.csv")
data = data.drop(columns=['key', 'show', 'sarcasm_type'])
data.set_index('scene', inplace = True)
data

Unnamed: 0_level_0,context,target,speaker,sarcasm
scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_10004,A few months. How long have you been involved ...,"And of those few months, how long have you bee...",SHELDON,0.0
1_10009,Ah-da-da-da-da! What the hell?! Excuse me? Tha...,"Let the dead man talk. So, why do you think that?",PENNY,0.0
1_1001,It's smashed beyond repair. What are you gonna...,"What else? Sell it on eBay as ""slightly used.""",RAJ,0.0
1_1003,I'm gonna go back and try talking to her again...,"Good idea, sit with her. Hold her, comfort her...",HOWARD,1.0
1_10190,"Sure. What's up? Leonard, I could use your ass...","Well, now that I've given up string theory, I'...",SHELDON,0.0
...,...,...,...,...
3_S06E02_398,"I mean, he really, really likes Pied Piper. He...","Look, we cannot take blood money.",OTHER,0.0
3_S06E03_366,Right. Yeah. -we could just buy Hooli. -(laugh...,The-the same way we can buy America and everyt...,RICHARD,1.0
3_S06E05_355,"I was just curious to know, like, what's it li...","Well, maybe some time when you're working on s...",OTHER,1.0
3_S06E06_143,-Were you gonna tell me about this? -No. You g...,I thought that was the company policy-these days.,GILFOYLE,1.0


In [4]:
data["target_"] = cls + space + data["target"].astype(str) + space + sep
data["target_speaker"] = cls + space + data["target"].astype(str) + space + sep + space + data["speaker"].astype(str) + space + sep
data["target_context"] = cls + space + data["target"].astype(str) + space + sep + space + data["context"].astype(str) + space + sep
data["target_context_speaker"] = cls + space + data["target"].astype(str) + space + sep + space + data["context"].astype(str) + space + sep + space + data["speaker"].astype(str) + space + sep

In [5]:
X_train, X_test, Y_train, Y_test = get_train_test_split(data, ["target_", "target_speaker", "target_context", "target_context_speaker"], ["sarcasm"], "sarcasm")

Train:  (961, 4) (961, 1) Test:  ((241, 4), (241, 1))
<class 'pandas.core.frame.DataFrame'>


In [6]:
data_cleaning_pipeline = [
        to_lower,
        remove_special_words,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces]

inputs = ["target_", "target_speaker", "target_context", "target_context_speaker"]

def clean_data(data):
    data_copy = data.copy()
    for col in inputs:
        temp_data = data_copy[col].copy()
        for func in data_cleaning_pipeline:
            temp_data = func(temp_data)
        data_copy[col] = temp_data.copy()
    return data_copy

train_cleaned_data = clean_data(X_train)
test_cleaned_data = clean_data(X_test)

In [7]:
train_cleaned_data.head()

Unnamed: 0_level_0,target_,target_speaker,target_context,target_context_speaker
scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_S09E05_291,[cls] i have been told it is a good way to mo...,[cls] i have been told it is a good way to mo...,[cls] i have been told it is a good way to mo...,[cls] i have been told it is a good way to mo...
1_S12E07_179,"[cls] yeah, sure. you slept with your husband...","[cls] yeah, sure. you slept with your husband...","[cls] yeah, sure. you slept with your husband...","[cls] yeah, sure. you slept with your husband..."
2_210,[cls] when are you coming home? [sep],[cls] when are you coming home? [sep] person ...,[cls] when are you coming home? [sep] okay. a...,[cls] when are you coming home? [sep] okay. a...
1_S12E02_262,[cls] riveting. [sep],[cls] riveting. [sep] amy [sep],[cls] riveting. [sep] bingo. then i lifted th...,[cls] riveting. [sep] bingo. then i lifted th...
2_103,"[cls] no, this is just part of a daredevil ga...","[cls] no, this is just part of a daredevil ga...","[cls] no, this is just part of a daredevil ga...","[cls] no, this is just part of a daredevil ga..."


In [8]:
test_cleaned_data.head()

Unnamed: 0_level_0,target_,target_speaker,target_context,target_context_speaker
scene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2_388,"[cls] yeah, she could not live without the ch...","[cls] yeah, she could not live without the ch...","[cls] yeah, she could not live without the ch...","[cls] yeah, she could not live without the ch..."
1_5058,[cls] an entire dinner to talk about your res...,[cls] an entire dinner to talk about your res...,[cls] an entire dinner to talk about your res...,[cls] an entire dinner to talk about your res...
1_S11E21_080,[cls] is it your teen years? [sep],[cls] is it your teen years? [sep] howard [sep],"[cls] is it your teen years? [sep] no, there ...","[cls] is it your teen years? [sep] no, there ..."
1_S11E12_038,[cls] that is funny. i always thought howard ...,[cls] that is funny. i always thought howard ...,[cls] that is funny. i always thought howard ...,[cls] that is funny. i always thought howard ...
1_S11E01_337,"[cls] i am sorry, what? [sep]","[cls] i am sorry, what? [sep] penny [sep]","[cls] i am sorry, what? [sep] you could have ...","[cls] i am sorry, what? [sep] you could have ..."


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
def create_tensors_BERT(text):
    print("Tokenizing text...")
    logging.basicConfig(level = logging.INFO)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenized_text = [tokenizer.tokenize(x) for x in text]
    tokenized_text = [x + ([pad] * (bert_pad_len - len(x))) for x in tokenized_text]
    indexed_text = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
    segment_ids = [[1] * len(x) for x in tokenized_text]
    torch_idx_text = torch.LongTensor(indexed_text)
    torch_seg_ids = torch.LongTensor(segment_ids)
    return tokenized_text, torch_idx_text, torch_seg_ids 

def get_embeddings(torch_idx_text, torch_seg_ids):
    print("Getting Embeddings...")
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
    model.eval()

    torch_idx_text, torch_seg_ids = torch_idx_text.to("cpu"), torch_seg_ids.to("cpu")
    model.to(device)
    with torch.no_grad():
        bert_embeddings = []
        for i in range(len(torch_idx_text)):
            print(i, end = "\r")
            text_temp = torch.unsqueeze(torch_idx_text[i], dim = 0).to(device)
            sgmt_temp = torch.unsqueeze(torch_seg_ids[i], dim = 0).to(device)
            output = model(text_temp, sgmt_temp)
            bert_embeddings.append(output[0])
            del text_temp, sgmt_temp
    del model
  
    return bert_embeddings

In [11]:
def save_embeddings(embeddings_file_path, embeddings, tokenized_text):
    with open(embeddings_file_path, mode="wb") as file:
        pkl.dump({"embeddings": embeddings, "tokenized_txt": tokenized_text}, file, protocol=pkl.HIGHEST_PROTOCOL)
        
def create_embeddings(train_cleaned_data, test_cleaned_data, column):
    train_tokenized_text, train_torch_idx_text, train_torch_seg_ids = create_tensors_BERT(train_cleaned_data[column])
    test_tokenized_text, test_torch_idx_text, test_torch_seg_ids = create_tensors_BERT(test_cleaned_data[column])
    train_bert_embeddings = get_embeddings(train_torch_idx_text, train_torch_seg_ids)
    test_bert_embeddings = get_embeddings(test_torch_idx_text, test_torch_seg_ids)
    train_embeddings_file_path = "embeddings/train_bert_embeddings_" + column + ".pkl"
    test_embeddings_file_path = "embeddings/test_bert_embeddings_" + column + ".pkl"


    save_embeddings(train_embeddings_file_path, train_bert_embeddings, train_tokenized_text)
    save_embeddings(test_embeddings_file_path, test_bert_embeddings, test_tokenized_text)

    print(len(train_bert_embeddings), train_bert_embeddings[0].shape)
    print(len(test_bert_embeddings), test_bert_embeddings[0].shape)

    train_bert_embeddings = torch.cat(train_bert_embeddings)
    test_bert_embeddings = torch.cat(test_bert_embeddings)
    train_avg_embeddings = torch.sum(train_bert_embeddings, dim=1) / 512
    test_avg_embeddings = torch.sum(test_bert_embeddings, dim=1) / 512

    train_avg_embeddings_file_path = "embeddings/train_avg_embeddings_" + column + ".pkl"
    test_avg_embeddings_file_path = "embeddings/test_avg_embeddings_" + column + ".pkl"

    with open(train_avg_embeddings_file_path, mode="wb") as train_file, open(test_avg_embeddings_file_path, mode="wb") as test_file:
        pkl.dump(train_avg_embeddings.numpy(), train_file, protocol=pkl.HIGHEST_PROTOCOL)
        pkl.dump(test_avg_embeddings.numpy(), test_file, protocol=pkl.HIGHEST_PROTOCOL)
    
    print(train_avg_embeddings.shape)
    print(test_avg_embeddings.shape)

for col in inputs:
    create_embeddings(train_cleaned_data, test_cleaned_data, col)

Tokenizing text...
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


961 torch.Size([1, 512, 768])
241 torch.Size([1, 512, 768])
torch.Size([961, 768])
torch.Size([241, 768])
Tokenizing text...
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


961 torch.Size([1, 512, 768])
241 torch.Size([1, 512, 768])
torch.Size([961, 768])
torch.Size([241, 768])
Tokenizing text...
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


961 torch.Size([1, 512, 768])
241 torch.Size([1, 512, 768])
torch.Size([961, 768])
torch.Size([241, 768])
Tokenizing text...
Tokenizing text...
Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting Embeddings...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


961 torch.Size([1, 512, 768])
241 torch.Size([1, 512, 768])
torch.Size([961, 768])
torch.Size([241, 768])


In [12]:
Y_train.to_csv("embeddings/train_labels.csv", index = False)
Y_test.to_csv("embeddings/test_labels.csv", index = False)