In [1]:
!pip install transformers 

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import pandas as pd
import pickle
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [2]:
# get data
url = "https://raw.githubusercontent.com/soujanyaporia/MUStARD/master/data/sarcasm_data.json"
df = pd.read_json(url,orient='index')

In [6]:
# first time: feature extraction
# https://towardsdatascience.com/feature-extraction-with-bert-for-text-classification-533dde44dc2f

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

tokenized_utterance = tokenizer(df["utterance"].values.tolist(), padding = True, truncation = True, return_tensors="pt")
tokenized_utterance = {k:torch.tensor(v).to(device) for k,v in tokenized_utterance.items()}
with torch.no_grad():
  hidden_utterance = model(**tokenized_utterance)
cls_utterance = hidden_utterance.last_hidden_state[:,0,:]
print("fin utterance")
file = open('drive/MyDrive/bert_utterance', 'wb')
pickle.dump(cls_utterance, file)
file.close()

df["context"] = df.apply(lambda r:" ".join(r["context"]), axis=1)
tokenized_context = tokenizer(df["context"].values.tolist(), padding = True, truncation = True, return_tensors="pt")
tokenized_context = {k:torch.tensor(v).to(device) for k,v in tokenized_context.items()}
with torch.no_grad():
  hidden_context = model(**tokenized_context)
cls_context = hidden_context.last_hidden_state[:,0,:]
print("fin context")
file = open('drive/MyDrive/bert_context', 'wb')
pickle.dump(cls_context, file)
file.close()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


fin utterance


  tokenized_context = {k:torch.tensor(v).to(device) for k,v in tokenized_context.items()}


fin context


In [10]:
# second time: load features
file = open('drive/MyDrive/bert_utterance', 'rb')
check = pickle.load(file)
file.close()
file2 = open('drive/MyDrive/bert_context', 'rb')
check2 = pickle.load(file2)
file2.close()

In [59]:
# split

# sarcasm = np.expand_dims(df["sarcasm"], axis=1)
# df_embeddings = np.concatenate([cls_utterance, cls_context, sarcasm], axis=1)

X_train, X_test, y_train, y_test = train_test_split(np.concatenate([cls_utterance, cls_context], axis=1), df["sarcasm"], test_size=0.3, random_state = 42)


In [60]:
# tune hyperparameters
params = [{'kernel': ['rbf'], 
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'],
                     'C': [1, 10, 100]},
         {'kernel': ['poly'],
                     'degree': [1, 10, 100]}]
gs = GridSearchCV(SVC(),
                      param_grid=params,
                      scoring='f1',
                      cv=5, n_jobs=4)
gs.fit(X_train, y_train)
print(gs.best_params_)

{'degree': 10, 'kernel': 'poly'}


In [61]:
clf = gs.best_estimator_
scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='f1')
sum(scores) / len(scores)

0.7074074074074075