In [None]:
# packages

!pip install transformers 

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import pandas as pd
import pickle
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# get data
# Read the MUStARD sarcasm annotations directly from the upstream GitHub repo.
# orient="index" treats the top-level JSON keys as row labels, giving one row
# per example; later cells read the "utterance", "context" and "sarcasm" columns.
url = "https://raw.githubusercontent.com/soujanyaporia/MUStARD/master/data/sarcasm_data.json"
df = pd.read_json(path_or_buf=url, orient="index")

In [None]:
# first time: feature extraction
# https://towardsdatascience.com/feature-extraction-with-bert-for-text-classification-533dde44dc2f
#
# Encodes every utterance and every (space-joined) context with DistilBERT,
# keeps the hidden state at the first token position of the last layer as the
# feature vector, and pickles both feature matrices so later sessions can skip
# this expensive cell.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)


def _cls_embeddings(texts):
    """Return the first-position last-layer hidden state for each text.

    Args:
        texts: list of strings to encode.

    Returns:
        A CPU ``torch.Tensor`` with one row per input text.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    # The tokenizer already returns tensors (return_tensors="pt"); move them to
    # the device directly. Re-wrapping them with torch.tensor() copies the data
    # and emits a PyTorch warning.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden = model(**encoded)
    # Move the result to the CPU so the pickle can be reloaded without a GPU and
    # the features can be converted to NumPy for scikit-learn.
    return hidden.last_hidden_state[:, 0, :].cpu()


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


cls_utterance = _cls_embeddings(df["utterance"].values.tolist())
print("fin utterance")
_save_pickle(cls_utterance, 'drive/MyDrive/proj/bert_utterance')

# Join each list of context turns into one string. Values that are already
# strings are left alone, so re-running this cell does not space-join the
# individual characters of an already-joined context.
df["context"] = df["context"].map(
    lambda turns: turns if isinstance(turns, str) else " ".join(turns)
)
cls_context = _cls_embeddings(df["context"].values.tolist())
print("fin context")
_save_pickle(cls_context, 'drive/MyDrive/proj/bert_context')


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


fin utterance


  tokenized_context = {k:torch.tensor(v).to(device) for k,v in tokenized_context.items()}


fin context


In [None]:
# second time: load features
# Reload the cached embeddings from the two pickle files so the feature
# extraction does not have to be repeated.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself.
# `with` guarantees the handles are closed even if unpickling raises.
with open('drive/MyDrive/proj/bert_utterance', 'rb') as file:
    cls_utterance = pickle.load(file)
with open('drive/MyDrive/proj/bert_context', 'rb') as file2:
    cls_context = pickle.load(file2)

In [None]:
# split
def _as_numpy(features):
    """Convert a feature matrix to a NumPy array.

    Torch tensors are detached and copied to the CPU first, because NumPy
    cannot convert a tensor that lives on a CUDA device. Anything else is
    passed through ``np.asarray``.
    """
    if torch.is_tensor(features):
        return features.detach().cpu().numpy()
    return np.asarray(features)


# Concatenate utterance and context features column-wise so each example gets
# one feature vector, then hold out 30% of the examples as the test set.
X_all = np.concatenate([_as_numpy(cls_utterance), _as_numpy(cls_context)], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X_all, df["sarcasm"], test_size=0.3, random_state=42
)

In [None]:
# tune hyperparameters
# One sub-grid per SVM kernel. GridSearchCV evaluates every combination inside
# each sub-grid with 5-fold cross-validation on the training split, scored by F1.
rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]}
linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100]}
poly_grid = {'kernel': ['poly'], 'degree': [1, 10, 100]}
params = [rbf_grid, linear_grid, poly_grid]

gs = GridSearchCV(estimator=SVC(), param_grid=params, scoring='f1', cv=5, n_jobs=4)
gs.fit(X_train, y_train)

# Show the winning hyperparameter combination.
print(gs.best_params_)

{'degree': 10, 'kernel': 'poly'}


In [None]:
# evaluate
# Score the tuned model on the held-out test split.
# cross_val_score(clf, X_test, y_test, ...) would clone `clf` and refit it on
# folds of the *test* data, so the model selected by the grid search would never
# be evaluated. Predicting with the already-fitted best estimator measures how
# that model generalises to unseen examples.
from sklearn.metrics import f1_score

clf = gs.best_estimator_
f1_score(y_test, clf.predict(X_test))

0.7074074074074075

In [None]:
# only utterance
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import jsonlines
import numpy as np

CLS_TOKEN_INDEX = 0

# Build one feature vector per record of the pre-computed BERT output file and
# load the matching sarcasm labels.
embeddings = []
with jsonlines.open("../MUStARD/data/bert-output.jsonl") as reader:
    for record in reader:
        # Only the token at CLS_TOKEN_INDEX is used for each record.
        token_features = record["features"][CLS_TOKEN_INDEX]
        # Average the "values" vectors of the first four entries in "layers".
        per_layer = [np.array(token_features["layers"][idx]["values"])
                     for idx in range(4)]
        embeddings.append(np.mean(per_layer, axis=0))

# The label file is read column-per-example, so transpose it before selecting
# the "sarcasm" field and casting it to integers.
labels_frame = pd.read_json("../MUStARD/data/sarcasm_data.json")
output = labels_frame.T["sarcasm"].astype(int)

# split
# Hold out 30% of the examples for testing; the fixed seed keeps the partition
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, output, test_size=0.3, random_state=0
)

# tune hyperparameters
# One sub-grid per SVM kernel. GridSearchCV evaluates every combination inside
# each sub-grid with 5-fold cross-validation on the training split, scored by F1.
rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]}
linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100]}
poly_grid = {'kernel': ['poly'], 'degree': [1, 10, 100]}
params = [rbf_grid, linear_grid, poly_grid]

gs = GridSearchCV(estimator=svm.SVC(), param_grid=params, scoring='f1', cv=5, n_jobs=4)
gs.fit(X_train, y_train)

# Show the winning hyperparameter combination.
print(gs.best_params_)

# sample f1, confusion matrix
# Predict the held-out split with the refitted best model, then print the
# per-class precision/recall/F1 table followed by the confusion matrix.
pred = gs.predict(X_test)
for summary in (classification_report(y_test, pred), confusion_matrix(y_test, pred)):
    print(summary)

# evaluate
# Score the tuned model on the held-out test split.
# cross_val_score(clf, X_test, y_test, ...) would clone `clf` and refit it on
# folds of the *test* data, so the model selected by the grid search would never
# be evaluated. Predicting with the already-fitted best estimator measures how
# that model generalises to unseen examples.
from sklearn.metrics import f1_score

clf = gs.best_estimator_
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(f1_score(y_test, clf.predict(X_test)))




# data visualization
# Project the embeddings onto their first two principal components so the two
# classes can be inspected in a scatter plot.
from sklearn import decomposition, svm
pca = decomposition.PCA(n_components=2)
pca.fit(embeddings)
dim = pca.transform(embeddings)

# `dim_clf` was referenced below but never defined anywhere in this notebook,
# so the plotting code raised NameError. Fit an SVM on the 2-D projection so its
# support vectors live in the same space as the scatter plot.
# NOTE(review): the original definition is missing; default SVC hyperparameters
# are an assumption -- confirm against the notebook referenced below.
dim_clf = svm.SVC()
dim_clf.fit(dim, output)

# display flatten data
import matplotlib.pyplot as plt
plt.scatter(dim[:,0],dim[:,1],c=output)
plt.scatter(dim_clf.support_vectors_[:,0],dim_clf.support_vectors_[:,1], label='support vectors')
plt.legend()
plt.show()
# see pres1 / https://github.com/TaliaWang/50.038-FINAL-PROJECT/blob/08ad6141b48b531246d99a9a786e0335c9c3454e/svm/run.ipynb for graph

# display ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

# A ROC curve needs a continuous score per example so the decision threshold
# can be swept. Hard 0/1 predictions collapse the curve to a single operating
# point and understate the AUC, so use the signed distance to the SVM decision
# boundary from the refitted best estimator instead.
test_scores = gs.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc_score(y_test, test_scores))
plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
# Without a legend the AUC label set on the curve is never shown.
plt.legend(loc='lower right')
plt.show()
# graph source