In [None]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import jsonlines
import numpy as np

CLS_TOKEN_INDEX = 0

# Build one feature vector per record of the BERT output file: take the
# token entry at CLS_TOKEN_INDEX and average, element-wise, the "values" of
# its first four entries under "layers".
# NOTE(review): presumably index 0 is the [CLS] token and the four entries
# are the extractor's exported layers -- confirm against the file format.
embeddings = []
with jsonlines.open("../MUStARD/data/bert-output.jsonl") as utterances:
    for utterance in utterances:
        features = utterance["features"][CLS_TOKEN_INDEX]
        layer_values = [np.array(features["layers"][layer]["values"])
                        for layer in range(4)]
        bert_embedding_target = np.mean(layer_values, axis=0)
        embeddings.append(np.copy(bert_embedding_target))

# Labels: the "sarcasm" field of every entry, cast to int.
# NOTE(review): assumes the entries appear in the same order as the records
# read above -- nothing here verifies that alignment.
output = (
    pd.read_json("../MUStARD/data/sarcasm_data.json")
    .T["sarcasm"]
    .astype(int)
)

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    output,
    test_size=0.3,
    random_state=0,
)
# tune hyperparameters
# One sub-grid per kernel, each listing only the parameters that kernel
# actually reads. SVC ignores `degree` for every kernel except 'poly',
# `coef0` for every kernel except 'poly' and 'sigmoid', and `gamma` for
# 'linear'; a single cross-product grid therefore cross-validates many
# identical models (108 candidates, of which only 40 are distinct).
# Consequence: best_params_ now contains only the keys relevant to the
# winning kernel.
params = [
    {'kernel': ['rbf'],
     'gamma': ['scale', 'auto', 2]},
    {'kernel': ['linear']},
    {'kernel': ['poly'],
     'degree': [3, 10, 15],
     'gamma': ['scale', 'auto', 2],
     'coef0': [0, 1, 5]},
    {'kernel': ['sigmoid'],
     'gamma': ['scale', 'auto', 2],
     'coef0': [0, 1, 5]},
]
# 5-fold cross-validation on the training split, ranking candidates by F1.
gs = GridSearchCV(svm.SVC(),
                  param_grid=params,
                  scoring='f1',
                  cv=5)
gs.fit(X_train, y_train)
print(gs.best_params_)

# Predict the held-out samples with the fitted search object, then print the
# per-class report followed by the confusion matrix.
pred = gs.predict(X_test)
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, pred))

# Evaluate: 5-fold cross-validated F1 of the selected estimator, computed on
# the held-out split only. The last expression is the mean over the folds.
# NOTE(review): cross_val_score fits fresh clones of `clf` on folds of the
# test split instead of scoring the model already fitted on the training
# split -- confirm this is the intended measurement.
clf = gs.best_estimator_
scores = cross_val_score(
    clf,
    X_test,
    y_test,
    scoring='f1',
    cv=5,
)
sum(scores) / len(scores)


In [1]:
# Scratch arithmetic; the cell displays the sum as its output.
9 + 9

18

In [2]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import jsonlines
import numpy as np

CLS_TOKEN_INDEX = 0

# Build one feature vector per record of the BERT output file: take the
# token entry at CLS_TOKEN_INDEX and average, element-wise, the "values" of
# its first four entries under "layers".
# NOTE(review): presumably index 0 is the [CLS] token and the four entries
# are the extractor's exported layers -- confirm against the file format.
embeddings = []
with jsonlines.open("../MUStARD/data/bert-output.jsonl") as utterances:
    for utterance in utterances:
        features = utterance["features"][CLS_TOKEN_INDEX]
        layer_values = [np.array(features["layers"][layer]["values"])
                        for layer in range(4)]
        bert_embedding_target = np.mean(layer_values, axis=0)
        embeddings.append(np.copy(bert_embedding_target))

# Labels: the "sarcasm" field of every entry, cast to int.
# NOTE(review): assumes the entries appear in the same order as the records
# read above -- nothing here verifies that alignment.
output = (
    pd.read_json("../MUStARD/data/sarcasm_data.json")
    .T["sarcasm"]
    .astype(int)
)



In [3]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    output,
    test_size=0.3,
    random_state=0,
)


In [None]:
# tune hyperparameters
# One sub-grid per kernel, each listing only the parameters that kernel
# actually reads. SVC ignores `degree` and `coef0` for 'rbf', so crossing
# them with both kernels cross-validates the same 'rbf' model twice
# (4 candidates, of which only 3 are distinct).
# Consequence: best_params_ now contains only the keys relevant to the
# winning kernel.
params = [
    {'kernel': ['rbf'],
     'gamma': ['scale']},
    {'kernel': ['poly'],
     'degree': [10, 15],
     'gamma': ['scale'],
     'coef0': [0]},
]
# 5-fold cross-validation on the training split, ranking candidates by F1.
gs = GridSearchCV(svm.SVC(),
                  param_grid=params,
                  scoring='f1',
                  cv=5)
gs.fit(X_train, y_train)
print(gs.best_params_)


In [None]:
# Predict the held-out samples with the fitted search object, then print the
# per-class report followed by the confusion matrix.
pred = gs.predict(X_test)
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, pred))


In [None]:
# Evaluate: 5-fold cross-validated F1 of the selected estimator, computed on
# the held-out split only. The last expression is the mean over the folds.
# NOTE(review): cross_val_score fits fresh clones of `clf` on folds of the
# test split instead of scoring the model already fitted on the training
# split -- confirm this is the intended measurement.
clf = gs.best_estimator_
scores = cross_val_score(
    clf,
    X_test,
    y_test,
    scoring='f1',
    cv=5,
)
sum(scores) / len(scores)
