In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
cd /content/gdrive/MyDrive/SemEval2023/SemEval2022-Task10/re_run

/content/gdrive/.shortcut-targets-by-id/1lC-ZKLaCDQyfLcof2Ak7FDa6IvTt318A/SemEval2023/SemEval2022-Task10/re_run


In [5]:
import warnings
warnings.filterwarnings("ignore")

## Load Data

In [9]:
import pandas as pd

target_column = "label_category"
data = pd.read_csv("../Data/starting_ki/train_all_tasks.csv")
data = data[data[target_column]!="none"]

label_values = [
    '1. threats, plans to harm and incitement',
    '2. derogation',
    '3. animosity',
    '4. prejudiced discussions',
]

In [10]:
label_map = {
    0: '1. threats, plans to harm and incitement',
    1: '2. derogation', 
    2: '3. animosity', 
    3: '4. prejudiced discussions',
    '1. threats, plans to harm and incitement':0,
    '2. derogation':1,
    '3. animosity':2,
    '4. prejudiced discussions':3,
}

In [11]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
import string
from nltk.tokenize import TweetTokenizer
nltktokenizer = TweetTokenizer()

def preprocess(sent, **kwargs):
  sent = sent.lower()
  words = nltktokenizer.tokenize(sent)

  # Remove Stop words
  words = [w for w in words if w not in stoplist]

  # Remove Punctuation
  words = [w for w in words if w not in string.punctuation]
  
  # # Lematization
  # if "lemma" in kwargs and kwargs["lemma"]:
  #   words = [lemmatizer.lemmatize(w) for w in words]
    
  return words

## Logistic Regression with all words

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

def logistic_preprocess(sent):
  words = preprocess(sent, lemma=True)
  return words

def dummy_word_tokenize(sent):
  return sent

# vectorizer = CountVectorizer(
#     preprocessor=logistic_preprocess,
#     tokenizer=dummy_word_tokenize
# )

In [30]:
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

def cross_validation():
  _metrics = {
      "acc":[],
      "p":[],
      "r":[],
      "f1":[],
      "F": []
  }

  for k in tqdm(range(5)):
    train = pd.read_csv(f"./Data/{k}_train.csv")
    test = pd.read_csv(f"./Data/{k}_test.csv")
    
    vectorizer = CountVectorizer(
        preprocessor=logistic_preprocess,
        tokenizer=dummy_word_tokenize
    )

    X_train = vectorizer.fit_transform(train["text"].values)
    X_test = vectorizer.transform(test["text"].values)

    y_train = [label_map[l] for l in train[target_column].values]
    y_test = [label_map[l] for l in test[target_column].values]

    model = LogisticRegression(random_state=42, penalty="l2")
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    # f1 = metrics.f1_score(y_test, y_predict, average='macro')
    acc = metrics.accuracy_score(y_test, y_predict)
    p, r, f1, sup = metrics.precision_recall_fscore_support(y_test, y_predict, average="macro")

    # _metrics["acc"].append(acc)
    _metrics["f1"].append(f1)
    _metrics["p"].append(p)
    _metrics["r"].append(r)
    _metrics["F"].append(model.n_features_in_)
  
  print("")
  for m in _metrics:
    print(f"{m}: {np.mean(_metrics[m]):.3f} ± {np.std(_metrics[m]):.3f}")
  

model = cross_validation()

100%|██████████| 5/5 [00:10<00:00,  2.15s/it]


acc: nan ± nan
p: 0.485 ± 0.041
r: 0.375 ± 0.034
f1: 0.397 ± 0.040
F: 8311.600 ± 24.532





8296

## Logistic Regression with selected lexicons

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

class Preprocessor:
  def __init__(self, vocabs):
    self.vocabs = vocabs
  
  def run(self, sent):
    words = preprocess(sent, lemma=True)

    # Remove words that are not in vocabs
    words = [w for w in words if w in self.vocabs]
    return words

def dummy_word_tokenize(sent):
  return sent

def train_reg(data, lexicon, **kwargs):
  vocabs = set()
  for label in lexicon:
    vocabs.update(lexicon[label].keys())

  preprocessor = Preprocessor(vocabs)
  vectorizer = CountVectorizer(
      preprocessor=preprocessor.run,
      tokenizer=dummy_word_tokenize
  )

  X_train = vectorizer.fit_transform(data["text"].values)

  y_train = [label_map[l] for l in data[target_column].values]
  
  print("#Vocab", len(vectorizer.get_feature_names_out()))

  model = LogisticRegression(random_state=42, penalty="l2")
  model.fit(X_train, y_train)

  return model, vectorizer


In [43]:
def run_eval_feat(data, model, vectorizer, **kwargs):
  y_test = [label_map[l] for l in data[target_column].values]
  X_test = vectorizer.transform(data["text"].values)

  y_predict = model.predict(X_test)
  # print("F1: %.3f" % metrics.f1_score(y_test, y_predict, average='macro'))
  # print("Acc: %.3f" % metrics.accuracy_score(y_test, y_predict))

  p, r, f1, sup = metrics.precision_recall_fscore_support(y_test, y_predict, average="macro")
  # print("P: %.3f" % p)
  # print("R: %.3f" % r)
  return p, r, f1

In [44]:
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import json

def cross_validation_v2(lexicon_prefix):
  _metrics = {
      "acc":[],
      "p":[],
      "r":[],
      "f1":[],
      "feat": []
  }

  for k in tqdm(range(5)):
    train = pd.read_csv(f"./Data/{k}_train.csv")
    test = pd.read_csv(f"./Data/{k}_test.csv")
    
    with open(f'{lexicon_prefix}_{k}.json') as fin:
      lexicons = json.load(fin)
    
    model, vectorizer = train_reg(train, lexicons)
    p, r, f1 = run_eval_feat(test, model, vectorizer)

    # _metrics["acc"].append(acc)
    _metrics["f1"].append(f1)
    _metrics["p"].append(p)
    _metrics["r"].append(r)
    _metrics["feat"].append(model.n_features_in_)
  
  print("")
  for m in _metrics:
    print(f"{m}: {np.mean(_metrics[m]):.3f} ± {np.std(_metrics[m]):.3f}")
  return ;



In [45]:
cross_validation_v2("Results/TaskB/lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 4434


 20%|██        | 1/5 [00:01<00:05,  1.33s/it]

#Vocab 4487


 40%|████      | 2/5 [00:02<00:03,  1.31s/it]

#Vocab 4528


 60%|██████    | 3/5 [00:04<00:03,  1.63s/it]

#Vocab 4465


 80%|████████  | 4/5 [00:06<00:01,  1.62s/it]

#Vocab 4483


100%|██████████| 5/5 [00:07<00:00,  1.48s/it]


acc: nan ± nan
p: 0.260 ± 0.094
r: 0.258 ± 0.012
f1: 0.209 ± 0.013
feat: 4479.400 ± 30.663





In [46]:
cross_validation_v2("Results/TaskB/augmented_lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 4500


 20%|██        | 1/5 [00:01<00:06,  1.60s/it]

#Vocab 4550


 40%|████      | 2/5 [00:03<00:04,  1.62s/it]

#Vocab 4603


 60%|██████    | 3/5 [00:04<00:03,  1.57s/it]

#Vocab 4535


 80%|████████  | 4/5 [00:06<00:01,  1.77s/it]

#Vocab 4551


100%|██████████| 5/5 [00:08<00:00,  1.73s/it]


acc: nan ± nan
p: 0.258 ± 0.093
r: 0.256 ± 0.009
f1: 0.207 ± 0.011
feat: 4547.800 ± 33.199





In [47]:
cross_validation_v2("Results/TaskB/gptj_lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 6427


 20%|██        | 1/5 [00:02<00:10,  2.56s/it]

#Vocab 6489


 40%|████      | 2/5 [00:04<00:06,  2.27s/it]

#Vocab 6551


 60%|██████    | 3/5 [00:07<00:04,  2.42s/it]

#Vocab 6471


 80%|████████  | 4/5 [00:09<00:02,  2.29s/it]

#Vocab 6487


100%|██████████| 5/5 [00:11<00:00,  2.33s/it]


acc: nan ± nan
p: 0.461 ± 0.026
r: 0.362 ± 0.025
f1: 0.384 ± 0.027
feat: 6485.000 ± 39.840





In [48]:
cross_validation_v2("Results/TaskB/bertweet_lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 7328


 20%|██        | 1/5 [00:02<00:11,  2.85s/it]

#Vocab 7363


 40%|████      | 2/5 [00:05<00:07,  2.50s/it]

#Vocab 7404


 60%|██████    | 3/5 [00:07<00:04,  2.49s/it]

#Vocab 7334


 80%|████████  | 4/5 [00:10<00:02,  2.47s/it]

#Vocab 7357


100%|██████████| 5/5 [00:13<00:00,  2.60s/it]


acc: nan ± nan
p: 0.475 ± 0.039
r: 0.371 ± 0.030
f1: 0.394 ± 0.035
feat: 7357.200 ± 26.888





In [49]:
cross_validation_v2("Results/TaskB/lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 1080


 20%|██        | 1/5 [00:01<00:05,  1.45s/it]

#Vocab 1085


 40%|████      | 2/5 [00:02<00:03,  1.24s/it]

#Vocab 1084


 60%|██████    | 3/5 [00:04<00:02,  1.35s/it]

#Vocab 1081


 80%|████████  | 4/5 [00:05<00:01,  1.38s/it]

#Vocab 1083


100%|██████████| 5/5 [00:07<00:00,  1.50s/it]


acc: nan ± nan
p: 0.443 ± 0.084
r: 0.315 ± 0.014
f1: 0.310 ± 0.023
feat: 1082.600 ± 1.855





In [50]:
cross_validation_v2("Results/TaskB/augmented_lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 1192


 20%|██        | 1/5 [00:02<00:09,  2.36s/it]

#Vocab 1212


 40%|████      | 2/5 [00:03<00:05,  1.83s/it]

#Vocab 1219


 60%|██████    | 3/5 [00:05<00:03,  1.67s/it]

#Vocab 1192


 80%|████████  | 4/5 [00:06<00:01,  1.45s/it]

#Vocab 1199


100%|██████████| 5/5 [00:07<00:00,  1.54s/it]


acc: nan ± nan
p: 0.444 ± 0.086
r: 0.315 ± 0.018
f1: 0.312 ± 0.027
feat: 1202.800 ± 10.907





In [51]:
cross_validation_v2("Results/TaskB/gptj_lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 3418


 20%|██        | 1/5 [00:02<00:09,  2.47s/it]

#Vocab 3415


 40%|████      | 2/5 [00:04<00:06,  2.10s/it]

#Vocab 3426


 60%|██████    | 3/5 [00:06<00:04,  2.27s/it]

#Vocab 3410


 80%|████████  | 4/5 [00:08<00:02,  2.17s/it]

#Vocab 3410


100%|██████████| 5/5 [00:11<00:00,  2.29s/it]


acc: nan ± nan
p: 0.455 ± 0.017
r: 0.370 ± 0.025
f1: 0.390 ± 0.025
feat: 3415.800 ± 5.946





In [63]:
cross_validation_v2("Results/TaskB/bertweet_lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 5346


 20%|██        | 1/5 [00:02<00:09,  2.29s/it]

#Vocab 5324


 40%|████      | 2/5 [00:04<00:07,  2.41s/it]

#Vocab 5345


 60%|██████    | 3/5 [00:07<00:04,  2.33s/it]

#Vocab 5347


 80%|████████  | 4/5 [00:09<00:02,  2.46s/it]

#Vocab 5343


100%|██████████| 5/5 [00:11<00:00,  2.38s/it]


acc: nan ± nan
p: 0.460 ± 0.044
r: 0.371 ± 0.030
f1: 0.393 ± 0.035
feat: 5341.000 ± 8.602



