In [64]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [65]:
cd /content/gdrive/MyDrive/SemEval2023/SemEval2022-Task10/re_run

/content/gdrive/.shortcut-targets-by-id/1lC-ZKLaCDQyfLcof2Ak7FDa6IvTt318A/SemEval2023/SemEval2022-Task10/re_run


In [69]:
import warnings
warnings.filterwarnings("ignore")

## Load Data

In [72]:
import pandas as pd

target_column = "label_vector"
data = pd.read_csv("../Data/starting_ki/train_all_tasks.csv")
data = data[data[target_column]!="none"]

label_values = [
      '1.1 threats of harm',
      '1.2 incitement and encouragement of harm',
      '2.1 descriptive attacks',
      '2.2 aggressive and emotive attacks',
      '2.3 dehumanising attacks & overt sexual objectification',
      '3.1 casual use of gendered slurs, profanities, and insults',
      '3.2 immutable gender differences and gender stereotypes',
      '3.3 backhanded gendered compliments',
      '3.4 condescending explanations or unwelcome advice',
      '4.1 supporting mistreatment of individual women',
      '4.2 supporting systemic discrimination against women as a group',
  ]

In [73]:
label_map = {
    0: '1.1 threats of harm',
    1: '1.2 incitement and encouragement of harm',
    2: '2.1 descriptive attacks',
    3: '2.2 aggressive and emotive attacks',
    4: '2.3 dehumanising attacks & overt sexual objectification',
    5: '3.1 casual use of gendered slurs, profanities, and insults',
    6: '3.2 immutable gender differences and gender stereotypes',
    7: '3.3 backhanded gendered compliments',
    8: '3.4 condescending explanations or unwelcome advice',
    9: '4.1 supporting mistreatment of individual women',
    10: '4.2 supporting systemic discrimination against women as a group',
    '1.1 threats of harm': 0,
    '1.2 incitement and encouragement of harm': 1,
    '2.1 descriptive attacks': 2,
    '2.2 aggressive and emotive attacks': 3,
    '2.3 dehumanising attacks & overt sexual objectification': 4,
    '3.1 casual use of gendered slurs, profanities, and insults': 5,
    '3.2 immutable gender differences and gender stereotypes': 6,
    '3.3 backhanded gendered compliments': 7,
    '3.4 condescending explanations or unwelcome advice': 8,
    '4.1 supporting mistreatment of individual women': 9,
    '4.2 supporting systemic discrimination against women as a group': 10,
}

In [74]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
import string
from nltk.tokenize import TweetTokenizer
nltktokenizer = TweetTokenizer()

def preprocess(sent, **kwargs):
  sent = sent.lower()
  words = nltktokenizer.tokenize(sent)

  # Remove Stop words
  words = [w for w in words if w not in stoplist]

  # Remove Punctuation
  words = [w for w in words if w not in string.punctuation]
  
  # # Lematization
  # if "lemma" in kwargs and kwargs["lemma"]:
  #   words = [lemmatizer.lemmatize(w) for w in words]
    
  return words

## Logistic Regression with all words

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

def logistic_preprocess(sent):
  words = preprocess(sent, lemma=True)
  return words

def dummy_word_tokenize(sent):
  return sent

# vectorizer = CountVectorizer(
#     preprocessor=logistic_preprocess,
#     tokenizer=dummy_word_tokenize
# )

In [89]:
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

def cross_validation():
  _metrics = {
      "acc":[],
      "p":[],
      "r":[],
      "f1":[],
      "F": []
  }

  for k in tqdm(range(5)):
    train = pd.read_csv(f"./Data/{k}_train.csv")
    test = pd.read_csv(f"./Data/{k}_test.csv")
    
    vectorizer = CountVectorizer(
        preprocessor=logistic_preprocess,
        tokenizer=dummy_word_tokenize
    )

    X_train = vectorizer.fit_transform(train["text"].values)
    X_test = vectorizer.transform(test["text"].values)

    y_train = [label_map[l] for l in train[target_column].values]
    y_test = [label_map[l] for l in test[target_column].values]

    model = LogisticRegression(random_state=42, penalty="l2")
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    # f1 = metrics.f1_score(y_test, y_predict, average='macro')
    acc = metrics.accuracy_score(y_test, y_predict)
    p, r, f1, sup = metrics.precision_recall_fscore_support(y_test, y_predict, average="macro")

    # _metrics["acc"].append(acc)
    _metrics["f1"].append(f1)
    _metrics["p"].append(p)
    _metrics["r"].append(r)
    _metrics["F"].append(model.n_features_in_)
  
  print("")
  for m in _metrics:
    print(f"{m}: {np.mean(_metrics[m]):.3f} ± {np.std(_metrics[m]):.3f}")
  
  return model

model = cross_validation()

100%|██████████| 5/5 [00:18<00:00,  3.74s/it]


acc: nan ± nan
p: 0.312 ± 0.076
r: 0.245 ± 0.018
f1: 0.250 ± 0.026
F: 8311.600 ± 24.532





## Logistic Regression with selected lexicons

In [93]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

class Preprocessor:
  def __init__(self, vocabs):
    self.vocabs = vocabs
  
  def run(self, sent):
    words = preprocess(sent, lemma=True)

    # Remove words that are not in vocabs
    words = [w for w in words if w in self.vocabs]
    return words

def dummy_word_tokenize(sent):
  return sent

def train_reg(data, lexicon, **kwargs):
  vocabs = set()
  for label in lexicon:
    vocabs.update(lexicon[label].keys())

  preprocessor = Preprocessor(vocabs)
  vectorizer = CountVectorizer(
      preprocessor=preprocessor.run,
      tokenizer=dummy_word_tokenize
  )

  X_train = vectorizer.fit_transform(data["text"].values)

  y_train = [label_map[l] for l in data[target_column].values]
  
  print("#Vocab", len(vectorizer.get_feature_names_out()))

  model = LogisticRegression(random_state=42, penalty="l2")
  model.fit(X_train, y_train)

  return model, vectorizer


In [94]:
def run_eval_feat(data, model, vectorizer, **kwargs):
  y_test = [label_map[l] for l in data[target_column].values]
  X_test = vectorizer.transform(data["text"].values)

  y_predict = model.predict(X_test)
  # print("F1: %.3f" % metrics.f1_score(y_test, y_predict, average='macro'))
  # print("Acc: %.3f" % metrics.accuracy_score(y_test, y_predict))

  p, r, f1, sup = metrics.precision_recall_fscore_support(y_test, y_predict, average="macro")
  # print("P: %.3f" % p)
  # print("R: %.3f" % r)
  return p, r, f1

In [96]:
from tqdm import tqdm
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import json

def cross_validation_v2(lexicon_prefix):
  _metrics = {
      "acc":[],
      "p":[],
      "r":[],
      "f1":[],
      "feat": []
  }

  for k in tqdm(range(5)):
    train = pd.read_csv(f"./Data/{k}_train.csv")
    test = pd.read_csv(f"./Data/{k}_test.csv")
    
    with open(f'{lexicon_prefix}_{k}.json') as fin:
      lexicons = json.load(fin)
    
    model, vectorizer = train_reg(train, lexicons)
    p, r, f1 = run_eval_feat(test, model, vectorizer)

    # _metrics["acc"].append(acc)
    _metrics["f1"].append(f1)
    _metrics["p"].append(p)
    _metrics["r"].append(r)
    _metrics["feat"].append(model.n_features_in_)
    assert(len(model.classes_)==11)
    
  print("")
  for m in _metrics:
    print(f"{m}: {np.mean(_metrics[m]):.3f} ± {np.std(_metrics[m]):.3f}")
  return ;



In [97]:
cross_validation_v2("Results/TaskC/lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 7000


 20%|██        | 1/5 [00:03<00:13,  3.36s/it]

#Vocab 6985


 40%|████      | 2/5 [00:06<00:09,  3.20s/it]

#Vocab 7021


 60%|██████    | 3/5 [00:09<00:06,  3.30s/it]

#Vocab 6974


 80%|████████  | 4/5 [00:13<00:03,  3.48s/it]

#Vocab 6985


100%|██████████| 5/5 [00:16<00:00,  3.34s/it]


acc: nan ± nan
p: 0.224 ± 0.047
r: 0.167 ± 0.014
f1: 0.167 ± 0.018
feat: 6993.000 ± 16.260





In [98]:
cross_validation_v2("Results/TaskC/augmented_lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 7006


 20%|██        | 1/5 [00:03<00:13,  3.44s/it]

#Vocab 6992


 40%|████      | 2/5 [00:06<00:09,  3.32s/it]

#Vocab 7031


 60%|██████    | 3/5 [00:09<00:06,  3.30s/it]

#Vocab 6984


 80%|████████  | 4/5 [00:13<00:03,  3.22s/it]

#Vocab 6993


100%|██████████| 5/5 [00:16<00:00,  3.39s/it]


acc: nan ± nan
p: 0.225 ± 0.048
r: 0.167 ± 0.014
f1: 0.167 ± 0.017
feat: 7001.200 ± 16.485





In [99]:
cross_validation_v2("Results/TaskC/gptj_lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 7771


 20%|██        | 1/5 [00:05<00:20,  5.15s/it]

#Vocab 7763


 40%|████      | 2/5 [00:09<00:13,  4.42s/it]

#Vocab 7831


 60%|██████    | 3/5 [00:13<00:08,  4.20s/it]

#Vocab 7768


 80%|████████  | 4/5 [00:17<00:04,  4.40s/it]

#Vocab 7778


100%|██████████| 5/5 [00:21<00:00,  4.32s/it]


acc: nan ± nan
p: 0.311 ± 0.075
r: 0.243 ± 0.015
f1: 0.251 ± 0.023
feat: 7782.200 ± 24.879





In [100]:
cross_validation_v2("Results/TaskC/bertweet_lexicon_pmi_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 7927


 20%|██        | 1/5 [00:04<00:18,  4.52s/it]

#Vocab 7929


 40%|████      | 2/5 [00:09<00:13,  4.57s/it]

#Vocab 7999


 60%|██████    | 3/5 [00:13<00:08,  4.43s/it]

#Vocab 7932


 80%|████████  | 4/5 [00:17<00:04,  4.42s/it]

#Vocab 7941


100%|██████████| 5/5 [00:22<00:00,  4.43s/it]


acc: nan ± nan
p: 0.299 ± 0.082
r: 0.242 ± 0.022
f1: 0.247 ± 0.030
feat: 7945.600 ± 27.126





In [101]:
cross_validation_v2("Results/TaskC/lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 5306


 20%|██        | 1/5 [00:05<00:22,  5.73s/it]

#Vocab 5317


 40%|████      | 2/5 [00:10<00:15,  5.01s/it]

#Vocab 5357


 60%|██████    | 3/5 [00:13<00:08,  4.18s/it]

#Vocab 5327


 80%|████████  | 4/5 [00:16<00:03,  3.77s/it]

#Vocab 5320


100%|██████████| 5/5 [00:19<00:00,  3.99s/it]


acc: nan ± nan
p: 0.305 ± 0.070
r: 0.246 ± 0.012
f1: 0.252 ± 0.020
feat: 5325.400 ± 17.188





In [102]:
cross_validation_v2("Results/TaskC/augmented_lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 5380


 20%|██        | 1/5 [00:03<00:13,  3.50s/it]

#Vocab 5390


 40%|████      | 2/5 [00:06<00:10,  3.47s/it]

#Vocab 5432


 60%|██████    | 3/5 [00:10<00:06,  3.40s/it]

#Vocab 5407


 80%|████████  | 4/5 [00:13<00:03,  3.36s/it]

#Vocab 5407


100%|██████████| 5/5 [00:16<00:00,  3.37s/it]


acc: nan ± nan
p: 0.305 ± 0.070
r: 0.246 ± 0.014
f1: 0.252 ± 0.021
feat: 5403.200 ± 17.725





In [103]:
cross_validation_v2("Results/TaskC/gptj_lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 6025


 20%|██        | 1/5 [00:03<00:13,  3.41s/it]

#Vocab 6027


 40%|████      | 2/5 [00:06<00:10,  3.41s/it]

#Vocab 6080


 60%|██████    | 3/5 [00:10<00:06,  3.47s/it]

#Vocab 6058


 80%|████████  | 4/5 [00:13<00:03,  3.52s/it]

#Vocab 6063


100%|██████████| 5/5 [00:17<00:00,  3.48s/it]


acc: nan ± nan
p: 0.306 ± 0.079
r: 0.247 ± 0.019
f1: 0.254 ± 0.029
feat: 6050.600 ± 21.378





In [104]:
cross_validation_v2("Results/TaskC/bertweet_lexicon_shapley_train")

  0%|          | 0/5 [00:00<?, ?it/s]

#Vocab 6909


 20%|██        | 1/5 [00:04<00:16,  4.06s/it]

#Vocab 6934


 40%|████      | 2/5 [00:07<00:11,  3.93s/it]

#Vocab 6981


 60%|██████    | 3/5 [00:11<00:07,  3.86s/it]

#Vocab 6936


 80%|████████  | 4/5 [00:15<00:03,  3.89s/it]

#Vocab 6928


100%|██████████| 5/5 [00:19<00:00,  3.88s/it]


acc: nan ± nan
p: 0.314 ± 0.075
r: 0.246 ± 0.019
f1: 0.254 ± 0.028
feat: 6937.600 ± 23.703



