In [29]:
# Mount Google Drive at /content/gdrive so files stored there can be accessed
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [30]:
cd /content/gdrive/MyDrive/SemEval2023/SemEval2022-Task10/re_run

/content/gdrive/.shortcut-targets-by-id/1lC-ZKLaCDQyfLcof2Ak7FDa6IvTt318A/SemEval2023/SemEval2022-Task10/re_run


In [31]:
import nltk
# Download the NLTK resources this notebook relies on: 'stopwords' backs the
# stop-word list and 'wordnet' backs the WordNetLemmatizer created below.
# 'punkt' is presumably what nltk.word_tokenize needs, and 'omw-1.4' is
# presumably a WordNet companion resource -- TODO confirm both.
# The return value of the last call is echoed as this cell's output.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# English stop words. They are stored in a set because preprocess() tests every
# token for membership; a set makes that a hash lookup instead of a linear scan
# over the list returned by stopwords.words().
stoplist = set(stopwords.words('english'))

# WordNet lemmatizer instance. preprocess() does not currently apply it.
lemmatizer = WordNetLemmatizer()

In [33]:
import string

def preprocess(sent, lemma=True):
  """Lower-case and tokenize a sentence, then drop stop words and punctuation.

  Args:
    sent: Raw text of a single example.
    lemma: Accepted for interface compatibility but currently ignored; no
      lemmatization is performed.

  Returns:
    The remaining tokens as a list, in their original order.
  """
  tokens = nltk.word_tokenize(sent.lower())

  # A token survives only if it is neither a stop word nor found inside
  # string.punctuation. The latter is a substring test, so it discards single
  # punctuation characters as well as contiguous runs such as "()".
  return [
      tok for tok in tokens
      if not (tok in stoplist or tok in string.punctuation)
  ]


# Pointwise mutual information (PMI)

<img src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhPFTkoN4lp4MA1do7rOHhOx_v5DjyyoELu4pKJ52Bh0XldLX5uJetgocwm9PoCi0Sx-dr4bdDBm2BD5q9rOB950cOKFEBv-L8pbJ6BQuoVkWYCJ7mbOrqElQQtjIVGq9AYsYw46sMmO7E7lSf7-_nJ1KURR7X8cY6VWqaJRL9bnzg6JnJKctPIoe0iuQ/s1600/PPMI.PNG">

Further reading: https://nocodefunctions.com/blog/pmi-tf-idf/


In [34]:
import pandas as pd

In [35]:
from collections import Counter

In [36]:
# Name of the data column that holds the label to model.
target_column = "label_vector"

# Label names available for each supported target column.
_LABELS_BY_TARGET = {
    "label_category": [
        '1. threats, plans to harm and incitement',
        '2. derogation',
        '3. animosity',
        '4. prejudiced discussions',
    ],
    "label_vector": [
        '1.1 threats of harm',
        '1.2 incitement and encouragement of harm',
        '2.1 descriptive attacks',
        '2.2 aggressive and emotive attacks',
        '2.3 dehumanising attacks & overt sexual objectification',
        '3.1 casual use of gendered slurs, profanities, and insults',
        '3.2 immutable gender differences and gender stereotypes',
        '3.3 backhanded gendered compliments',
        '3.4 condescending explanations or unwelcome advice',
        '4.1 supporting mistreatment of individual women',
        '4.2 supporting systemic discrimination against women as a group',
    ],
}

# Any other target column leaves label_values untouched.
if target_column in _LABELS_BY_TARGET:
  label_values = _LABELS_BY_TARGET[target_column]

In [37]:
from collections import Counter

def get_counters(words, labels):
  """Count word, label and (label, word) occurrences.

  Args:
    words: Sequence of token lists, one list per example.
    labels: Sequence of labels that can be indexed by example position.

  Returns:
    A tuple (Cx, Cy, Cxy) of Counters. Cx maps each word to its number of
    occurrences over all examples, Cy maps each label to the number of times
    it appears in labels, and Cxy maps each (label, word) pair to the number
    of occurrences of that word in examples carrying that label.
  """
  word_totals = Counter(tok for tokens in words for tok in tokens)

  pair_totals = Counter(
      (labels[pos], tok)
      for pos, tokens in enumerate(words)
      for tok in tokens
  )

  label_totals = Counter(labels)

  return word_totals, label_totals, pair_totals

In [38]:
import math

def get_PMI(label_values, Cx, Cy, Cxy):
  """Compute positive pointwise mutual information for every (label, word) pair.

  For a pair with a non-zero co-occurrence count the score is
  max(log2((Pxy + delta) / (Px * Py)), 0), where Px, Py and Pxy are the
  counts from Cx, Cy and Cxy divided by N = sum(Cx) + sum(Cy).

  A pair scores 0 when the word and label never co-occur, or when the label
  or the word has a zero count. Handling these cases explicitly avoids a
  ZeroDivisionError for labels missing from the data (or an empty corpus),
  and stops the delta smoothing term from producing a positive score for a
  pair that was never observed (which happens once Px * Py < delta).

  Args:
    label_values: Iterable of labels to score.
    Cx: Mapping word -> count.
    Cy: Mapping label -> count.
    Cxy: Mapping (label, word) -> count.

  Returns:
    Dict mapping each label to a dict {word: score} covering every key of Cx.
  """
  vocabs = Cx.keys()
  Nx = sum(Cx.values())
  Ny = sum(Cy.values())
  # NOTE(review): the normaliser adds word occurrences (Nx) to label
  # occurrences (Ny), so Px, Py and Pxy are not estimated over one shared
  # event space. Left unchanged so existing scores do not move -- confirm
  # whether this is intended.
  N = Nx + Ny

  # Smoothing term added to the joint probability; loop-invariant.
  delta = 1e-10

  PMI = {label: {} for label in label_values}

  for label in label_values:
    label_count = Cy.get(label, 0)
    for w in vocabs:
      word_count = Cx[w]
      joint_count = Cxy.get((label, w), 0)

      if joint_count <= 0 or label_count <= 0 or word_count <= 0:
        PMI[label][w] = 0
        continue

      Px = word_count/N
      Py = label_count/N
      Pxy = joint_count/N
      PMI[label][w] = max(math.log2((Pxy+delta)/(Px*Py)), 0)
  return PMI

In [39]:
import matplotlib.pyplot as plt
import numpy as np


def select_lexicons(label_values, vocabs, PMI, q=0.9):
  """Keep, for each label, the words whose PMI reaches the q-quantile.

  The threshold for a label is the q-quantile of that label's strictly
  positive PMI scores. A label with no positive score gets an empty lexicon
  instead of asking np.quantile for the quantile of an empty list.

  Args:
    label_values: Iterable of labels to build lexicons for.
    vocabs: Iterable of candidate words; each must be a key of PMI[label].
    PMI: Dict mapping label -> {word: score}.
    q: Quantile in [0, 1] used as the cut-off.

  Returns:
    Dict mapping each label to a dict {word: score} of the selected words.
  """
  lexicons = {}
  for label in label_values:
    scores = PMI[label]
    positive_scores = [p for p in scores.values() if p > 0]

    if not positive_scores:
      # Nothing to take a quantile of, so no word can be selected.
      lexicons[label] = {}
      continue

    threshold = np.quantile(positive_scores, q)
    lexicons[label] = {w: scores[w] for w in vocabs if scores[w] >= threshold}
  return lexicons
  

## Evaluation

In [40]:
from sklearn.metrics import f1_score

def predict(word, lexicons):
  """Pick the label whose lexicon gives the highest total score for the tokens.

  The candidate labels are the keys of lexicons, so the function depends only
  on its arguments rather than on a notebook-global label list.

  Args:
    word: Iterable of tokens of one example.
    lexicons: Dict mapping label -> {word: score}.

  Returns:
    The label with the largest summed score (the first such label in the
    order of lexicons on ties), or None when the scores sum to zero, i.e. no
    token is found in any lexicon.
  """
  cnt = {label: 0 for label in lexicons}

  for w in word:
    for label, lexicon in lexicons.items():
      if w in lexicon:
        cnt[label] += lexicon[w]

  if sum(cnt.values())==0:
    return None

  return max(cnt.items(), key=lambda k: k[1])[0]

# lexicons = select_lexicons(label_values, vocabs, PMI, q=0.75)

def run_predict(test_words, test_labels, lexicons, return_predict=False):
  """Score lexicon-based predictions against gold labels.

  Examples for which predict() returns None are skipped: they are left out of
  the F1 computation and counted towards the skip rate instead.

  Args:
    test_words: Sequence of token lists, one per example.
    test_labels: Gold labels aligned with test_words.
    lexicons: Dict mapping label -> {word: score}, passed to predict().
    return_predict: When true, also return the gold and predicted labels of
      the examples that were not skipped.

  Returns:
    (f1, skip), or (f1, skip, y_test, y_pred) when return_predict is true.
    f1 is the macro-averaged F1 over non-skipped examples and skip is the
    number of skipped examples divided by len(test_words).
  """
  kept_gold, kept_pred = [], []
  n_skipped = 0

  for tokens, gold in zip(test_words, test_labels):
    guess = predict(tokens, lexicons)
    if guess is None:
      n_skipped += 1
    else:
      kept_pred.append(guess)
      kept_gold.append(gold)

  f1 = f1_score(kept_gold, kept_pred, average='macro')
  skip = n_skipped/len(test_words)

  if not return_predict:
    return f1, skip
  return f1, skip, kept_gold, kept_pred

# run_predict(test_words, test_labels, lexicons)

## Hyperparameter Tuning 

In [41]:
# One empty result list per candidate quantile q (0.50, 0.55, ..., 0.95), for
# the F1 scores and for the skip rates.
all_f1 = {q: [] for q in np.arange(0.5, 1, 0.05)}
all_skip = {q: [] for q in np.arange(0.5, 1, 0.05)}

In [42]:

# Grid search over the quantile threshold q, scored on the validation split of
# each of the five folds.
#
# The accumulators are (re)created here so that re-running this cell starts
# from a clean slate instead of appending a second set of fold scores to the
# lists left over from a previous run.
all_f1 = {q: [] for q in np.arange(0.5, 1, 0.05)}
all_skip = {q: [] for q in np.arange(0.5, 1, 0.05)}

for i in range(5):
  train = pd.read_csv(f"Data/{i}_train.csv")
  val = pd.read_csv(f"Data/{i}_val.csv")

  texts = train["text"].values
  train_words = [preprocess(s, lemma=True) for s in texts]
  train_labels = train[target_column].values

  texts = val["text"].values
  val_words = [preprocess(s, lemma=True) for s in texts]
  val_labels = val[target_column].values

  # Counts and PMI scores are estimated on the training split only.
  Cx, Cy, Cxy = get_counters(train_words, train_labels)
  vocabs = Cx.keys()

  PMI = get_PMI(label_values, Cx, Cy, Cxy)

  # Build one lexicon per candidate q and record its validation scores.
  for q in np.arange(0.5, 1, 0.05):
    lexicons = select_lexicons(label_values, vocabs, PMI, q=q)
    f1, skip = run_predict(val_words, val_labels, lexicons)

    all_f1[q].append(f1)
    all_skip[q].append(skip)

  print("DONE",i)
  

DONE 0
DONE 1
DONE 2
DONE 3
DONE 4


In [43]:
import numpy as np

# For each candidate quantile q, print the mean and standard deviation of the
# F1 scores and skip rates stored in all_f1[q] and all_skip[q].
for q in np.arange(0.5, 1, 0.05):
  print(f"{q:.3f} >> F1:{np.mean(all_f1[q]):.3f}±{np.std(all_f1[q]):.3f}, SKIP: {np.mean(all_skip[q]):.3f}±{np.std(all_skip[q]):.3f}")

0.500 >> F1:0.165±0.017, SKIP: 0.057±0.012
0.550 >> F1:0.176±0.020, SKIP: 0.084±0.013
0.600 >> F1:0.174±0.013, SKIP: 0.112±0.016
0.650 >> F1:0.123±0.021, SKIP: 0.232±0.013
0.700 >> F1:0.125±0.020, SKIP: 0.315±0.012
0.750 >> F1:0.124±0.014, SKIP: 0.426±0.029
0.800 >> F1:0.128±0.024, SKIP: 0.451±0.020
0.850 >> F1:0.132±0.022, SKIP: 0.467±0.018
0.900 >> F1:0.132±0.022, SKIP: 0.467±0.018
0.950 >> F1:0.132±0.022, SKIP: 0.467±0.018


## Evaluate Lexicons

In [45]:
from sklearn.metrics import precision_recall_fscore_support
import json

# Per-fold test-set metrics. These lists deliberately use names different from
# the all_f1 / all_skip dictionaries filled during hyperparameter tuning, so
# running this cell does not overwrite them and the tuning cells can still be
# re-run afterwards.
test_f1 = []
test_skip = []
test_p = []
test_r = []

for i in range(5):
  train = pd.read_csv(f"Data/{i}_train.csv")
  test = pd.read_csv(f"Data/{i}_test.csv")

  texts = train["text"].values
  train_words = [preprocess(s, lemma=True) for s in texts]
  train_labels = train[target_column].values

  texts = test["text"].values
  test_words = [preprocess(s, lemma=True) for s in texts]
  test_labels = test[target_column].values

  # Counts and PMI scores are estimated on the training split only.
  Cx, Cy, Cxy = get_counters(train_words, train_labels)
  vocabs = Cx.keys()

  PMI = get_PMI(label_values, Cx, Cy, Cxy)

  # NOTE(review): q is fixed at 0.50 here, while the tuning table printed
  # earlier shows its highest mean validation F1 at q=0.55 -- confirm that
  # this choice is intended.
  lexicons = select_lexicons(label_values, vocabs, PMI, q=0.50)

  # Persist the lexicon built from this fold's training split.
  with open(f'Results/TaskC/lexicon_pmi_train_{i}.json', 'w') as outfile:
      json.dump(lexicons, outfile)

  # run_predict supplies the skip rate and the non-skipped gold/predicted
  # labels; precision, recall and F1 are then computed from those labels.
  _, skip, y_test, y_pred = run_predict(test_words, test_labels, lexicons, return_predict=True)
  p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

  test_p.append(p)
  test_r.append(r)
  test_f1.append(f1)
  test_skip.append(skip)

print(f"F1:{np.mean(test_f1):.3f}±{np.std(test_f1):.3f}, SKIP: {np.mean(test_skip):.3f}±{np.std(test_skip):.3f}")
print(f"P:{np.mean(test_p):.3f}±{np.std(test_p):.3f}, R: {np.mean(test_r):.3f}±{np.std(test_r):.3f}")

F1:0.174±0.013, SKIP: 0.061±0.015
P:0.188±0.016, R: 0.218±0.013


In [46]:
# Build the lexicons from the complete training file, leaving out rows whose
# target column holds the value "none".
data = pd.read_csv("../Data/starting_ki/train_all_tasks.csv")
has_label = data[target_column] != "none"
data = data[has_label]

labels = data[target_column].values
texts = data["text"].values
words = [preprocess(sentence, lemma=True) for sentence in texts]

# Counts -> PMI scores -> per-label lexicons at the 0.50 quantile.
Cx, Cy, Cxy = get_counters(words, labels)
vocabs = Cx.keys()
PMI = get_PMI(label_values, Cx, Cy, Cxy)
lexicons = select_lexicons(label_values, vocabs, PMI, q=0.50)

In [47]:
# Re-order every label's lexicon by descending score and print the label
# together with its first ten words.
for c, entries in lexicons.items():
  ranked = sorted(entries.items(), key=lambda pair: -pair[1])
  lexicons[c] = dict(ranked)

  print(c, [term for term, _score in ranked[:10]])

1.1 threats of harm ['laura', 'loomer', 'kosher', 'printer', 'jams', 'afar', 'sip', 'napalm', 'ripe', 'harmed']
1.2 incitement and encouragement of harm ['nine', 'mm', 'aboutism', 'collect', 'resale', 'befall', 'whte', 'decked', 'bbw', 'exterminated']
2.1 descriptive attacks ['initial', 'flocking', 'penguins', 'hesitating', 'iceberg', 'dried-up', 'negates', 'yah', 'nawalts', 'behalf']
2.2 aggressive and emotive attacks ['collecting', 'solace', 'moral', 'relativism', 'fifty', 'binge', 'somthing', 'bauble', 'bling', 'lulz']
2.3 dehumanising attacks & overt sexual objectification ['debit', 'mika', 'brzezinski', 'msnbc', 'futas', 'hello', '57', 'dems', 'vaginally', 'intentionally']
3.1 casual use of gendered slurs, profanities, and insults ['/u/08winchester', 'predictably', 'lee', 'senate', 'australia', 'chained', 'iv', 'lean', 'comedies', 'nervousness']
3.2 immutable gender differences and gender stereotypes ['same-gender', 'operates', 'holidays', 'latte', 'remote', '😱', 'complaints', 'un

In [48]:
import json
# Write the lexicons dictionary as JSON to Results/TaskC/lexicon_pmi.json.
with open('Results/TaskC/lexicon_pmi.json', 'w') as handle:
    handle.write(json.dumps(lexicons))