In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
cd /content/gdrive/MyDrive/PHD/SemEval2023/SemEval2022-Task10/private_space

/content/gdrive/MyDrive/PHD/SemEval2023/SemEval2022-Task10/private_space


In [4]:
import json

with open('Results/lexicon_shapley.json') as fin:
    lexicon_shapley = json.load(fin)

In [5]:
ls Results

1a_bert.csv                        predicted_data_gab.csv
1b_bert.csv                        shapley_lexicon_1-aug.txt
1c_bert.csv                        shapley_lexicon_1.txt
lexicon_pmi.json                   shapley_lexicon_2.txt
lexicon_shapley_augmented_v1.json  shapley_lexicon_3.txt
lexicon_shapley.json               shapley_lexicon_4.txt
PMI_lexicon_1.txt                  shapley_values_1.pickle
PMI_lexicon_2.txt                  shapley_values_2.pickle
PMI_lexicon_3.txt                  shapley_values_3.pickle
PMI_lexicon_4.txt                  shapley_values_4.pickle
PMI.pickle


In [None]:
import pandas as pd
unsupervised_data_gab = pd.read_csv("Data/starting_ki/gab_1M_unlabelled.csv")
unsupervised_data_reddit = pd.read_csv("Data/starting_ki/reddit_1M_unlabelled.csv")

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [None]:
from collections import Counter

In [None]:
from tqdm import tqdm
tqdm.pandas()

import string

def preprocess(sent):
  sent = sent.lower()
  words = nltk.word_tokenize(sent)

  # Remove Stop words
  words = [w for w in words if w not in stoplist]

  # Remove Punctuation
  words = [w for w in words if w not in string.punctuation]
  
  # Lematization
  
  words = [lemmatizer.lemmatize(w) for w in words]
  return words

def pandas_preprocess(row):
  return preprocess(row["text"])

texts = unsupervised_data_gab.progress_apply(pandas_preprocess, axis=1).values

100%|██████████| 1000000/1000000 [06:29<00:00, 2569.50it/s]


In [None]:
texts[0]

['saying', '....', 'lose', 'biatches']

In [None]:
def cal_PMI(texts, window=5):
  counter = Counter()
  vocabs = set()
  Nx = 0
  Nxy = 0

  for sent in tqdm(texts, total=len(texts)):
    vocabs.update(sent)
    for w in sent:
      counter[(None, w)] += 1
      Nx += 1
    
    for aidx in range(len(sent)):
      for bidx in range(aidx+1, len(sent)):
        wa = sent[aidx]
        wb = sent[bidx]
        counter[(wa, wb)] += 1
        counter[(wb, wa)] += 1

        Nxy += 1
  return counter, vocabs, Nx, Nxy

PMI = cal_PMI(texts)

100%|██████████| 1000000/1000000 [03:25<00:00, 4867.23it/s]


In [None]:
# Estimate Threshold
import random
import numpy as np
import math

def get_PMI(PMI, wa, wb):
  PMI_counter, PMI_vocabs, PMI_Nx, PMI_Nxy = PMI

  Px = PMI_counter[(None, wa)]/PMI_Nx
  Py = PMI_counter[(None, wb)]/PMI_Nx
  Pxy = PMI_counter[(wa, wb)]/PMI_Nxy

  delta = 1e-10
  p = max(math.log2((Pxy+delta)/(Px*Py)), 0)
  return p

# N = int(len(PMI[1])*0.5)
N = 100000
wordA = random.sample(PMI[1], N)
wordB = random.sample(PMI[1], N)
values = [get_PMI(PMI, wa, wb) for wa, wb in zip(wordA, wordB)]

In [None]:
# import matplotlib.pyplot as plt
# plt.hist(values, bins=20)
# plt.show()

In [None]:
len(values), np.mean(values), np.quantile(values, 0.99), max(values)

(100000, 11.41148994028057, 13.35977390501769, 16.857735028142546)

In [None]:
threshold = np.quantile(values, 0.99)
threshold

13.35977390501769

In [None]:
vocabs = set()
for label in lexicon_shapley:
  vocabs.update(lexicon_shapley[label].keys())

def augment(PMI, word, threshold):
  PMI_counter, PMI_vocabs, PMI_Nx, PMI_Nxy = PMI
  if word not in PMI_vocabs:
    return []

  candidates = []
  for v in PMI_vocabs:
    pmi = get_PMI(PMI, word, v)
    if pmi > threshold:
      candidates.append((v, pmi))
  return candidates

augmented_lexicons = {}
for v in tqdm(vocabs, total=len(vocabs)):
  candidates = augment(PMI, v, threshold)
  augmented_lexicons[v] = candidates

100%|██████████| 1191/1191 [15:02<00:00,  1.32it/s]


In [None]:
new_lexicon_shapley = {}
for label in lexicon_shapley:
  new_lexicon_shapley[label] = {}

  lexicons = lexicon_shapley[label]

  new_lexicon = {}
  for word in lexicons:
    for new_word, pmi in augmented_lexicons[word]:
      val = math.sqrt(lexicons[word]*pmi)
      if new_word in new_lexicon:
        new_lexicon[new_word] = min(val, new_lexicon[new_word])
      else:
        new_lexicon[new_word] = val

  pmi_values = list(new_lexicon.values())
  pmi_threshold = np.quantile(pmi_values, 0)
  pmi_max = max(pmi_values)
  pmi_min = min(pmi_values)

  # print(pmi_threshold==pmi_min)

  for word in lexicons:
    new_lexicon_shapley[label][word] = lexicons[word]

  for word in new_lexicon:
    new_lexicon_shapley[label][word] = new_lexicon[word]

In [None]:
import json
with open('Results/lexicon_shapley_augmented_v1.json', 'w') as outfile:
    json.dump(new_lexicon_shapley, outfile)

In [239]:
ls -al Results/lexicon_*

-rw------- 1 root root  66018 Jan 12 14:33 Results/lexicon_pmi.json
-rw------- 1 root root 280150 Jan 12 18:49 Results/lexicon_shapley_augmented_v1.json
-rw------- 1 root root  42420 Jan 12 14:17 Results/lexicon_shapley.json


In [7]:
import pickle

# with open("Results/PMI.pickle", 'wb') as fin:
#   pickle.dump(PMI, fin)

In [8]:
with open("Results/PMI.pickle", 'rb') as fin:
    PMIv2 = pickle.load(fin)

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# target_column = "label_category"
# data = pd.read_csv("Data/starting_ki/train_all_tasks.csv")
# data = data[data[target_column]!="none"]

# train, test = train_test_split(data, test_size=0.2, random_state=42)

# label_values = [
#     '1. threats, plans to harm and incitement',
#     '2. derogation',
#     '3. animosity',
#     '4. prejudiced discussions',
# ]

# label_map = {
#     0: '1. threats, plans to harm and incitement',
#     1: '2. derogation', 
#     2: '3. animosity', 
#     3: '4. prejudiced discussions',
#     '1. threats, plans to harm and incitement':0,
#     '2. derogation':1,
#     '3. animosity':2,
#     '4. prejudiced discussions':3,
# }

# def preprocess(sent, **kwargs):
#   sent = sent.lower()
#   words = nltk.word_tokenize(sent)

#   # Remove Stop words
#   words = [w for w in words if w not in stoplist]

#   # Remove Punctuation
#   words = [w for w in words if w not in string.punctuation]
  
#   # Lematization
#   if "lemma" in kwargs and kwargs["lemma"]:
#     words = [lemmatizer.lemmatize(w) for w in words]
    
#   return words

# from sklearn.metrics import f1_score

# def predict(words, lexicons):
#   cnt = {}
#   for label in lexicons:
#     cnt[label] = 0

#   for w in words:
#     for label in lexicons:
#       if w in lexicons[label]:
#         cnt[label] += lexicons[label][w]

#   if sum(cnt.values())==0:
#     return None
  
#   return max(cnt.items(), key=lambda k: k[1])[0]


# def run_eval(data, lexicons, **kwargs):
#   texts = data["text"].values
#   sents = [preprocess(t, **kwargs) for t in texts]
#   labels = data[target_column].values

#   y_pred = []
#   y_test = []
#   non = 0
#   for words, label in zip(sents, labels):
#     pred = predict(words, lexicons)
#     if pred is None:
#       non += 1
#       continue

#     y_pred.append(pred)
#     y_test.append(label)

#   print("F1:", f1_score(y_test, y_pred, average='macro'))
#   print("Skip:", non/len(sents))
#   return 

# print("Evaluate Augmented Shapley Lexicons")
# run_eval(test, new_lexicon_shapley, lemma=False)