In [4]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
from datasets import load_dataset

In [5]:
# Import our models. The package will take care of downloading the models automatically
model_ckpt = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [6]:
texts = [
    "There's a kid on a skateboard.",
    "A kid is skateboarding.",
    "A kid is inside the house."
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [7]:
# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
embeddings.size()

torch.Size([3, 768])

In [8]:
# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])
cosine_sim_1_2 = 1 - cosine(embeddings[1], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[1], texts[2], cosine_sim_1_2))

Cosine similarity between "There's a kid on a skateboard." and "A kid is skateboarding." is: 0.943
Cosine similarity between "There's a kid on a skateboard." and "A kid is inside the house." is: 0.439
Cosine similarity between "A kid is skateboarding." and "A kid is inside the house." is: 0.454


In [9]:
import numpy as np
""""""
def cosine_sim_matric(emeb):
    n = emeb.size(0)
    cos_metric = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            cos_metric[i][j] = 1 - cosine(emeb[i], emeb[j]) 
    return cos_metric

In [7]:
cosine_sim_matric(embeddings)

array([[1.        , 0.94252908, 0.4387798 ],
       [0.94252908, 1.        , 0.45440719],
       [0.4387798 , 0.45440719, 1.        ]])

In [8]:
data_files = {"train": "olid_train.json"}
dataset = load_dataset("Dataset" , data_files=data_files)

Using custom data configuration Dataset-e33be6df9e0fed9d


Downloading and preparing dataset json/Dataset to /root/.cache/huggingface/datasets/json/Dataset-e33be6df9e0fed9d/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/Dataset-e33be6df9e0fed9d/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
train_data = dataset["train"]

In [15]:
train_data = dataset['train'].rename_column('subtask_a' , 'labels')

In [16]:
train_data= train_data.remove_columns(['id' ,'subtask_b' , 'subtask_c'])

In [17]:
train_data

Dataset({
    features: ['tweet', 'labels'],
    num_rows: 1
})

In [19]:
import json , csv
with open("Dataset/olid_train.json" ,'r') as file :
    data = json.load(file)
data.keys()

dict_keys(['id', 'tweet', 'subtask_a', 'subtask_b', 'subtask_c'])

In [25]:
from collections import defaultdict

train_data = defaultdict(list)

for t_value , label_value in zip(data["tweet"].values(),data["subtask_a"].values()) :
    train_data["tweets"].append(t_value)
    train_data["labels"].append(1 if label_value =="OFF" else 0)
    train_data["labels_name"].append(label_value)


In [27]:
from datasets import Dataset
train_dataset = Dataset.from_dict(train_data)
train_dataset[0]

{'tweets': '@USER She should ask a few native Americans what their take on this is.',
 'labels': 1,
 'labels_name': 'OFF'}

In [28]:
train_dataset.to_json("Dataset/train.json")

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

2331148

In [16]:
inputs = tokenizer(train["tweets"], padding=True, truncation=True, return_tensors="pt")

In [17]:
# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
embeddings.size()

torch.Size([1655, 768])

In [14]:
metric = cosine_sim_matric(embeddings)

In [18]:
from LP import LabelPropagation , LabelSpreading

In [19]:
LP = LabelSpreading(kernel='knn',n_jobs=-1,n_neighbors=20)   
LP.fit(embeddings.numpy(),train["labels"])

In [53]:
with open("Dataset/olid_test.json","r") as file :
    test_data = json.load(file)
test_data.keys()

dict_keys(['id', 'tweet', 'subtask_a'])

In [124]:
test_dataset = defaultdict(list)
for v , vl in zip(test_data["tweet"].values()  ,test_data["subtask_a"].values() ):
    test_dataset["tweets"].append(v)
    test_dataset["labels"].append(0 if vl=='OFF' else 1)

In [125]:
test = Dataset.from_dict(test_dataset)

In [126]:
test = test.shard(num_shards=4 , index=0)

In [154]:
test["tweets"][:6]

['¿Who the fuck is Yoru?',
 '@USER Hahahaha I wish...but a week is good, I’m extremely happy I get to have to for that long. It’s a good test drive for me...🙌',
 '@USER @USER @USER Katy is one of the best poets we have!',
 'Last night before rolling over to fall asleep Jimmy says "I would do anything for you"  Man I love him with every last fiber of my being',
 '@USER sometimes things can be hard to hear.but we need to hear them.',
 'Damn these niggas b lame that’s crazy']

In [148]:
inputs_test = tokenizer(test["tweets"], padding=True, truncation=True, return_tensors="pt")

In [149]:
# Get the embeddings
with torch.no_grad():
    embeddings_test = model(**inputs_test, output_hidden_states=True, return_dict=True).pooler_output
embeddings_test.size()

torch.Size([972, 768])

In [150]:
preds = LP.predict(embeddings_test)

In [153]:
preds[:6]

array([1, 1, 1, 1, 1, 0])

In [93]:
testt =  test["tweets"][9:19]

In [95]:
inp = tokenizer(testt, padding=True, truncation=True, return_tensors="pt")

In [98]:
with torch.no_grad():
    embds = embeddings_test = model(**inp, output_hidden_states=True, return_dict=True).pooler_output
embds.size()

torch.Size([10, 768])

In [158]:
len(test["labels"])

972

In [163]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(test["labels"] , preds) * 100
print(f"Accuracy of Data-Labeling : {acc:0.2f}")

Accuracy of Data-Labeling : 83.64
