In [2]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.1.0+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [3]:
import torch_geometric
from torch_geometric.datasets import Planetoid

#Tutorial1: Introduction

#Load the dataset

In [4]:
dataset = Planetoid(root="tutorial1", name= "Cora")

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


Dataset properties

In [6]:
print(dataset)
print(f"Number of graphs:\t\t {len(dataset)}")
print(f"Number of classes:\t\t {dataset.num_classes}")
print(f"Number of node features:\t {dataset.num_node_features}")
print(f"Number of edge features: \t {dataset.num_edge_features}")

Cora()
Number of graphs:		 1
Number of classes:		 7
Number of node features:	 1433
Number of edge features: 	 0


Dataset shapes

In [7]:
print(dataset.data)

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])




In [9]:
#shape of 2 multiplied by 10556
#node 0 goes into 633, 1862, 2582...
print(f"edge_index:\t\t {dataset.data.edge_index.shape}")
print(dataset.data.edge_index)
print("\n")

#1-d tensor, boolean values that indicate which set is on training,
#validation or test
print(f"train_mask:\t\t {dataset.data.train_mask.shape}")
print(dataset.data.train_mask)
print("\n")

#2708nodes, each node containing 1433 features
print(f"x:\t\t {dataset.data.x.shape}")
print(dataset.data.x)
print("\n")

#node labelling
print(f"y:\t\t {dataset.data.y.shape}")
print(dataset.data.y)

edge_index:		 torch.Size([2, 10556])
tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])


train_mask:		 torch.Size([2708])
tensor([ True,  True,  True,  ..., False, False, False])


x:		 torch.Size([2708, 1433])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


y:		 torch.Size([2708])
tensor([3, 4, 4,  ..., 3, 3, 3])


In [10]:
import os.path as osp

import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

In [12]:
data = dataset[0]

Defining the neural network

In [22]:
class Net(torch.nn.Module):
  def __init__(self):
    super(Net, self).__init__()

    self.conv = SAGEConv(dataset.num_features,
                         dataset.num_classes,
                         aggr="max") # specifying the aggregation

  def forward(self):
    x = self.conv(data.x, data.edge_index)
    return F.log_softmax(x, dim=1)

In [23]:
#to put model on cpu or gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [24]:
def train():
  model.train()
  optimizer.zero_grad()
  F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
  optimizer.step()

In [25]:
def test():
  model.eval()
  logits, accs = model(), []
  for _, mask in data("train_mask", "val_mask", "test_mask"):
    pred = logits[mask].max(1)[1]
    acc = pred.eq(data.y[mask]).sum().item()/ mask.sum().item()
    accs.append(acc)
  return accs

In [26]:
best_val_acc = test_acc = 0
for epoch in range(1, 100):
  train()
  _, val_acc, tmp_test_acc = test()
  if val_acc > best_val_acc:
    best_val_acc = val_acc
    test_acc = tmp_test_acc
  log = "Epoch: {:03d}, Val: {:.4f}, Test: {:.4f}"

  if epoch % 10 == 0:
    print(log.format(epoch, best_val_acc, test_acc))

Epoch: 010, Val: 0.7280, Test: 0.7190
Epoch: 020, Val: 0.7280, Test: 0.7190
Epoch: 030, Val: 0.7280, Test: 0.7190
Epoch: 040, Val: 0.7280, Test: 0.7190
Epoch: 050, Val: 0.7280, Test: 0.7190
Epoch: 060, Val: 0.7280, Test: 0.7190
Epoch: 070, Val: 0.7280, Test: 0.7190
Epoch: 080, Val: 0.7280, Test: 0.7190
Epoch: 090, Val: 0.7280, Test: 0.7190


#Trying to load own dataset

Embedding the sentences

In [5]:
import pandas as pd

In [11]:
data = pd.read_csv("agr_en_train.csv")

In [12]:
data

Unnamed: 0,POST,Comments,Classification
0,facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,OAG
1,facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG
2,facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",OAG
3,facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,OAG
4,facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",NAG
...,...,...,...
11994,facebook_corpus_msr_394638,They belong to you flight dirty terrorist coun...,OAG
11995,facebook_corpus_msr_429177,"Really motivating programme, congratulations t...",NAG
11996,facebook_corpus_msr_2032370,fabricated news,OAG
11997,facebook_corpus_msr_1722926,What's wrong with you secular idiots,OAG


In [13]:
data['Comments']

0        Well said sonu..you have courage to stand agai...
1        Most of Private Banks ATM's Like HDFC, ICICI e...
2           Now question is, Pakistan will adhere to this?
3        Pakistan is comprised of fake muslims who does...
4        ??we r against cow slaughter,so of course it w...
                               ...                        
11994    They belong to you flight dirty terrorist coun...
11995    Really motivating programme, congratulations t...
11996                                      fabricated news
11997                 What's wrong with you secular idiots
11998    Looks like inevitable after all political hard...
Name: Comments, Length: 11999, dtype: object

In [17]:
!pip install sentence_transformers



In [20]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/use-cmlm-multilingual')

for comment in data['Comments'][0:10]:
    print(model.encode(comment))

Downloading (…)7729f/.gitattributes:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)a63d77729f/README.md:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading (…)3d77729f/config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)7729f/tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)a63d77729f/vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading (…)d77729f/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/sentence-transformers_use-cmlm-multilingual/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[-0.00652132 -0.00492031  0.04025589 -0.03243421 -0.01074097 -0.06785867
  0.0269713  -0.01575314 -0.03266335  0.01177471  0.00306701  0.0327425
  0.03902072 -0.00274698  0.00353636  0.02122785  0.00751358 -0.01918032
  0.01756336  0.04298618  0.04358811  0.06350102 -0.04742334  0.04615135
 -0.00513171  0.06759537  0.08632553  0.00148574 -0.02959458 -0.02631752
  0.03718531 -0.01659323  0.01281352  0.00259998  0.00194734  0.01686916
 -0.00083092 -0.01006564 -0.03282909 -0.02027193  0.01607621  0.03122762
  0.03376517 -0.06073269  0.00523186 -0.01915884  0.00282113 -0.02521159
 -0.01359135  0.03272867  0.03399456  0.01103424  0.01559083 -0.00893444
 -0.00934248 -0.06393569 -0.0027667   0.02596988  0.04110724 -0.00418819
 -0.04188355 -0.00801683  0.00478555 -0.00246361  0.04935288  0.00832294
 -0.02834169  0.02730796  0.07358879 -0.00572001  0.03700849  0.06569271
 -0.0489553   0.0032382  -0.01007151 -0.0115098  -0.03437835  0.01856899
  0.00343369  0.00629603  0.0324493  -0.05940339 -0.

In [21]:
from torch.nn import CosineSimilarity

In [23]:
X = []
for comment in data['Comments'][0:10]:
    X.append(model.encode(comment))

In [24]:
X

[array([-0.00652132, -0.00492031,  0.04025589, -0.03243421, -0.01074097,
        -0.06785867,  0.0269713 , -0.01575314, -0.03266335,  0.01177471,
         0.00306701,  0.0327425 ,  0.03902072, -0.00274698,  0.00353636,
         0.02122785,  0.00751358, -0.01918032,  0.01756336,  0.04298618,
         0.04358811,  0.06350102, -0.04742334,  0.04615135, -0.00513171,
         0.06759537,  0.08632553,  0.00148574, -0.02959458, -0.02631752,
         0.03718531, -0.01659323,  0.01281352,  0.00259998,  0.00194734,
         0.01686916, -0.00083092, -0.01006564, -0.03282909, -0.02027193,
         0.01607621,  0.03122762,  0.03376517, -0.06073269,  0.00523186,
        -0.01915884,  0.00282113, -0.02521159, -0.01359135,  0.03272867,
         0.03399456,  0.01103424,  0.01559083, -0.00893444, -0.00934248,
        -0.06393569, -0.0027667 ,  0.02596988,  0.04110724, -0.00418819,
        -0.04188355, -0.00801683,  0.00478555, -0.00246361,  0.04935288,
         0.00832294, -0.02834169,  0.02730796,  0.0

In [25]:
cos = CosineSimilarity(X, X)

In [26]:
print(cos)

CosineSimilarity()
