In [1]:
import gzip
import json
from itertools import islice
from os import walk

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Gzip-compressed file read line by line; each line is decoded as UTF-8 and
# parsed as one JSON document.
Path = r'C:\Users\marvi\Desktop\New_Concatenated_MatchingFile.gz'

# Only this many leading records are used by the rest of the notebook.
N_ROWS = 1000

# Stop reading after N_ROWS lines instead of decompressing and parsing the
# whole file and discarding everything past the first N_ROWS rows afterwards.
# NOTE(review): the DataFrame's column set and dtypes are now inferred from
# these N_ROWS records only, not from the full file - confirm that no column
# used later is absent from the leading records.
with gzip.open(Path, 'rb') as dataFile:
    LBData = [json.loads(line.decode('utf-8')) for line in islice(dataFile, N_ROWS)]

data = pd.DataFrame(LBData)

In [3]:
# Fields that are stringified and joined (space-separated) into one text per row.
columns = [
    'name',
    'addressregion',
    'streetaddress',
    'addresslocality',
    'addresscountry',
    'longitude',
    'latitude',
]

# Row-wise join of the selected fields after casting every value to str.
data['concat'] = data[columns].astype(str).apply(lambda row: ' '.join(row), axis=1)

# Integer group number per distinct 'telephoneNorm' value; used later as the label.
data['clusterID'] = data.groupby(by='telephoneNorm').ngroup()


In [4]:
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

# The training labels are data['clusterID'], produced by groupby(...).ngroup(),
# i.e. group numbers starting at 0. The classification head therefore needs one
# output per cluster; the default head has only 2 outputs.
# NOTE(review): rows with a missing 'telephoneNorm' may get a negative group
# number depending on the pandas version - confirm none are present.
NUM_LABELS = int(data['clusterID'].max()) + 1

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels = NUM_LABELS)

# `model_max_length` is the tokenizer attribute that bounds truncation/padding;
# `max_length` is a per-call argument, not a constructor option.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length = 512)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [5]:
def applyTokens(text):
    """Run the notebook-level ``tokenizer`` on ``text``.

    The call requests truncation and padding to the maximum length
    (``truncation=True``, ``padding='max_length'``) and returns the
    tokenizer's result unchanged.
    """
    return tokenizer(text, truncation = True, padding = 'max_length')

# Tokenize every concatenated record; each cell holds one applyTokens() result.
data['tokens'] = data['concat'].apply(applyTokens)

# Expand each tokenizer result into one column per field.
df = data['tokens'].apply(pd.Series)

# One entry per row, laid out as [input_ids, attention_mask].
token_fields = ['input_ids', 'attention_mask']
inputIDS = df[token_fields].values.tolist()

In [6]:
# Features: per-row [input_ids, attention_mask]; target: the cluster id.
x = inputIDS
y = data['clusterID']

from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 partition is identical on every Restart & Run All;
# without it each run trains and evaluates on a different split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [7]:
import torch 

# Convert both feature splits from nested Python lists to tensors.
x_train, x_test = (torch.tensor(split) for split in (x_train, x_test))

In [8]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments

from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
#import wandb
import os

In [9]:
# define accuracy metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores); the arg-max over the last
            axis of ``predictions`` is taken as the predicted class.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # average='binary' is rejected by scikit-learn (ValueError) as soon as more
    # than two distinct classes occur in labels/predictions, which is the case
    # when the labels are cluster ids. Keep 'binary' for the two-class case and
    # fall back to the unweighted per-class mean ('macro') otherwise.
    n_classes = np.union1d(labels, preds).size
    average = 'binary' if n_classes <= 2 else 'macro'

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = r'C:\Users\marvi\Desktop\TeamProject\Result',
    num_train_epochs=3,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,    # effective train batch = 4 * 16 = 64 per device
    per_device_eval_batch_size= 8,
    evaluation_strategy = "epoch",
    # load_best_model_at_end needs checkpoints saved on the same schedule as
    # evaluation, otherwise there is no checkpoint matching the best eval.
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    # NOTE(review): with ~800 training rows and an effective batch of 64 there
    # are far fewer than 500 optimizer steps in 3 epochs, so warmup would never
    # finish - confirm this value is intended.
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    # Mixed precision is only usable on CUDA; requesting it without a GPU fails.
    fp16 = torch.cuda.is_available(),
    logging_dir=r'C:\Users\marvi\Desktop\TeamProject\Result',
    # On Windows, DataLoader workers are spawned and cannot import a Dataset
    # class defined inside the notebook, so load in the main process there.
    dataloader_num_workers = 0 if os.name == 'nt' else 8,
    run_name = 'roberta-classification'
)

NVIDIA GeForce RTX 3080 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 compute_37.
If you want to use the NVIDIA GeForce RTX 3080 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [25]:
class dataDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: either a mapping ``{field_name: sequence of per-sample
            values}`` or an array-like / tensor of N stacked
            ``[input_ids, attention_mask]`` pairs (the layout built for
            ``inputIDS`` in this notebook).
        labels: sequence of N labels (list, array, tensor or pandas Series).

    Raises:
        ValueError: if non-mapping ``encodings`` are not shaped ``(N, 2, ...)``.
    """

    def __init__(self, encodings, labels):
        if not hasattr(encodings, 'items'):
            # Not a mapping: split the stacked pairs into the two named fields
            # that __getitem__ iterates over.
            stacked = torch.as_tensor(encodings)
            if stacked.dim() < 2 or stacked.shape[1] != 2:
                raise ValueError(
                    'non-mapping encodings must have shape (N, 2, ...), got '
                    + str(tuple(stacked.shape))
                )
            encodings = {
                'input_ids': stacked[:, 0],
                'attention_mask': stacked[:, 1],
            }
        self.encodings = encodings
        # Store labels positionally. A pandas Series keeps its original index
        # after a shuffled split, so `labels[idx]` would look up by index label
        # (KeyError or wrong row) instead of by position.
        self.labels = list(labels)

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor without the copy-construct warning."""
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

In [27]:
# Wrap each (features, labels) split in a dataDataset: train first, then test.
train_data, test_data = (
    dataDataset(features, targets)
    for features, targets in ((x_train, y_train), (x_test, y_test))
)

In [None]:
# Assemble the Trainer from the model, arguments, datasets and metric callback
# defined in the cells above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

# Report which device type torch can see; the value is only displayed here.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device