In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)


Thu Feb 22 00:01:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              47W / 350W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted to decimal gigabytes (1e9 bytes).
# Note that this is the total figure, not the amount currently free.
ram_gb = virtual_memory().total / 1e9
ram_report = 'Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb)
print(ram_report)

# 20 GB is the cut-off this cell uses to tell a high-RAM runtime from a standard one.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
import tensorflow as tf
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Report what was selected.
if use_cuda:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-40GB


In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from functools import reduce
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

# 01. Imports

In [None]:
!pip install transformers

In [None]:
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import RobertaTokenizer, RobertaConfig, RobertaForSequenceClassification, AdamW

In [None]:
# Build the tokenizer for the 'roberta-base' pretrained checkpoint.
# NOTE(review): RoBERTa vocabularies are usually case-sensitive -- confirm that
# do_lower_case=True is intended (and has an effect) for this tokenizer class.
print('Loading RoBERTa tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base',
    do_lower_case=True,
)

In [None]:
# Create a roberta-base sequence classifier with one output per entry of label_dict.
# label_dict is not defined in the cells above; it must already exist in the kernel.
n_classes = len(label_dict)
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=n_classes,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Restore the fine-tuned weights into the model. map_location keeps the loaded
# tensors on the CPU regardless of the device they were saved from.
checkpoint_path = '/content/finetuned_RoBERTa_epoch_10.model'
model.load_state_dict(
    torch.load(checkpoint_path, map_location=torch.device('cpu'))
)

# 02. Implementation (Takes 6+ hours on A100)

In [None]:
years = ["2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022"]

# Class index attached to every row so that the dataset carries a labels tensor.
# NOTE(review): evaluate() is defined outside the visible cells and presumably
# expects labels in each batch -- confirm. The value itself is only a placeholder.
PLACEHOLDER_LABEL = 1

# Inverse of label_dict (value -> key). It does not depend on the year, so it is
# built once here instead of on every loop iteration.
label_dict_reversed = {v: k for k, v in label_dict.items()}


def predicted_list_generate(preds, labels):
  """Return the arg-max index along axis 1 of ``preds`` as a plain Python list.

  Args:
    preds: Array with at least two dimensions; the arg-max is taken over axis 1.
    labels: Unused. Kept so that existing two-argument calls keep working.

  Returns:
    A flat list with one index per row of ``preds``.
  """
  return np.argmax(preds, axis=1).flatten().tolist()


# For every year: read ../<year>.csv, predict a category for each row's `text`,
# and write the input columns plus `category_pred` to ../<year>_Predicted.csv.
for year in years:
  df_full = pd.read_csv(f"../{year}.csv", lineterminator='\n')

  # Encode every text in the file. padding='max_length' replaces the deprecated
  # pad_to_max_length=True: each sequence is padded/truncated to max_length tokens.
  # NOTE(review): rows with a missing `text` value are not handled here.
  encoded_df_pred = tokenizer.batch_encode_plus(
      df_full["text"].tolist(),
      add_special_tokens=True,
      return_attention_mask=True,
      padding='max_length',
      truncation=True,
      max_length=256,
      return_tensors='pt'
  )

  input_ids_pred = encoded_df_pred['input_ids']
  attention_masks_pred = encoded_df_pred['attention_mask']
  # Placeholder labels are built directly as a tensor rather than via temporary
  # DataFrame columns, so input columns named `label` / `data_type` /
  # `predicted_label` are no longer overwritten and dropped from the output.
  labels_pred = torch.tensor([PLACEHOLDER_LABEL] * len(df_full), dtype=torch.long)

  dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred, labels_pred)

  # SequentialSampler keeps the batches in row order, which is what allows the
  # predictions to be attached back to the rows positionally below.
  dataloader_pred = DataLoader(dataset_pred,
                              sampler=SequentialSampler(dataset_pred),
                              batch_size=batch_size)

  _ignored, predictions, true_vals = evaluate(dataloader_pred)
  prediction = predicted_list_generate(predictions, true_vals)

  # Translate each predicted index through label_dict_reversed; an index with no
  # entry becomes NaN. assign() returns a new frame, leaving df_full unmodified,
  # and raises if the number of predictions does not match the number of rows.
  df_pred = df_full.assign(
      category_pred=[label_dict_reversed.get(index, np.nan) for index in prediction]
  )

  # Save
  df_pred.to_csv(f"../{year}_Predicted.csv", index=False)
