In [None]:
#Runtime -> Change runtime type -> Hardware accelerator: GPU

import torch

# Select the compute device: the CUDA GPU when torch reports one, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print(f'There are {gpu_count} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 31.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


Note: before running the cells below, set the correct path to the dataset CSV (the file read with `pd.read_csv`).

In [None]:
import pandas as pd
import numpy as np
import datetime
import random
import time
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from collections import Counter
from transformers import get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoTokenizer, AutoModel, TFAutoModel
from transformers import BertForSequenceClassification, AdamW, BertConfig
from tensorflow.keras.utils import to_categorical
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
import torch

from termcolor import colored
from tqdm import tqdm

In [None]:
# Identifier the tokenizer and config are loaded from.
# NOTE(review): despite the variable name this looks like a Hugging Face Hub
# model id rather than a locally saved fine-tuned directory -- confirm that its
# vocabulary matches the fine-tuned classifier loaded later in the notebook.
saved_model_path = 'HooshvareLab/bert-fa-zwnj-base'

# `tokenizer` is the module-level object that pred() calls to encode sentences;
# the config is loaded from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
config = AutoConfig.from_pretrained(saved_model_path)



def pred(x, max_length=None):
    """Tokenize sentences into padded token-ID and attention-mask tensors.

    Every sentence is encoded with the module-level ``tokenizer``: special
    tokens are added and the sequence is padded or truncated to exactly
    ``max_length`` tokens.

    Args:
        x: Iterable of sentences (strings) to encode.
        max_length: Target sequence length. When ``None`` (the default) the
            notebook-level global ``max_len`` is used, which keeps the
            original one-argument call ``pred(x)`` working unchanged.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors, each with one row
        per sentence and ``max_length`` columns. For empty input both tensors
        have shape ``(0, max_length)``.
    """
    if max_length is None:
        # Backward compatibility: fall back to the global set by the caller.
        max_length = max_len

    input_ids = []
    attention_masks = []

    for sent in tqdm(x):
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'.
            max_length=max_length,
            padding='max_length',      # Replaces the deprecated pad_to_max_length=True.
            truncation=True,           # Explicit instead of implied by max_length.
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        # The attention mask differentiates padding from non-padding tokens.
        attention_masks.append(encoded_dict['attention_mask'])

    if not input_ids:
        # torch.cat raises on an empty list; return empty, correctly shaped tensors.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # Stack the per-sentence (1, max_length) tensors along the first axis.
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


In [None]:
# Read the preprocessed comments CSV into a DataFrame.
# NOTE(review): no visible cell mounts /content/drive -- confirm it is mounted
# before this cell runs.
csv_path = '/content/drive/MyDrive/snappfood_comments_preprocessed.csv'
data_pred = pd.read_csv(csv_path)

In [None]:
# Array of the values in the 'Cleaned' column (the sentences to classify).
cleaned_column = data_pred['Cleaned']
texts = cleaned_column.values

In [None]:
# Array of the values in the 'label_id' column (the reference labels).
label_column = data_pred['label_id']
labels = label_column.values

In [None]:
# Fraction of the rows, taken from the end of the dataset, that is held out for evaluation.
validation_ratio = 0.1

In [None]:
# Keep the trailing `validation_ratio` share of the texts as the evaluation set.
n_texts = len(texts)
test_texts = texts[int(n_texts - (n_texts * validation_ratio)):]

In [None]:
# Slice the labels at the same offset as the texts (computed from len(texts))
# so both arrays stay aligned row by row.
first_test_row = int(len(texts) - (len(texts) * validation_ratio))
test_labels = labels[first_test_row:]

In [None]:
# Sequence length every sentence is padded/truncated to; pred() reads this global.
max_len = 512
# Sentences per forward pass. Running one sentence at a time (batch size 1) needs
# one forward pass per row; batching gives the same per-sentence logits (up to
# floating-point noise) with far fewer passes.
pred_batch_size = 32

# Tokenize the held-out texts into ID and attention-mask tensors.
input_ids_t, attention_masks_t = pred(test_texts)

pred_dataset = TensorDataset(input_ids_t, attention_masks_t)
pred_dataloader = DataLoader(
    pred_dataset,
    sampler=SequentialSampler(pred_dataset),  # Keep dataset order so outputs align with test_texts.
    batch_size=pred_batch_size,
)

# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(pred_dataset)))

# Load the fine-tuned classifier, copy it to the selected device and put it in
# evaluation mode.
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/finetuned_model')
model.to(device)
model.eval()

# One logits array of shape (1, num_labels) per sentence, in dataset order. This
# is the same layout a batch size of 1 produced, so downstream cells that index
# `predictions[k]` keep working.
predictions = []

for batch in tqdm(pred_dataloader):
    # Move the batch tensors to the same device as the model.
    batch = tuple(t.to(device) for t in batch)
    b1, b2 = batch

    # No gradients are needed for inference; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(b1, attention_mask=b2)

    # The first output holds the classification logits; bring them back to the CPU.
    logits = outputs[0].cpu().numpy()

    # Store each row separately (keeping the leading axis of size 1).
    predictions.extend(logits[i:i + 1] for i in range(len(logits)))

print('    DONE.')

100%|██████████| 6948/6948 [00:04<00:00, 1570.30it/s]


Predicting labels for 6,948 test sentences...


100%|██████████| 6948/6948 [03:59<00:00, 29.00it/s]

    DONE.





In [None]:
# Predicted class id for every sentence: stack all stored logits row-wise, then
# take the argmax over the label axis. Stacking first keeps every row, so the
# result is complete for any batch size (indexing `[0]` per stored array would
# keep only the first row of each one).
preee = np.argmax(np.concatenate(predictions, axis=0), axis=1).tolist() if len(predictions) else []


In [None]:
# Preview only the first predictions; displaying the full list floods the notebook output.
preee[:20]

[0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,


In [None]:
# Put texts, predicted class ids and reference labels side by side, then export
# the table to an Excel file (with a header row, without the index column).
df = pd.DataFrame({'Text': test_texts, 'Predicted': preee, 'true_labels': test_labels})
df.to_excel('predictions.xlsx', index=False, header=True)


In [None]:
# Per-class precision / recall / F1 of the predictions against the reference labels.
target_names = ['class 0', 'class 1']
report_text = classification_report(y_true=test_labels, y_pred=preee, target_names=target_names)
print(report_text)

              precision    recall  f1-score   support

     class 0       0.90      0.82      0.86      3485
     class 1       0.84      0.90      0.87      3463

    accuracy                           0.86      6948
   macro avg       0.87      0.86      0.86      6948
weighted avg       0.87      0.86      0.86      6948

