In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [2]:
def load_imdb_data(data_file):
    df = pd.read_csv("IMDB Dataset.csv")
    texts = df['review'].tolist()
    labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment'].tolist()]
    return texts, labels

In [3]:
cwd = os.getcwd()
df = pd.read_csv(r'C:\Users\rory\Documents\IMDB_Dataset.csv')
texts = df['review'].tolist()
labels = [1 if sentiment == "positive" else 0 for sentiment in df['sentiment'].tolist()]

In [4]:
directory = r'C:\Users\rory\Documents'
files = os.listdir(directory)
print(files)

['.ipynb_checkpoints', 'IMDB_Dataset.csv', 'kaggle_data', 'language_model.ipynb', 'My Music', 'My Pictures', 'My Videos', 'tester.ipynb', 'Uno']


In [5]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [6]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [7]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [8]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [9]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"

In [10]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [12]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [21]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


KeyboardInterrupt: 

In [13]:
test_text = "this movie was not great"
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print("The movie was great and I really enjoyed the performances of the actors.")
print(f"Predicted sentiment: {sentiment}")

The movie was great and I really enjoyed the performances of the actors.
Predicted sentiment: negative


In [14]:
model = BertModel.from_pretrained('bert-base-uncased')
"The movie was great and I really enjoyed the performances of the actors."

'The movie was great and I really enjoyed the performances of the actors.'

In [15]:
model.encode('The movie was great and I really enjoyed the performances of the actors.')

AttributeError: 'BertModel' object has no attribute 'encode'

In [16]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("My penis is [MASK] [MASK] .")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[{'score': 0.1475975215435028,
   'token': 2145,
   'token_str': 'still',
   'sequence': '[CLS] my penis is still [MASK]. [SEP]'},
  {'score': 0.1113625317811966,
   'token': 2061,
   'token_str': 'so',
   'sequence': '[CLS] my penis is so [MASK]. [SEP]'},
  {'score': 0.0420222245156765,
   'token': 3294,
   'token_str': 'completely',
   'sequence': '[CLS] my penis is completely [MASK]. [SEP]'},
  {'score': 0.03979630023241043,
   'token': 2025,
   'token_str': 'not',
   'sequence': '[CLS] my penis is not [MASK]. [SEP]'},
  {'score': 0.036821480840444565,
   'token': 2200,
   'token_str': 'very',
   'sequence': '[CLS] my penis is very [MASK]. [SEP]'}],
 [{'score': 0.14292874932289124,
   'token': 2524,
   'token_str': 'hard',
   'sequence': '[CLS] my penis is [MASK] hard. [SEP]'},
  {'score': 0.07940872013568878,
   'token': 14750,
   'token_str': 'aching',
   'sequence': '[CLS] my penis is [MASK] aching. [SEP]'},
  {'score': 0.05975637957453728,
   'token': 17061,
   'token_str': 'th

In [66]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")
text = ["456","789","I like playing with dinosaurs"]
encoded_input = tokenizer(text, return_tensors='tf',padding=True, truncation=True)
output = model(encoded_input)
output.last_hidden_state
features = output.last_hidden_state
features_list = tf.reshape(features, [features.shape[0], -1])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [67]:
type(output)

transformers.modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions

In [70]:
output.last_hidden_state

<tf.Tensor: shape=(3, 7, 768), dtype=float32, numpy=
array([[[-0.54378474,  0.2409769 , -0.24549067, ...,  0.23936334,
          0.44163346,  0.41809237],
        [ 0.25412276, -0.09595229, -0.21622771, ..., -0.31149828,
          0.15741283, -0.5495563 ],
        [ 1.1260716 ,  0.10477532,  0.58316606, ..., -0.58053017,
         -0.38747334, -0.6490364 ],
        ...,
        [-0.20509365, -0.15304819,  0.01746574, ...,  0.11709079,
          0.2677412 ,  0.35618937],
        [-0.5041675 ,  0.04745362, -0.06152503, ...,  0.22828154,
          0.21252683,  0.31339386],
        [-0.10899797, -0.11940696,  0.0282459 , ...,  0.1588327 ,
          0.1740753 ,  0.307178  ]],

       [[-0.5221435 ,  0.298836  , -0.28603116, ...,  0.2141225 ,
          0.5667204 ,  0.6360429 ],
        [ 1.3777694 , -0.3641031 ,  0.45124036, ...,  0.04624242,
          0.4333403 ,  0.48436096],
        [ 0.08393332, -0.23064086, -0.03523014, ..., -0.2599762 ,
         -0.41654408, -0.08601542],
        ...,
 

In [75]:
features = output.last_hidden_state
features_list = tf.reshape(features, [features.shape[0], -1])

In [78]:
features

<tf.Tensor: shape=(3, 7, 768), dtype=float32, numpy=
array([[[-0.54378474,  0.2409769 , -0.24549067, ...,  0.23936334,
          0.44163346,  0.41809237],
        [ 0.25412276, -0.09595229, -0.21622771, ..., -0.31149828,
          0.15741283, -0.5495563 ],
        [ 1.1260716 ,  0.10477532,  0.58316606, ..., -0.58053017,
         -0.38747334, -0.6490364 ],
        ...,
        [-0.20509365, -0.15304819,  0.01746574, ...,  0.11709079,
          0.2677412 ,  0.35618937],
        [-0.5041675 ,  0.04745362, -0.06152503, ...,  0.22828154,
          0.21252683,  0.31339386],
        [-0.10899797, -0.11940696,  0.0282459 , ...,  0.1588327 ,
          0.1740753 ,  0.307178  ]],

       [[-0.5221435 ,  0.298836  , -0.28603116, ...,  0.2141225 ,
          0.5667204 ,  0.6360429 ],
        [ 1.3777694 , -0.3641031 ,  0.45124036, ...,  0.04624242,
          0.4333403 ,  0.48436096],
        [ 0.08393332, -0.23064086, -0.03523014, ..., -0.2599762 ,
         -0.41654408, -0.08601542],
        ...,
 

In [63]:
output.pooler_output

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-7.17280507e-01, -1.76421538e-01,  3.32592070e-01,
         3.74702960e-01, -4.11120862e-01, -7.74121135e-02,
         6.02245152e-01,  1.30732611e-01,  2.29897082e-01,
        -9.99700367e-01, -1.33951187e-01,  3.52798700e-01,
         9.64773238e-01, -3.39830637e-01,  7.27449417e-01,
        -2.94809639e-01, -3.05940136e-02, -4.45753813e-01,
         3.95324856e-01, -1.86041817e-01,  3.07913929e-01,
         9.29253757e-01,  3.82708579e-01,  1.57621950e-01,
         3.30947280e-01,  2.25208014e-01, -5.28438330e-01,
         7.90538669e-01,  8.86050940e-01,  5.85440636e-01,
        -3.84449661e-01,  1.45747378e-01, -9.77239013e-01,
        -1.30338177e-01,  1.71480313e-01, -9.81212378e-01,
         7.22568929e-02, -6.06909394e-01,  1.15858190e-01,
         8.72076675e-02, -7.51980364e-01,  2.25891143e-01,
         9.96358871e-01,  2.07225367e-01, -3.77747007e-02,
        -3.50273728e-01, -9.97534692e-01,  1.68136448e-01,
      

In [11]:
[[1,2,3],
 [3,4,5,6],
 [7,8,9]]

(<tf.Tensor: shape=(1, 12, 768), dtype=float32, numpy=
 array([[[ 0.13862704,  0.15826836, -0.2966649 , ..., -0.27084973,
          -0.28436327,  0.45808414],
         [ 0.53636336, -0.23269622,  0.17541951, ...,  0.55402565,
           0.49807116, -0.00240759],
         [ 0.30023715, -0.34751177,  0.12084441, ..., -0.45624903,
           0.32880178,  0.87728167],
         ...,
         [ 0.37985945,  0.12028794,  0.82829225, ..., -0.86237276,
          -0.59569633,  0.04711594],
         [-0.02524202, -0.7176754 , -0.6950472 , ...,  0.07574195,
          -0.66678166, -0.3400749 ],
         [ 0.75353885,  0.23910885,  0.07174353, ...,  0.24671493,
          -0.64580613, -0.32129788]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-0.9376786 , -0.5042589 , -0.979893  ,  0.9030439 ,  0.9329325 ,
         -0.24377505,  0.8925754 ,  0.228806  , -0.9531208 , -0.99999535,
         -0.8862303 ,  0.99055725,  0.9855201 ,  0.71552855,  0.9454762 ,
         -0.864

In [79]:
pip show transformers

Name: transformers
Version: 4.36.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: c:\users\rory\anaconda3\envs\tf\lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.
