In [1]:
!pip install transformers
!pip install nlpaug
!pip install nltk
!pip install torch
!pip install pandas

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [2]:
!pip install pandas



In [3]:

import pandas as pd
import nltk
import collections
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer
from sklearn.utils import shuffle
from transformers import BertForSequenceClassification

In [15]:
from tqdm.auto import tqdm

In [4]:
# Load the tweet/sentiment CSV, keep only the text and label columns, drop
# incomplete rows and cap the working set at the first 5000 rows.
df = (
    pd.read_csv("gptsentiment.csv")[["tweets", "labels"]]
    .dropna()
    .reset_index(drop=True)
)
df = df[:5000]

# Label names in order of first appearance; that position becomes the class id.
labelset = df.labels.unique()
label_map = {name: idx for idx, name in enumerate(labelset)}
print(label_map)
# Inverse lookup: class id -> label name.
reverse_map = dict(enumerate(labelset))
print(df.head())

{'neutral': 0, 'good': 1, 'bad': 2}
                                              tweets   labels
0  ChatGPT: Optimizing Language Models for Dialog...  neutral
1  Try talking with ChatGPT, our new AI system wh...     good
2  ChatGPT: Optimizing Language Models for Dialog...  neutral
3  THRILLED to share that ChatGPT, our new model ...     good
4  As of 2 minutes ago, @OpenAI released their ne...      bad


In [5]:
# Class balance of the label column (this cell precedes the integer encoding of the labels).
df['labels'].value_counts()

labels
bad        2343
good       1355
neutral    1302
Name: count, dtype: int64

In [6]:
# Encode the label names as integer class ids using `label_map`.
# The guard keeps the cell safe to re-run: once the column holds ids instead of
# label names, mapping it again would turn every value into NaN.
if df['labels'].isin(list(label_map)).all():
    df['labels'] = df['labels'].map(label_map)
# Distinct class ids in order of first appearance; used as the default class
# list when decoding predictions.
oglist = list(df.labels.unique())

In [7]:
import nlpaug.augmenter.word.context_word_embs as aug

In [8]:
# Contextual word-embedding augmenter configured to insert words, using 'bert-base-uncased'.
# NOTE(review): `augmenter` is not referenced again in this notebook, and it uses the
# uncased checkpoint while the tokenizer/model use 'bert-base-cased' — confirm it is still needed.
augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

In [9]:
# Tokenizer for the 'bert-base-cased' checkpoint; the encoder loaded later uses the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Tokenize the first tweet on its own: special tokens added, padded/truncated
# to 256 positions, returned as TensorFlow tensors.
token = tokenizer.encode_plus(
    df['tweets'].iloc[0],
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)

2023-09-26 11:34:03.612909: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-09-26 11:34:03.612946: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-09-26 11:34:03.612960: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-09-26 11:34:03.613028: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-09-26 11:34:03.613065: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
# Pre-allocated buffers for the tokenized corpus: one row per tweet, 256 token
# positions per row. Stored as int32 because token ids and attention-mask flags
# are integers and the model's input layers are declared with dtype int32.
X_input_ids = np.zeros((len(df), 256), dtype='int32')
X_attn_masks = np.zeros((len(df), 256), dtype='int32')

In [12]:
# Sanity check: (number of tweets, 256).
X_input_ids.shape

(5000, 256)

In [13]:
def generate_training_data(df, ids, masks, tokenizer, max_length=None):
    """Tokenize every tweet in ``df`` and write the results into ``ids`` / ``masks``.

    Args:
        df: DataFrame with a 'tweets' column of raw text.
        ids: Pre-allocated 2-D array; row ``i`` receives the token ids of tweet ``i``.
        masks: Pre-allocated 2-D array; row ``i`` receives the attention mask of tweet ``i``.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes a ``BertTokenizer``).
        max_length: Length every sequence is padded/truncated to. Defaults to
            the width of ``ids`` so the tokenizer output always fits the buffer.

    Returns:
        The same ``ids`` and ``masks`` arrays, filled in place.
    """
    if max_length is None:
        max_length = ids.shape[1]
    # Wrap the column itself (not the enumerate generator) so tqdm can read its
    # length and show a real progress bar instead of a bare iteration counter.
    for i, text in enumerate(tqdm(df['tweets'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [16]:
 # Fill the pre-allocated buffers with the token ids and attention masks of every tweet.
 X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [17]:
# Target buffer: one row per tweet, one column per distinct value of the label column.
labels = np.zeros((len(df), len(df.labels.unique())))
labels.shape

(5000, 3)

In [18]:
# For each row i, set the column given by that row's integer label to 1.
labels[np.arange(len(df)), df['labels'].values] = 1 # one-hot encoded target tensor

In [19]:
# Inspect the one-hot target matrix.
labels

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [20]:
# Slice the three arrays row-wise: each element is (input ids, attention mask, one-hot label) for one tweet.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [21]:
def LabelDatasetMapFunction(input_ids, attn_masks, labels):
    """Repackage one dataset element as a ``(features, labels)`` pair.

    The features dict is keyed by 'input_ids' and 'attention_mask';
    ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [22]:
# Convert each (ids, mask, label) triple into ({'input_ids', 'attention_mask'}, label).
dataset = dataset.map(LabelDatasetMapFunction)

In [23]:
# Shuffle once, then batch. `reshuffle_each_iteration=False` pins the shuffled
# order so the later take/skip split yields the same train and validation
# batches on every epoch. With the default (reshuffle on every iteration),
# examples move between the two splits from one epoch to the next and the
# validation metrics are contaminated by data the model has trained on.
# The training order is consequently fixed as well; reshuffle the training
# split separately if per-epoch shuffling is wanted.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(3, drop_remainder=True)

In [24]:
# Fraction of the batches taken for training; the remaining batches go to validation.
p = 0.8
# len(df) // 3 is the number of full batches (batch size 3, remainder dropped).
train_size = int(p * (len(df) // 3))

In [25]:
# Number of batches assigned to the training split.
train_size

1332

In [26]:
# Split by batch count: batches after the first `train_size` are held out for
# validation; the first `train_size` batches are used for training.
val_dataset = dataset.skip(train_size)
train_dataset = dataset.take(train_size)

In [27]:
from transformers import TFBertModel

In [28]:
# Pre-trained TF BERT encoder for 'bert-base-cased' (same checkpoint name as the tokenizer).
model = TFBertModel.from_pretrained('bert-base-cased')

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [29]:
# Classification head on top of BERT: two int32 inputs of length 256 feed the
# encoder; its pooled output goes through a 512-unit ReLU layer and a softmax
# layer with one unit per distinct label value.
# The input names match the keys of the feature dict produced by the dataset map function.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(len(df.labels.unique()), activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

# Wrap inputs and output into a single Keras model and print its layer summary.
tickets_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
tickets_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                       

In [30]:
# Training configuration for one-hot targets: categorical cross-entropy loss,
# categorical accuracy reported as 'accuracy', and Adam with learning rate 1e-5.
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
optim



<keras.src.optimizers.adam.Adam at 0x30b2dfdf0>

In [31]:
# Alternative Adam optimizer driven by an exponential-decay schedule
# (initial rate 0.01, decay rate 0.9 per 10000 steps).
# NOTE(review): `optimizer` is not the object passed to `compile` in this notebook
# (that call uses `optim`), so this cell appears to have no effect on training — confirm intent.
# NOTE(review): tensorflow is already imported as `tf` in the imports cell.
import tensorflow as tf
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=10000,
    decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)



In [32]:
# Compile with the fixed-rate Adam optimizer (`optim`), the cross-entropy loss and the accuracy metric.
tickets_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])




In [33]:
# Train for 5 epochs on the training split, with the held-out split passed as
# validation data; `hist` keeps the object returned by `fit`.
hist = tickets_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

Epoch 1/5


2023-09-26 11:34:57.776387: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-09-26 11:40:54.194428: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
# Persist the trained model under the 'tickets_model' path.
tickets_model.save('tickets_model')

INFO:tensorflow:Assets written to: tickets_model/assets


INFO:tensorflow:Assets written to: tickets_model/assets


In [35]:
# Rebuild the inference objects: the 'bert-base-cased' tokenizer and the model
# previously saved under 'tickets_model'.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

tickets_model = tf.keras.models.load_model('tickets_model')

def prepare_data(input_text, tokenizer, max_length=256):
    """Tokenize one text into the feature dict consumed by the classifier.

    Args:
        input_text: Raw text to classify.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes a ``BertTokenizer``).
        max_length: Length the sequence is padded/truncated to. The default
            (256) matches the input length the model in this notebook was built with.

    Returns:
        Dict with 'input_ids' and 'attention_mask' int32 tensors.
    """
    token = tokenizer.encode_plus(
        input_text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    # Token ids and mask flags are integers and the model's input layers are
    # declared as int32, so keep them integral instead of converting to float64.
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }

def make_prediction(model, processed_data, classes=oglist):
    """Return the entry of ``classes`` at the position of the highest score.

    Only the first row of ``model.predict(processed_data)`` is considered.
    """
    first_row = model.predict(processed_data)[0]
    best_index = np.argmax(first_row)
    return classes[best_index]

In [37]:
# Interactive inference: read a text from the user, tokenize it, predict the
# class id and translate it back to the label name via `reverse_map`.
# NOTE(review): the "Predicted Service" wording looks like a leftover from another
# project; the labels here are sentiments — confirm before changing the message.
input_text = input('Enter gpt sentiment analysis here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(tickets_model, processed_data=processed_data)
print(f"Predicted Service: {reverse_map[result]}")

2023-09-26 12:09:36.905797: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Predicted Service: bad


In [40]:
# Show the class ids used as the default `classes` list of `make_prediction`.
print(oglist)

[0, 1, 2]
