In [None]:
import tensorflow as tf
import pandas as pd
import re
import nltk

In [None]:
# TPU bootstrap. The order of these four calls matters: resolve the cluster,
# connect to it, initialise the TPU system, then build the strategy on top.
# NOTE(review): TPUClusterResolver() is called without arguments, so it
# presumably relies on the runtime environment to locate the TPU - confirm.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# `strategy` is used later to create and compile the model inside its scope.
strategy = tf.distribute.TPUStrategy(tpu)

In [None]:
# Load the raw spreadsheet; later cells read its 'Comment' and 'Sentiment' columns.
df = pd.read_excel('Comments_all_bank.xlsx')

In [None]:
def handle_mistype(sentiment):
    """Normalise misspelled or abbreviated sentiment labels.

    Every word that starts with ``neu``, ``pos`` or ``neg`` (case-insensitive)
    is replaced by the canonical label ``'Neutral'``, ``'Positive'`` or
    ``'Negative'``. The label is chosen from the prefix that actually matched,
    not from a substring found anywhere in the word. Leading and trailing
    whitespace is stripped so that values such as ``' Positive'`` still equal
    the canonical label afterwards.

    Args:
        sentiment: Raw cell value. Anything that is not a ``str`` (numbers,
            ``None``, NaN) is returned unchanged.

    Returns:
        The normalised string, or the original value for non-strings.
    """
    if not isinstance(sentiment, str):
        return sentiment
    # One capture group per prefix, in the same order as `canonical`, so
    # match.lastindex identifies which prefix matched without inspecting the
    # matched text again.
    canonical = ('Neutral', 'Positive', 'Negative')
    pattern = r'(?i)\b(?:(neu)|(pos)|(neg))\w*'
    return re.sub(pattern,
                  lambda match: canonical[match.lastindex - 1],
                  sentiment.strip())

# Normalise every raw label in place; non-string cells pass through unchanged.
df['Sentiment'] = df['Sentiment'].apply(handle_mistype)

In [None]:
# Canonical label name -> integer class id used as the training target.
sentiment_mapping = {
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
}

In [None]:
# Swap the canonical label strings for their integer ids; values that are not
# keys of sentiment_mapping are left as they are by this step.
df['Sentiment'] = df['Sentiment'].replace(sentiment_mapping)

In [None]:
# Any value still present that is not one of the class ids (0/1/2) is replaced
# by None so that the following dropna() can discard those rows.
df['Sentiment'] = df['Sentiment'].replace({value: None for value in df['Sentiment'].unique()
                                           if value not in sentiment_mapping.values()})

In [None]:
# Keep only rows that have both a comment and a (valid) sentiment id.
df = df.dropna(subset = ['Comment','Sentiment'])

In [None]:
# Raw comments as a Python list; labels stay a pandas Series at this point.
texts = df['Comment'].tolist()
labels = df['Sentiment']

In [None]:
def cleaned_text(text):
    """Strip symbols and special characters from a comment.

    Keeps ASCII letters and digits, the listed non-ASCII letters
    (ğ ö ə ı ş ç ü and their capitals) and whitespace; every other character
    is removed.

    Args:
        text: The comment. Non-string values are converted with ``str()``.

    Returns:
        The comment with all disallowed characters deleted.
    """
    text = str(text)
    # The character set must be a *negated class* ([^...]). Without the
    # brackets the pattern is a literal string anchored at the start of the
    # text and nothing is ever removed.
    # Capital dotted 'İ' is listed as well: it is the upper-case partner of
    # 'i', and would otherwise be deleted as a "special character".
    text = re.sub(r"[^a-zA-Z0-9ğöəışçüĞÖƏIİŞÇÜ\s]", "", text)
    return text

# Run the text cleaner over every comment.
cleaned_texts = list(map(cleaned_text, texts))

# Store the cleaned comments back on the frame, then refresh `texts` from it
# so the list and the column agree.
df['Comment'] = cleaned_texts
texts = df['Comment'].tolist()

In [None]:
def cleaned_sentiment(label):
    """Reduce a sentiment label to its digits.

    Args:
        label: The label value; converted with ``str()`` first.

    Returns:
        A string containing only the digits of the label, suitable for
        ``int()``. A whole-number float such as ``2.0`` yields ``"2"``.
    """
    label = str(label)
    # A label that went through a float column prints as "2.0". Deleting the
    # dot naively would turn it into "20", so take the integer part instead.
    whole_float = re.fullmatch(r"\s*([0-9]+)\.0*\s*", label)
    if whole_float:
        return whole_float.group(1)
    # Negated class: remove everything that is not a digit. Without the
    # brackets the pattern is a literal anchored string and removes nothing.
    label = re.sub(r"[^0-9]", "", label)
    return label
# Clean each label and convert the resulting digit string to an int.
cleaned_labels = [int(digits) for digits in map(cleaned_sentiment, labels)]

# Write the integer labels back to the frame and re-read them as a Series.
df['Sentiment'] = cleaned_labels
labels = df['Sentiment']

In [None]:
# Sanity check: number of missing values per column after cleaning.
df.isnull().sum()

Comment      0
Sentiment    0
dtype: int64

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.5 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import TFGPT2ForSequenceClassification , GPT2Tokenizer

In [None]:
# Load the tokenizer that belongs to the pretrained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# The texts are tokenized with padding=True later on, which needs a pad token;
# the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Plain Python list of the integer labels, aligned with `texts`, for splitting.
labels = df['Sentiment'].tolist()

In [None]:
# First cut: 80% training data, 20% held out.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Second cut: divide the held-out part evenly into validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Tokenize the three splits with identical settings; each split is padded to
# the length of its own longest sequence and returned as TensorFlow tensors.
train_inputs, val_inputs, test_inputs = (
    tokenizer(split_texts, padding=True, truncation=True, return_tensors="tf")
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
# Pull the token ids and attention masks out of each tokenizer result.
train_input_ids, train_attention_masks = train_inputs['input_ids'], train_inputs['attention_mask']
val_input_ids, val_attention_masks = val_inputs['input_ids'], val_inputs['attention_mask']
test_input_ids, test_attention_masks = test_inputs['input_ids'], test_inputs['attention_mask']

# Turn the three Python label lists into constant tensors.
train_labels, val_labels, test_labels = map(
    tf.constant, (train_labels, val_labels, test_labels)
)

In [None]:
# Each dataset element must be (features, label). Keras reads a 3-tuple as
# (inputs, targets, sample_weights), so a flat (ids, mask, labels) tuple would
# make the attention mask the training target and the labels the sample
# weights. Both model inputs are therefore packed into one feature dict keyed
# by the model's argument names.
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': train_input_ids, 'attention_mask': train_attention_masks},
    train_labels,
)).shuffle(1000).batch(64)
val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': val_input_ids, 'attention_mask': val_attention_masks},
    val_labels,
)).batch(64)
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_input_ids, 'attention_mask': test_attention_masks},
    test_labels,
)).batch(64)

In [None]:
# model = TFGPT2ForSequenceClassification.from_pretrained("gpt2", num_labels = 3)

In [None]:
# Diagnostic only: print the devices the local TensorFlow runtime reports.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10039341110625147071
xla_global_id: -1
]


In [None]:
# Create and compile the model inside the strategy scope so that its variables
# are placed according to the distribution strategy.
with strategy.scope():
  model = TFGPT2ForSequenceClassification.from_pretrained("gpt2", num_labels = 3)
  # The GPT-2 classification head classifies from the last non-padding token of
  # each sequence, which it can only locate when the config knows the pad id.
  # Without it the last position is used, i.e. a padding token for every
  # sequence shorter than the batch maximum.
  model.config.pad_token_id = tokenizer.pad_token_id
  # The model outputs raw logits, hence from_logits=True.
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
                weighted_metrics = [])

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune for 5 epochs, evaluating on the validation set after each epoch.
model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x799406c97f10>

In [None]:
# evaluate() returns the loss followed by the compiled metrics; the model was
# compiled with a single 'accuracy' metric, hence the two-value unpacking.
test_loss, test_accuracy = model.evaluate(test_dataset)

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 3.924664270016365e-05
Test Accuracy: 1.0
