In [None]:
import tensorflow as tf
import pandas as pd
from transformers import BertTokenizer , TFBertForSequenceClassification
import re
import nltk

In [None]:
# Resolve the TPU cluster. No address is passed, so the resolver has to pick
# it up from the runtime environment.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# Connect to the resolved cluster and initialise the TPU system before a
# strategy is built on top of it.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# Distribution strategy; the model is later created and compiled inside
# `strategy.scope()`.
strategy = tf.distribute.TPUStrategy(tpu)

In [None]:
# Load the raw comments workbook (path is relative to the working directory).
# Later cells read its 'Comment' and 'Sentiment' columns.
df = pd.read_excel('Comments_all_bank.xlsx')

In [None]:
def handle_mistype(sentiment):
    """Normalise misspelled or abbreviated sentiment labels.

    Every word that starts with ``neu``, ``pos`` or ``neg`` (case-insensitive)
    is replaced by the canonical label ``'Neutral'``, ``'Positive'`` or
    ``'Negative'``. The label is chosen from the prefix that triggered the
    match, not from a substring found anywhere in the word, and leading or
    trailing whitespace is removed so that values such as ``'Positive '``
    still equal the canonical label.

    Args:
        sentiment: Raw cell value. Anything that is not a ``str`` (numbers,
            NaN, ``None``) is returned unchanged.

    Returns:
        The normalised string, or ``sentiment`` itself when it is not a string.
    """
    if not isinstance(sentiment, str):
        return sentiment

    canonical = {'neu': 'Neutral', 'pos': 'Positive', 'neg': 'Negative'}
    # `\w*` swallows the rest of the word, so the three-letter prefix is enough
    # to match both full labels and truncated or misspelled variants.
    pattern = r'(?i)\b(neu|pos|neg)\w*\b'

    def _canonical_label(match):
        # Fall back to the matched text if case-insensitive matching accepted
        # a character that does not lower-case to the plain ASCII prefix.
        return canonical.get(match.group(1).lower(), match.group(0))

    handled_sentiment = re.sub(pattern, _canonical_label, sentiment)
    return handled_sentiment.strip()

# Run the label normaliser over every value of the 'Sentiment' column.
df['Sentiment'] = df['Sentiment'].map(handle_mistype)

In [None]:
# Canonical sentiment label -> integer class id.
sentiment_mapping = {
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
}

In [None]:
# Swap the canonical label strings for their integer class ids. Values that
# are not keys of `sentiment_mapping` are left as they are.
df['Sentiment'] = df['Sentiment'].replace(sentiment_mapping)

In [None]:
# Any value that is still not one of the integer class ids is replaced by
# None so that the following dropna step can discard the row.
valid_ids = sentiment_mapping.values()
unexpected_values = {
    value: None
    for value in df['Sentiment'].unique()
    if value not in valid_ids
}
df['Sentiment'] = df['Sentiment'].replace(unexpected_values)


In [None]:
# Keep only rows that have both a comment and a sentiment value.
required_columns = ['Comment', 'Sentiment']
df = df.dropna(subset=required_columns)

In [None]:
# Pull both columns out of the frame as plain Python lists.
texts, labels = (df[column].tolist() for column in ('Comment', 'Sentiment'))

In [None]:
def cleaned_text(text):
    """Remove symbols and special characters from a comment.

    ASCII letters and digits, the Azerbaijani letters listed in the character
    class, and whitespace are kept; every other character (punctuation,
    emoji, decimal points, ...) is deleted.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string input is accepted.

    Returns:
        The cleaned string.
    """
    text = str(text)
    # The brackets make this a negated character class. Without them the
    # pattern is a literal anchored at the start of the string and removes
    # nothing. The capital dotted "İ" is part of the Azerbaijani alphabet and
    # is listed explicitly because A-Z does not cover it.
    text = re.sub(r"[^a-zA-Z0-9ğöəışçüĞÖƏIİŞÇÜ\s]", "", text)
    return text

# Clean every comment, write the result back to the frame, then refresh
# `texts` from the updated column.
cleaned_texts = list(map(cleaned_text, texts))

df['Comment'] = cleaned_texts
texts = df['Comment'].tolist()

In [None]:
def cleaned_sentiment(label):
    """Reduce a sentiment label to its digits.

    Args:
        label: Class id given as an int, a float or a string.

    Returns:
        A string that contains only the ASCII digits found in ``label``.
    """
    # Whole-number floats such as 1.0 are converted first; otherwise removing
    # the decimal point would turn "1.0" into "10".
    if isinstance(label, float) and label.is_integer():
        label = int(label)
    label = str(label)
    # The brackets make this a negated character class; the bare "^0-9" is a
    # literal anchored at the start of the string and removes nothing.
    label = re.sub(r"[^0-9]", "", label)
    return label
# Turn every label into an int, write the result back to the frame, then
# refresh `labels` from the updated column.
cleaned_labels = [int(digits) for digits in map(cleaned_sentiment, labels)]

df['Sentiment'] = cleaned_labels
labels = df['Sentiment'].tolist()

In [None]:
# Sanity check: number of missing values per column.
df.isna().sum()

Comment      0
Sentiment    0
dtype: int64

In [None]:
# Re-read the labels from the frame so that `labels` reflects the current
# 'Sentiment' column.
labels = df['Sentiment'].tolist()

In [None]:
# Preview the first rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Comment,Sentiment
0,Men 439 rubl depozitim olubdu,1
1,Ele sey olur,1
2,Hele 100 dende cox,1
3,100 rubl elave,1
4,Rubl demek istirsizsiz rubl 0.28 gepik deyer...,1


In [None]:

# Load the tokenizer of the multilingual cased BERT checkpoint; the model is
# later loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Tokenise all comments in a single call, with padding and truncation
# enabled, and get the result back as TensorFlow tensors.
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="tf",
)

In [None]:
# The two tensors that are fed to the model.
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

In [None]:
from tensorflow.keras.optimizers.legacy import Adam

# The model, loss, metric and optimizer are all created inside the
# distribution strategy's scope, then compiled there.
with strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=3,  # one output per sentiment class id
    )
    # Integer class ids as targets; the loss is configured for raw logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # Reported as 'accuracy', and as 'val_accuracy' on validation data.
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fractions of the data used for training, validation and testing.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

In [None]:
# Absolute split sizes. int() truncates, so rows left over after the training
# and validation splits fall into the test split, which takes the remainder.
num_samples = len(texts)
num_train_samples, num_val_samples = (
    int(ratio * num_samples) for ratio in (train_ratio, val_ratio)
)

In [None]:
# Training split: the first `num_train_samples` rows. The same bounds are
# applied to ids, mask and labels so the three stay aligned.
train_rows = slice(None, num_train_samples)
train_input_ids = input_ids[train_rows]
train_attention_mask = attention_mask[train_rows]
train_labels = labels[train_rows]

In [None]:
# Validation split: the `num_val_samples` rows that follow the training rows.
val_rows = slice(num_train_samples, num_train_samples + num_val_samples)
val_input_ids = input_ids[val_rows]
val_attention_mask = attention_mask[val_rows]
val_labels = labels[val_rows]

In [None]:
# Test split: everything after the training and validation rows.
test_rows = slice(num_train_samples + num_val_samples, None)
test_input_ids = input_ids[test_rows]
test_attention_mask = attention_mask[test_rows]
test_labels = labels[test_rows]

In [None]:
# Convert the three label lists into int32 tensors.
train_labels, val_labels, test_labels = (
    tf.constant(split_labels, dtype=tf.int32)
    for split_labels in (train_labels, val_labels, test_labels)
)

In [None]:
# Stop training once validation accuracy has stopped improving.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',  # validation value of the metric named 'accuracy' at compile time
    patience=10,  # epochs without improvement that are tolerated before stopping
    restore_best_weights = True  # return to the weights of the best epoch
)

In [None]:
# Batch size passed to both model.fit and model.evaluate.
batch_size = 64

In [None]:
# Fine-tune on the training split and validate after every epoch; the
# early-stopping callback decides when training ends (at most 100 epochs).
train_inputs = [train_input_ids, train_attention_mask]
val_data = ([val_input_ids, val_attention_mask], val_labels)

history = model.fit(
    train_inputs,
    train_labels,
    validation_data=val_data,
    epochs=100,
    batch_size=batch_size,
    callbacks=[early_stopping_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


In [None]:
# Score the trained model on the held-out test split; evaluate returns the
# loss followed by the compiled metric.
test_inputs = [test_input_ids, test_attention_mask]
test_loss, test_accuracy = model.evaluate(test_inputs, test_labels, batch_size=batch_size)

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.3999171853065491
Test Accuracy: 0.9091544151306152
