In [15]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [16]:
# ─── 1) TEXT CLEANER
def clean_text(text):
    """Normalise a raw post into lowercase, space-separated tokens.

    The input is lowercased, anything that looks like a URL (``http...`` or
    ``www....``) is removed, every character that is not a lowercase ASCII
    letter, digit, whitespace or apostrophe becomes a space, and runs of
    whitespace are squeezed to a single space with no leading/trailing blanks.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = text.lower()
    # Order matters: URLs must go before punctuation is blanked out,
    # and whitespace is collapsed last.
    substitutions = (
        (r"http\S+|www\.\S+", ""),
        (r"[^a-z0-9\s']", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
# ─── 2) LOAD & PREPARE LABELED DATA
# Column order defines the integer class ids: 0, 1, 2.
LABEL_COLS = ['Appreciation', 'Complaint', 'Feedback']

labeled = pd.read_csv("FB_posts_labeled.txt", sep="\t")
# fillna("") before astype(str): on its own, astype(str) turns a missing
# message into the literal text "nan", which would be trained on as a post.
labeled['msg'] = labeled['message'].fillna("").astype(str).map(clean_text)

# one-hot → single label
label_matrix = labeled[LABEL_COLS].values
# argmax returns the first maximal column, so all-zero or multi-label rows
# would silently get a class. Report them instead of hiding them.
n_ambiguous = int((label_matrix.sum(axis=1) != 1).sum())
if n_ambiguous:
    print(
        f"Warning: {n_ambiguous} labeled rows do not have exactly one positive "
        "label; argmax assigns them the first maximal column."
    )
labeled['label'] = label_matrix.argmax(axis=1)

In [18]:
# ─── 3) SPLIT TRAIN / VAL
# Hold out 20% of the labeled rows for validation. The split is stratified on
# the integer label, and the fixed random_state makes it repeatable.
train_df, val_df = train_test_split(
    labeled,
    stratify=labeled['label'],
    test_size=0.2,
    random_state=42,
)

In [19]:
# ─── 4) TOKENIZER
# Hub identifier shared by the tokenizer and the classification model.
MODEL_ID = "vinai/bertweet-base"

# use_fast=False asks for the pure-Python tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=False,
)

def tokenize_texts(texts, max_len=128):
    """Tokenize a batch of strings into fixed-length TensorFlow tensors.

    Args:
        texts: The strings to encode. Objects exposing ``.tolist()`` (pandas
            Series, NumPy arrays) are converted with that method, exactly as
            before. Any other iterable of strings (list, tuple, generator) is
            materialised with ``list()``, and a single ``str`` is treated as a
            batch of one.
        max_len: Every sequence is padded or truncated to exactly this many
            tokens.

    Returns:
        The encoding produced by the module-level ``tokenizer`` with
        ``return_tensors='tf'``.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        batch = texts.tolist()
    else:
        batch = list(texts)
    return tokenizer(
        batch,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='tf'
    )

In [20]:
# ─── 5) BUILD TF.DATA.DATASET
def make_dataset(df, batch_size=16, shuffle=False, seed=None):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        df: DataFrame with a ``msg`` column (text passed to
            ``tokenize_texts``) and a ``label`` column (class ids).
        batch_size: Number of examples per batch.
        shuffle: When True, shuffle the examples with a buffer that covers
            the whole frame before batching.
        seed: Optional seed for the shuffle, for a reproducible shuffle
            order. ``None`` (the default) keeps the previous unseeded
            behaviour.

    Returns:
        A dataset yielding ``({'input_ids': ..., 'attention_mask': ...},
        labels)`` batches.
    """
    toks = tokenize_texts(df['msg'])
    labels = tf.convert_to_tensor(df['label'].values)
    features = {
        'input_ids': toks['input_ids'],
        'attention_mask': toks['attention_mask'],
    }
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # The shuffle buffer size must be positive, even for an empty frame.
        ds = ds.shuffle(max(len(df), 1), seed=seed)
    # Prefetch overlaps input preparation with the training step and does not
    # change element order or contents.
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Training batches are shuffled. Validation stays in frame order (shuffle off)
# so labels read back from val_ds line up with predictions made on it.
train_ds = make_dataset(df=train_df, batch_size=16, shuffle=True)
val_ds = make_dataset(df=val_df, batch_size=32, shuffle=False)

In [21]:
# ─── 6) LOAD TF MODEL
# Use the first GPU only when TensorFlow can see one; otherwise fall back to
# the CPU so this cell runs unmodified on CPU-only machines.
device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
with tf.device(device):
    model = TFAutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        num_labels=3  # Appreciation / Complaint / Feedback
    )

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# ─── 7) COMPILE & TRAIN
# Fine-tuning setup: Adam with a small learning rate. The loss takes the
# model's raw logits (from_logits=True) together with integer class ids.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)
model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7e23c45949d0>

In [23]:
# ─── 8) EVALUATE F1 ON VAL
# Ground truth is read back from val_ds itself. It was built without
# shuffling, so iteration order matches the order predict() walks the batches.
true_batches = [batch_labels for _, batch_labels in val_ds]
y_true = np.concatenate(true_batches, axis=0)

logits = model.predict(val_ds)['logits']
y_pred = np.argmax(logits, axis=1)

macro_f1 = f1_score(y_true, y_pred, average='macro')
print("Validation macro-F1:", macro_f1)

Validation macro-F1: 0.885706774752427


In [26]:
# ─── 9) RETRAIN ON FULL LABELED SET
full_ds = make_dataset(labeled, batch_size=16, shuffle=True)

# Start again from the pretrained checkpoint. Calling fit() on the existing
# `model` would continue from weights that already had 10 epochs on the
# training split, so the final model would not follow the schedule that the
# validation F1 was measured for.
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=3
)
# A new model needs a new optimizer: the previous one holds state tied to the
# old model's variables.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
model.fit(full_ds, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7e220cd72650>

In [None]:
# ─── 10) PREDICT UNLABELED
unlabeled = pd.read_csv("FB_posts_unlabeled.txt", sep="\t")
# fillna("") before astype(str): otherwise a missing message becomes the
# literal text "nan" and is classified as if someone had written it.
unlabeled['msg'] = unlabeled['message'].fillna("").astype(str).map(clean_text)
un_toks = tokenize_texts(unlabeled['msg'])
un_ds = tf.data.Dataset.from_tensor_slices({
    'input_ids':     un_toks['input_ids'],
    'attention_mask':un_toks['attention_mask']
}).batch(32)

logits = model.predict(un_ds)['logits']
labels = np.argmax(logits, axis=1)

# one-hot encode: row i of the identity matrix is the one-hot vector for
# class i. The width comes from the logits rather than a hard-coded 3.
one_hot = np.eye(logits.shape[1], dtype=int)[labels]

# ─── 11) SAVE PREDICTIONS
# One output column per class, in the same order as the one-hot columns.
pred_columns = ['Appreciation_pred', 'Complaint_pred', 'Feedback_pred']

frame_data = {'postId': unlabeled['postId']}
for class_idx, column_name in enumerate(pred_columns):
    frame_data[column_name] = one_hot[:, class_idx]

out = pd.DataFrame(frame_data)
out.to_csv("predictions.csv", index=False)
print("Wrote predictions.csv with", len(out), "rows.")

Wrote predictions.csv with 2039 rows.
