In [17]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.metrics import classification_report

In [18]:
# Read both CSV splits and discard every row that contains a missing value.
# Chaining .dropna() yields the same frames as an in-place drop, but the cell
# can be re-run without depending on previously mutated state.
train = pd.read_csv("/content/train.csv").dropna()
test = pd.read_csv("/content/test.csv").dropna()


In [19]:
# Peek at the first five training rows (five is head()'s default).
train.head(n=5)

Unnamed: 0,cleaned_text,target
0,wondered around surry hills ages today nat bou...,0
1,jus skinned knee wat hurts lyke hell tho alway...,0
2,really really miss boyfriend,0
3,firstlisting thank robert,1
4,today going good day im happy person,1


In [20]:
# Peek at the first five test rows (five is head()'s default).
test.head(n=5)

Unnamed: 0,cleaned_text,target
0,teeniewahine im sorry hope day gets better,0
1,feel ill shouldnt eat currychickenas reheat gt...,0
2,erikw like moonshine also want send,1
3,therefortmrw one week tomorrow thats whats goo...,1
4,goodnight twitterland time go bed tomorrow gre...,1


In [21]:
# (rows, columns) of the train and test frames, shown as a pair.
(train.shape, test.shape)

((1246623, 2), (155830, 2))

In [22]:
# Fraction of each split that is kept for this run. It is a single constant so
# the train and test subsets cannot drift apart when the value is tuned.
SAMPLE_FRACTION = 0.05

# NOTE(review): this keeps the *leading* rows of each frame, not a random
# sample - fine if the CSVs are already shuffled; confirm against the data prep.
n_train_rows = int(len(train) * SAMPLE_FRACTION)
n_test_rows = int(len(test) * SAMPLE_FRACTION)

# Slice the frames first and convert afterwards, so only the kept rows are
# materialised as Python lists (instead of the full columns).
train_subset = train.iloc[:n_train_rows]
test_subset = test.iloc[:n_test_rows]

X_train, y_train = train_subset['cleaned_text'].tolist(), train_subset['target'].tolist()
X_test, y_test = test_subset['cleaned_text'].tolist(), test_subset['target'].tolist()

In [23]:
# Checkpoint identifier; the tokenizer below is loaded from this same name.
MODEL_NAME = 'distilbert-base-uncased'

# Fast tokenizer loaded from the checkpoint named above.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    MODEL_NAME,
)

In [24]:
# Both splits are tokenized with the same settings: sequences are truncated
# to at most 128 tokens and padding is enabled.
_tokenizer_options = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(X_train, **_tokenizer_options)
test_encodings = tokenizer(X_test, **_tokenizer_options)


In [25]:
# One batch size for both pipelines, and a fixed seed so the shuffle order is
# reproducible across kernel restarts.
BATCH_SIZE = 8
SHUFFLE_SEED = 42

train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
    # The buffer covers the whole training subset: tf.data only shuffles
    # within its buffer, so a buffer smaller than the dataset gives a partial
    # shuffle. max(..., 1) keeps the call valid for an empty subset.
    .shuffle(buffer_size=max(len(y_train), 1), seed=SHUFFLE_SEED)
    .batch(BATCH_SIZE)
)

# The evaluation pipeline is deliberately NOT shuffled, so predictions made
# from it stay aligned with the order of y_test.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))
    .batch(BATCH_SIZE)
)


In [26]:
# DistilBERT with a two-label sequence-classification head; from_pt=True
# loads the weights from the PyTorch checkpoint.
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    from_pt=True,
    num_labels=2,
)

# Sparse cross-entropy on raw logits, optimised with Adam at 5e-5.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'cla

In [27]:
# Fine-tune for five epochs. The returned History is kept so the per-epoch
# loss/accuracy values remain available (history.history) instead of being
# thrown away and echoed as an opaque object repr.
# NOTE(review): validation_data is the test split, so the validation metrics
# reported here are not an independent hold-out from the final evaluation.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7f8584bd2ae0>

In [28]:
# Score the test batches, pick the higher-logit class per row, and print
# per-class precision / recall / F1 against the true labels.
y_pred_logits = model.predict(test_dataset).logits
y_pred = np.argmax(y_pred_logits, axis=-1)

report = classification_report(
    y_test,
    y_pred,
    target_names=["Negative", "Positive"],
)
print(report)


              precision    recall  f1-score   support

    Negative       0.73      0.79      0.76      3836
    Positive       0.78      0.72      0.75      3955

    accuracy                           0.76      7791
   macro avg       0.76      0.76      0.76      7791
weighted avg       0.76      0.76      0.76      7791

