In [5]:
import pandas as pd

train_df = pd.read_csv("tweet_eval_train.csv")
val_df   = pd.read_csv("tweet_eval_validation.csv")
test_df  = pd.read_csv("tweet_eval_test.csv")


In [6]:
def minimal_clean(df):
    df = df.dropna(subset=["text"])
    df["clean_text"] = df["text"].str.strip().str.lower()
    return df


In [7]:
train_df = minimal_clean(train_df)
val_df   = minimal_clean(val_df)
test_df  = minimal_clean(test_df)

In [8]:
print(train_df[["text", "clean_text"]].head())
print(val_df[["text", "clean_text"]].head())
print(test_df[["text", "clean_text"]].head())

                                                text  \
0  "QT @user In the original draft of the 7th boo...   
1  "Ben Smith / Smith (concussion) remains out of...   
2  Sorry bout the stream last night I crashed out...   
3  Chase Headley's RBI double in the 8th inning o...   
4  @user Alciato: Bee will invest 150 million in ...   

                                          clean_text  
0  "qt @user in the original draft of the 7th boo...  
1  "ben smith / smith (concussion) remains out of...  
2  sorry bout the stream last night i crashed out...  
3  chase headley's rbi double in the 8th inning o...  
4  @user alciato: bee will invest 150 million in ...  
                                                text  \
0  Dark Souls 3 April Launch Date Confirmed With ...   
1  "National hot dog day, national tequila day, t...   
2  When girls become bandwagon fans of the Packer...   
3  @user I may or may not have searched it up on ...   
4  Here's your starting TUESDAY MORNING Line up a... 

In [9]:
print(train_df.columns)

Index(['text', 'label', 'clean_text'], dtype='object')


In [10]:
train_df["label"].value_counts(normalize=True)*100

label
1    45.320618
2    39.129672
0    15.549710
Name: proportion, dtype: float64

In [11]:
val_df["label"].value_counts(normalize=True)*100

label
1    43.45
2    40.95
0    15.60
Name: proportion, dtype: float64

In [12]:
test_df["label"].value_counts(normalize=True)*100

label
1    48.331162
0    32.334744
2    19.334093
Name: proportion, dtype: float64

In [13]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [14]:
sample_text = train_df["clean_text"].iloc[0]
print("Text:",sample_text)
print("Tokens:",tokenizer.tokenize(sample_text))

Text: "qt @user in the original draft of the 7th book, remus lupin survived the battle of hogwarts. #happybirthdayremuslupin"
Tokens: ['"', 'q', '##t', '@', 'user', 'in', 'the', 'original', 'draft', 'of', 'the', '7th', 'book', ',', 're', '##mus', 'lu', '##pin', 'survived', 'the', 'battle', 'of', 'hog', '##wart', '##s', '.', '#', 'happy', '##bir', '##th', '##day', '##rem', '##us', '##lu', '##pin', '"']


In [15]:
encoded = tokenizer(
    sample_text,
    padding = "max_length",
    truncation = True,
    max_length = 20,
    return_tensors = "pt"
)
encoded

{'input_ids': tensor([[  101,  1000,  1053,  2102,  1030,  5310,  1999,  1996,  2434,  4433,
          1997,  1996,  5504,  2338,  1010,  2128,  7606, 11320,  8091,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
encoded["input_ids"]
encoded["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [22]:
MAX_LEN=128

train_encodings = tokenizer(
    train_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)
val_encodings = tokenizer(
    val_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)
test_encodings= tokenizer(
    test_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)


In [23]:
train_labels = train_df["label"].tolist()
val_labels = val_df["label"].tolist()
test_labels = test_df["label"].tolist()

In [34]:
import torch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [35]:
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

In [36]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
sample = train_dataset[0]

with torch.no_grad():
    output = model(
        input_ids = sample["input_ids"].unsqueeze(0),
        attention_mask = sample["attention_mask"].unsqueeze(0)
    )
output.logits

tensor([[-0.0884,  0.0145, -0.0277]])

In [38]:
tokenizer.save_pretrained("save_tokenizer")

('save_tokenizer\\tokenizer_config.json',
 'save_tokenizer\\special_tokens_map.json',
 'save_tokenizer\\vocab.txt',
 'save_tokenizer\\added_tokens.json',
 'save_tokenizer\\tokenizer.json')

In [39]:
import pickle

with open("train_encodings.pkl", "wb") as f:
    pickle.dump(train_encodings, f)

with open("val_encodings.pkl", "wb") as f:
    pickle.dump(val_encodings, f)

with open("test_encodings.pkl", "wb") as f:
    pickle.dump(test_encodings, f)

with open("labels.pkl", "wb") as f:
    pickle.dump((train_labels, val_labels, test_labels), f)
