# Tokenization and Encoding

## Loading Preprocessed Data

### Import Libraries

In [1]:
import pandas as pd
from transformers import BertTokenizer
import torch
import pickle

### Load Preprocessed Data

In [2]:
# Read the preprocessed train/test splits in one pass; the tuple order
# below determines which frame is bound to which name.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed_train.csv',
        '../data/preprocessed_test.csv',
    )
)

## Tokenization

### Initialize Tokenizer

In [3]:
# Build the tokenizer from the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

### Tokenize and Encode

In [4]:
def tokenize_and_encode(texts, max_length=256):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Missing entries (``NaN``/``None``) are replaced with empty strings and
    every entry is coerced to ``str`` before tokenization, so a single
    blank row does not abort the whole batch.

    Args:
        texts: A pandas Series of texts (any sequence accepted by
            ``pd.Series`` also works).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256.

    Returns:
        The tokenizer output for the batch, requested with special tokens,
        an attention mask, and PyTorch tensors (``return_tensors='pt'``).
    """
    # Empty text cells are read back from CSV as NaN (a float) by pandas,
    # and the tokenizer only accepts strings. Fill first, then cast:
    # casting first would turn NaN into the literal string 'nan'.
    cleaned_texts = pd.Series(texts).fillna('').astype(str).tolist()
    return tokenizer(
        cleaned_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Encode the 'clean_text' column of each split: training first, then testing.
train_encodings, test_encodings = (
    tokenize_and_encode(split_df['clean_text'])
    for split_df in (train_df, test_df)
)

## Saving Tokenized Data

### Save Encodings and Labels

In [5]:
def _write_pickle(payload, destination):
    """Serialize ``payload`` with pickle into the file at ``destination``."""
    with open(destination, 'wb') as sink:
        pickle.dump(payload, sink)


# Training split: encodings first, then the labels as a plain list.
_write_pickle(train_encodings, '../data/train_encodings.pkl')
train_labels = train_df['label'].tolist()
_write_pickle(train_labels, '../data/train_labels.pkl')

# Testing split: same layout as the training split.
_write_pickle(test_encodings, '../data/test_encodings.pkl')
test_labels = test_df['label'].tolist()
_write_pickle(test_labels, '../data/test_labels.pkl')