# Fine-Tuning a Transformer Model for Regression

In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import zipfile
from scipy.sparse import load_npz

In [5]:
# Unzipping files
with zipfile.ZipFile('../02_data/article_embeddings.zip', 'r') as zip_ref:
    zip_ref.extractall('../02_data/article_embeddings')

article_embeddings_path = '../02_data/article_embeddings/article_embeddings.txt'
    
with open(article_embeddings_path, 'r') as file:
    article_data = json.load(file)
    
article_embeddings = pd.DataFrame(article_data)
article_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091004 entries, 0 to 1091003
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   url_index          1091004 non-null  int64 
 1   url                1091004 non-null  object
 2   article            1091004 non-null  object
 3   article_embedding  1091004 non-null  object
dtypes: int64(1), object(3)
memory usage: 33.3+ MB


In [7]:
url_domains = pd.read_csv('../02_data/url_domains.csv')
url_domains.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18141615 entries, 0 to 18141614
Data columns (total 19 columns):
 #   Column               Dtype  
---  ------               -----  
 0   index                int64  
 1   url                  object 
 2   domain               object 
 3   messages             int64  
 4   chats                int64  
 5   avalanches           int64  
 6   mean_avalanche_size  float64
 7   top_avalanche_size   int64  
 8   virality             float64
 9   top_shares_1h        int64  
 10  top_shares_6h        int64  
 11  top_shares_1d        int64  
 12  top_shares_3d        int64  
 13  top_shares_14d       int64  
 14  first_share_date     object 
 15  final_share_date     object 
 16  pc1                  float64
 17  rank                 float64
 18  year                 int64  
dtypes: float64(4), int64(11), object(4)
memory usage: 2.6+ GB


In [8]:
data = pd.merge(
    article_embeddings,
    url_domains[['url', 'domain', 'index', 'pc1']],
    on='url',
    how='inner'
)#.rename(columns={'index': 'url_index'})

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091004 entries, 0 to 1091003
Data columns (total 7 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   url_index          1091004 non-null  int64  
 1   url                1091004 non-null  object 
 2   article            1091004 non-null  object 
 3   article_embedding  1091004 non-null  object 
 4   domain             1091004 non-null  object 
 5   index              1091004 non-null  int64  
 6   pc1                1091004 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 58.3+ MB


In [15]:
# Step 1: Import libraries and setup

# Check if GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['article'], data['pc1'], test_size=0.2, random_state=42
)

## remove special symbols
maybe balancing 

In [16]:
# Step 3: Tokenize data
model_name = "bert-base-uncased"  # Replace with your desired model
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512, return_tensors="pt")


In [17]:
# Step 4: Create datasets
class PC1Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

train_dataset = PC1Dataset(train_encodings, train_labels.tolist())
val_dataset = PC1Dataset(val_encodings, val_labels.tolist())

In [18]:
# Step 5: Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [19]:
# Step 6: Prepare optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.MSELoss()



In [None]:
# Step 7: Train the model
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits.squeeze(), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader)}")

In [None]:
# Step 8: Evaluate the model
model.eval()
val_loss = 0

with torch.no_grad():
    for batch in val_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits.squeeze(), labels)
        val_loss += loss.item()

print(f"Validation Loss: {val_loss / len(val_loader)}")

In [None]:
# Step 9: Make predictions
test_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_dataset = PC1Dataset(test_encodings, val_labels.tolist())
test_loader = DataLoader(test_dataset, batch_size=64)

predictions = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        outputs = model(**inputs)
        preds = outputs.logits.squeeze().cpu().numpy()
        predictions.extend(preds)

print(predictions)