In [None]:
import os
import shutil

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
# Read the tweet/author table and discard every row containing a missing value.
df = pd.read_csv("author.csv").dropna()

In [None]:
# Show (rows, columns) of the frame after the rows with missing values were dropped.
df.shape

(16635, 2)

In [None]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,tweets,user_id,label
0,"Sharing dressing room with #Afridi, #Tahir and...",2367595410,34
1,"Luckily you are not in the #PSL anymore, #Afri...",2367595410,34
2,"Test cricket is first choice, #Dhani confirms ...",2367595410,34
3,"Don't forget he can keep as well, #Yasir tells...",2367595410,34
4,"Not exactly where we want him to be, but there...",2367595410,34


In [None]:
# Fit the encoder on the author ids, then store the resulting integer class
# index in the 'label' column (overwriting the values read from the CSV).
label_encoder = LabelEncoder().fit(df['user_id'])
df['label'] = label_encoder.transform(df['user_id'])

In [None]:
# Persist the fitted encoder so predicted class indices can later be mapped
# back to user ids without refitting.
joblib.dump(label_encoder, 'label_encoder.joblib')

['label_encoder.joblib']

In [None]:
# Re-inspect the frame now that 'label' has been re-encoded from 'user_id'.
df.head()

Unnamed: 0,tweets,user_id,label
0,"Sharing dressing room with #Afridi, #Tahir and...",2367595410,34
1,"Luckily you are not in the #PSL anymore, #Afri...",2367595410,34
2,"Test cricket is first choice, #Dhani confirms ...",2367595410,34
3,"Don't forget he can keep as well, #Yasir tells...",2367595410,34
4,"Not exactly where we want him to be, but there...",2367595410,34


In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# encoded label and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: Indexable collection of tweet texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this
            notebook).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token ids, attention mask and label for item ``idx``."""
        tweet_text = str(self.texts[idx])
        class_id = self.labels[idx]

        encoded = self.tokenizer.encode_plus(
            tweet_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so that batching adds the
        # only batch dimension.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(class_id, dtype=torch.long)
        return item

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=None, num_workers=4):
    """Wrap a tweets DataFrame in a ``TweetDataset`` and return a ``DataLoader``.

    Args:
        df: DataFrame with a ``tweets`` column (text) and a ``label`` column
            (class ids).
        tokenizer: Tokenizer handed to ``TweetDataset``.
        max_len: Sequence length handed to ``TweetDataset``.
        batch_size: Number of samples per batch.
        shuffle: ``True`` draws samples with ``RandomSampler``; ``False``
            iterates in order with ``SequentialSampler``. With ``None`` (the
            default) the legacy rule applies: shuffle only when ``df`` is the
            very object bound to the notebook-level name ``train_df``.
        num_workers: Worker processes used by the ``DataLoader``.

    Returns:
        A ``DataLoader`` over the resulting ``TweetDataset``.
    """
    ds = TweetDataset(
        texts=df.tweets.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    if shuffle is None:
        # Backward-compatible default. Looking the name up via globals() means
        # a missing `train_df` selects sequential order instead of raising.
        shuffle = df is globals().get('train_df')

    sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=num_workers,
        sampler=sampler,
    )

In [None]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint
# (downloaded on first use, as the progress bars below show).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
MAX_LEN = 128  # token length every tweet is padded/truncated to
BATCH_SIZE = 16  # samples per batch for both loaders

# create_data_loader shuffles only the frame that is the global `train_df`,
# so the training loader is random and the validation loader is sequential.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)



In [None]:
# Pretrained BERT encoder plus a classification head with one output per
# encoded author id. The head is newly initialised (see the message below),
# so the model has to be fine-tuned before it is useful.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_)
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# AdamW over all model parameters at lr 2e-5, with bias correction switched off.
# NOTE(review): transformers' own AdamW is deprecated in favour of
# torch.optim.AdamW in recent releases - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so its length is batches-per-epoch
# times the number of epochs.
# NOTE(review): the literal 5 duplicates EPOCHS, which is only defined in a
# later cell - keep the two in sync.
total_steps = len(train_data_loader) * 5  # Assuming 5 epochs

# Linear learning-rate schedule over total_steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)



In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

### Training Loop with Best Model Saving

In [None]:
EPOCHS = 5  # number of passes over the training loader
best_accuracy = 0.0  # best validation accuracy seen so far
best_model_dir = '/content/best_model/'  # where the best checkpoint is written

# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(best_model_dir, exist_ok=True)

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader``.

    For every batch the model is called with labels (so it returns the loss),
    the loss is back-propagated, the optimizer and scheduler each take one
    step, and the gradients are cleared.

    Args:
        model: Model whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``; must expose ``.dataset``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)``: the number of correct argmax predictions
        divided by ``len(data_loader.dataset)`` (a float64 tensor) and the
        NumPy mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        result = model(input_ids=token_ids, attention_mask=mask, labels=targets)
        batch_loss = result.loss

        # Predicted class = index of the largest logit in each row.
        predicted = result.logits.argmax(dim=1)
        n_correct += (predicted == targets).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / len(data_loader.dataset)
    return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    The model is switched to eval mode and every batch is run under
    ``torch.no_grad()``.

    Args:
        model: Model whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``; must expose ``.dataset``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``: the number of correct argmax predictions
        divided by ``len(data_loader.dataset)`` (a float64 tensor) and the
        NumPy mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            result = model(input_ids=token_ids, attention_mask=mask, labels=targets)

            # Predicted class = index of the largest logit in each row.
            predicted = result.logits.argmax(dim=1)
            n_correct += (predicted == targets).sum()
            batch_losses.append(result.loss.item())

    accuracy = n_correct.double() / len(data_loader.dataset)
    return accuracy, np.mean(batch_losses)

In [None]:
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training loader.
    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the current weights on the held-out split.
    val_acc, val_loss = eval_model(model, val_data_loader, device)
    print(f'Val   loss {val_loss} accuracy {val_acc}')

    # Checkpoint only on a strict improvement in validation accuracy, so the
    # directory always holds the best epoch so far. The tokenizer is saved
    # next to the weights so the directory can be reloaded on its own.
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        model.save_pretrained(best_model_dir)
        tokenizer.save_pretrained(best_model_dir)
        print(f"Best model saved with accuracy: {best_accuracy}")

    print()

Epoch 1/5
----------




Train loss 1.0874571474537684 accuracy 0.7439885782987676
Val   loss 0.4258893792732404 accuracy 0.8632401562969642
Best model saved with accuracy: 0.8632401562969642

Epoch 2/5
----------
Train loss 0.35691826867584425 accuracy 0.8823264201983769
Val   loss 0.3030940950848162 accuracy 0.8887886985272017
Best model saved with accuracy: 0.8887886985272017

Epoch 3/5
----------
Train loss 0.2587697493928807 accuracy 0.9070483919446949
Val   loss 0.2697473428427027 accuracy 0.8975052599939886
Best model saved with accuracy: 0.8975052599939886

Epoch 4/5
----------
Train loss 0.21718999505705702 accuracy 0.9168169522091975
Val   loss 0.26137147893538126 accuracy 0.8996092575894199
Best model saved with accuracy: 0.8996092575894199

Epoch 5/5
----------
Train loss 0.19753888444728757 accuracy 0.9226029455966336
Val   loss 0.2543978324437586 accuracy 0.8975052599939886



### Load and Evaluate the Best Model on the Validation Set

In [None]:
# Replace the in-memory (last-epoch) weights with the best checkpoint written
# during training, place it on the active device and disable dropout.
model = BertForSequenceClassification.from_pretrained(best_model_dir).to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Gather predictions and ground truth for the whole validation split.
# Per-batch tensors are moved to the CPU and concatenated once at the end.
y_preds = []
y_true = []

model.eval()  # idempotent; guarantees dropout is off if this cell is run on its own
with torch.no_grad():
    for batch in val_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']  # only compared on the CPU, so no device copy

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)

        y_preds.append(preds.cpu())
        y_true.append(labels.cpu())

y_preds = torch.cat(y_preds)
y_true = torch.cat(y_true)

print('Classification Report:')
print(classification_report(
    y_true.numpy(),
    y_preds.numpy(),
    # Listing every class index keeps target_names aligned even if a class is
    # absent from the validation labels or predictions.
    labels=np.arange(len(label_encoder.classes_)),
    target_names=[str(name) for name in label_encoder.classes_],
    # Classes that were never predicted score 0 without an undefined-metric warning.
    zero_division=0,
))

Classification Report:
                     precision    recall  f1-score   support

         1017531362       0.93      0.93      0.93        60
1046204298370080768       1.00      1.00      1.00        60
1060138959064449025       0.97      1.00      0.98        60
1102222309266522114       0.98      1.00      0.99        60
1125727031407173634       1.00      1.00      1.00        60
1154699420136595456       1.00      1.00      1.00        60
1222433056012660737       1.00      1.00      1.00        60
         1228093296       0.98      0.88      0.93        60
          124091966       1.00      1.00      1.00        60
1248671956087570433       1.00      1.00      1.00        60
1257065823010725891       1.00      1.00      1.00        60
1281947625512751105       0.98      1.00      0.99        60
1331461667750817794       0.98      0.98      0.98        60
1373010725531234308       0.92      0.93      0.93        60
         1377002137       0.95      0.97      0.96        60


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Testing

In [None]:
# Reload the fine-tuned model and its tokenizer from the checkpoint directory
# written by the training loop (same directory as best_model_dir, spelled
# here without the trailing slash).
model_dir = '/content/best_model'
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
# Switch to evaluation mode; as the last expression this also displays the
# model architecture.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Pick the GPU when available (CPU otherwise) and move the freshly loaded
# model there; preprocess_tweet places its tensors on this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Load the label encoder that was fitted and saved earlier in this notebook.
# joblib.load unpickles the file, which can execute arbitrary code - only
# load artifacts you created yourself.
label_encoder = joblib.load('label_encoder.joblib')

In [None]:
def preprocess_tweet(tweet, max_len=128):
    """Tokenize a single tweet into model inputs placed on ``device``.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        tweet: Tweet text. It is passed through ``str()`` first, mirroring
            ``TweetDataset``, so non-string values do not break tokenization.
        max_len: Length the token sequence is padded/truncated to. The default
            of 128 is the value the training loaders were built with.

    Returns:
        ``(input_ids, attention_mask)`` as produced by the tokenizer with
        ``return_tensors='pt'``, both moved to ``device``.
    """
    encoding = tokenizer.encode_plus(
        str(tweet),
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoding['input_ids'].to(device), encoding['attention_mask'].to(device)

In [None]:
def predict_user(tweet):
    """Return the user id the fine-tuned model predicts for ``tweet``.

    The tweet is tokenized with ``preprocess_tweet``, run through the
    notebook-level ``model`` without gradient tracking, and the index of the
    most probable class is mapped back to a user id with ``label_encoder``.
    """
    ids, mask = preprocess_tweet(tweet)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask).logits

    class_probs = torch.softmax(logits, dim=1)
    # A single tweet was passed, so take the first (only) row's winner.
    best_class = class_probs.argmax(dim=1).cpu().numpy()[0]

    return label_encoder.inverse_transform([best_class])[0]

In [None]:
# Smoke test: run one hand-written tweet through the prediction pipeline.
tweet = "Excited to see the new features in the upcoming PSL season!"
predicted_user = predict_user(tweet)
print(f"Predicted User ID: {predicted_user}")

Predicted User ID: 1060138959064449025


### Testing from dataframe

In [None]:
# Draw 5 rows from the full frame with a fixed seed so the same rows come back
# on every run.
# NOTE(review): df also contains the training rows, so matches here are not a
# held-out accuracy estimate.
small_df = df.sample(n=5, random_state=42)  # Randomly sample 5 rows for testing
small_df.head()

Unnamed: 0,tweets,user_id,label
2554,"USGS reports a M1.43 earthquake, 16km NNW of W...",1414684496,16
9976,RUPERT SPIRA VIDEO: AWARENESS EXPERIENCES THE ...,3366327531,39
15262,#Fitness You are really risking all your good ...,892146867995332608,52
1199,You must have a strong WHY – Weight loss succe...,author1,56
1056,Eating Garlic Makes Men Smell More Attractive ...,author1,56


In [None]:
# Function to verify the predictions
def verify_predictions(small_df):
    """Attach the model's predicted author to every row of ``small_df``.

    Args:
        small_df: DataFrame with a ``tweets`` column.

    Returns:
        A new DataFrame with the same rows and columns plus a
        ``predicted_user_id`` column holding ``predict_user(tweet)`` for each
        row. The input frame is not modified.
    """
    predictions = [predict_user(tweet) for tweet in small_df['tweets']]
    # assign() builds a new frame, so the caller's (sampled) frame is never
    # written to in place.
    return small_df.assign(predicted_user_id=predictions)

In [None]:
# Predict an author for each sampled tweet and show the true id next to the
# predicted id for a quick visual comparison.
verified_df = verify_predictions(small_df)
verified_df[['user_id', 'predicted_user_id', 'tweets']]

Unnamed: 0,user_id,predicted_user_id,tweets
2554,1414684496,1414684496,"USGS reports a M1.43 earthquake, 16km NNW of W..."
9976,3366327531,3366327531,RUPERT SPIRA VIDEO: AWARENESS EXPERIENCES THE ...
15262,892146867995332608,892146867995332608,#Fitness You are really risking all your good ...
1199,author1,author1,You must have a strong WHY – Weight loss succe...
1056,author1,author1,Eating Garlic Makes Men Smell More Attractive ...


### Saving the model

In [None]:
# Zip the checkpoint directory. Both the archive base name and the root are
# taken from model_dir (which has no trailing slash), so the result is
# '<model_dir>.zip' regardless of the current working directory.
shutil.make_archive(model_dir, 'zip', root_dir=model_dir)

'/content/best_model.zip'

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive so the model
# can be copied there. The import is kept in this cell because google.colab
# is not importable outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Recursively copy the checkpoint directory into the mounted Drive.
# NOTE(review): 'best_model' is resolved against the current working
# directory - presumably /content; confirm before running elsewhere.
!cp -r best_model /content/drive/MyDrive/