In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline
%config InlineBackend.figure_format='retina'
RANDOM_SEED = 42
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch.autograd import Variable
from sklearn.metrics import roc_auc_score as auroc

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from pytorch_lightning.utilities.warnings import LightningDeprecationWarning
warnings.filterwarnings("ignore", category=LightningDeprecationWarning)
import re
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [2]:
data = pd.read_csv("demo_50000.csv", index_col=0)

# Data Preprocessing

In [3]:
def clean_str(s):
    regs = [
        r'(\/\/www[^\s]+)',
        r'(pic.twitter.com\/[^\s]+)',
        r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)',
        r'(?i)\b((?:https?:\/\/[^\s]+))',
        r'https',
    ]
    
    cleaned_s=s.replace("RT", "")
    cleaned_s=cleaned_s.replace("&amp", "")
    cleaned_s=cleaned_s.replace("\\n", "")
    cleaned_s = " ".join([x[:-2] for x in cleaned_s.split("\\x")])
    cleaned_s=cleaned_s.lower()[1:]
    
    for reg in regs:
        cleaned_s = re.sub(reg, " ", cleaned_s)
    return cleaned_s

In [4]:
data["retweet"] = data["full_text"].str.slice(2,4)=="RT"
data["cleaned_text"]=data["full_text"].apply(clean_str)
data

Unnamed: 0,favorite_count,full_text,hashtags,retweet_count,year,party_id,retweet,cleaned_text
443098,0,"b""If Pres. Obama is serious about lifting the ...",ObamaAboutFace RequireAPlan,5,2013.0,R,False,if pres obama is serious about lifting the s...
136957,27,b'Wishing all those celebrating a happy #Easte...,Easter,7,2018.0,D,False,wishing all those celebrating a happy easter...
459257,0,b'UAM College of Technology in Crossett is hom...,ar4,2,2018.0,R,False,uam college of technology in crossett is home...
29136,1695,b'Helping #PuertoRico should be our primary co...,PuertoRico NotOnePenny,732,2017.0,D,False,helping puertorico should be our primary con...
39352,22,"b""It\xe2\x80\x99s crazy that we have Ohioans w...",POWADA,4,2020.0,R,False,99s crazy that we have ohioans willing abl...
...,...,...,...,...,...,...,...,...
351442,1,b'#Cures is a landmark medical innovation pack...,Cures,2,2016.0,R,False,cures is a landmark medical innovation packa...
274584,2,b'Happy 40th birthday to DFW airport! Honored ...,DFW40,5,2014.0,R,False,happy 40th birthday to dfw airport honored t...
38412,168,b'Reignite Cold War with Russia after email le...,foreignpolicy fail,102,2016.0,R,False,reignite cold war with russia after email lea...
315203,33,b'It was the people who defeated #Trumpcare! W...,Trumpcare BrooklynResists,12,2017.0,D,False,it was the people who defeated trumpcare we...


In [5]:
data.fillna(" ", inplace=True)
data["party_R"]=data["party_id"].apply(lambda x: 0 if x=="D" else 1)
data["party_D"]=data["party_id"].apply(lambda x: 0 if x=="R" else 1)
train_df = data.sample(frac=0.8)
val_df = data.drop(train_df.index)

In [6]:
predict_df = pd.read_csv("test.csv")
predict_df = predict_df.fillna("")
predict_df["retweet"] = predict_df["full_text"].str.slice(2,4)=="RT"
predict_df["cleaned_text"]=predict_df["full_text"].apply(clean_str)

# Network Initialize

In [7]:
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

## Create Dataset for BERT model

In [8]:
class TwitterDataset(Dataset):
    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: BertTokenizer,
                 max_token_len: int = 100
                ):
        
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row.cleaned_text
        label1, label2 = 0, 0
        if "party_id" in data_row:
            label1 = data_row.party_R
            label2 = data_row.party_D
        encoding = self.tokenizer.encode_plus(
                        text,
                        add_special_tokens=True,
                        max_length=self.max_token_len,
                        return_token_type_ids=False,
                        padding="max_length",
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                    )
        

        return dict(
            text=text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            
            labels=torch.FloatTensor(np.array([label1, label2]))
        )

In [9]:
train_dataset = TwitterDataset(data, tokenizer, 100)

## Load pretrained weights

In [10]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Create DataModule (Combination of DataLoaders)

In [11]:
class TwitterDataModule(pl.LightningDataModule):
    def __init__(self, train_df, val_df, predict_df, tokenizer, batch_size=50, max_token_len=100):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.predict_df = predict_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = TwitterDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )
        self.val_dataset = TwitterDataset(
            self.val_df,
            self.tokenizer,
            self.max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
        )




# Hyper parameters for Network

In [12]:
N_EPOCHS = 5
BATCH_SIZE = 30
data_module = TwitterDataModule(
    train_df,
    val_df,
    predict_df,
    tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=100
)

# Define Training/Testing/Validation steps

In [13]:
class TwitterTagger(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(
            BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output
    
    def predict_dataloader(self):
        return DataLoader(
            TwitterDataset(
                predict_df,
                tokenizer,
                100
            ),
            batch_size = 50
        )
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return loss

    def predict_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        _, preds = self(input_ids, attention_mask,)
        return preds.cpu().detach().numpy()

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        
        for i, name in enumerate(["party_R", "party_D"]):
            class_roc_auc = auroc(labels[:, i].numpy(), (predictions[:, i].numpy()>0.5).astype(int))
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)


    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )

# Calculate other numbers

In [14]:
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS

In [15]:
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(1333, 6665)

## Create model

In [16]:
model = TwitterTagger(
  n_classes=2,
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Define callbacks

In [17]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=2,
  verbose=True,
  monitor="val_loss",
  mode="min"
)
early_stopping_callback = EarlyStopping(monitor="val_loss", mode="min")

In [18]:
logger = TensorBoardLogger("lightning_logs", name="Twitter")

# Initialize trainer

In [19]:
trainer = pl.Trainer(
  logger=logger,
  checkpoint_callback=[checkpoint_callback,early_stopping_callback],
  max_epochs=N_EPOCHS,
  gpus=1,
  progress_bar_refresh_rate=30,
)

  rank_zero_deprecation(
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


# Model Fitting

In [20]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 108 M 
1 | classifier | Linear    | 1.5 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.247   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# Model Prediction

In [21]:
trainer.checkpoint_callback.best_model_path

'lightning_logs\\Twitter\\version_16\\checkpoints\\epoch=4-step=6670.ckpt'

In [22]:
trained_model = TwitterTagger.load_from_checkpoint(trainer.checkpoint_callback.best_model_path, n_classes=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
trained_model.eval()
trained_model.freeze()

In [24]:
result = trainer.predict()
result = np.concatenate(result, axis=0)

Restoring states from the checkpoint path at lightning_logs\Twitter\version_16\checkpoints\epoch=4-step=6670.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at lightning_logs\Twitter\version_16\checkpoints\epoch=4-step=6670.ckpt


Predicting: 1334it [00:00, ?it/s]

In [25]:
result = ["R" if x else "D" for x in result[:,0]>result[:,1]]

In [26]:
pd.DataFrame({"party":result}).reset_index().rename(columns={"index":"id"}).to_csv("submission_bert.csv", index=False)