## Data Ingestion

In [5]:
import pandas as pd

column_names = ['id', 'text', 'label']
twitter_15_train = pd.read_csv("../twitter15/twitter15.train", sep="\t", 
                               header=None, names=column_names)
twitter_15_test = pd.read_csv("../twitter15/twitter15.test", sep="\t", 
                              header=None, names=column_names)
twitter_15_dev = pd.read_csv("../twitter15/twitter15.dev", sep="\t", 
                             header=None, names=column_names)

In [6]:
twitter_15_train.head()

Unnamed: 0,id,text,label
0,724703995147751424,"american family association gets 500,000 to si...",unverified
1,358591089462099968,this week's top story: george zimmerman wins f...,false
2,775672628493357057,clinton hides failing health? full disclosure ...,unverified
3,364589696573124609,fukushima: highly radioactive water seeping in...,false
4,549927969032916993,a transgender 17-year old left a suicide note ...,unverified


In [7]:
twitter_15_train.shape, twitter_15_test.shape, twitter_15_dev.shape

((1005, 3), (336, 3), (149, 3))

In [8]:
twitter_16_train = pd.read_csv("../twitter16/twitter16.train", sep="\t", 
                               header=None, names=column_names)
twitter_16_test = pd.read_csv("../twitter16/twitter16.test", sep="\t", 
                              header=None, names=column_names)
twitter_16_dev = pd.read_csv("../twitter16/twitter16.dev", sep="\t", 
                             header=None, names=column_names)

In [9]:
twitter_16_train.head()

Unnamed: 0,id,text,label
0,692929779696275456,ohio lawmakers want to know why state’s epa di...,non-rumor
1,693858804279201794,poor women in india are fighting for the right...,non-rumor
2,693648684857323521,spoiler alert: leo and kate were ridiculously ...,non-rumor
3,620367840902782976,translucent butterfly - beautiful! ' URL,false
4,693939356390653952,michael oher got a text from cam newton during...,non-rumor


In [10]:
twitter_16_train.shape, twitter_16_test.shape, twitter_16_dev.shape

((552, 3), (184, 3), (82, 3))

In [13]:
twitter_train = pd.concat([twitter_15_train, twitter_16_train], ignore_index=True, axis=0)
twitter_test = pd.concat([twitter_15_test, twitter_16_test], ignore_index=True, axis=0)
twitter_dev = pd.concat([twitter_15_dev, twitter_15_test], ignore_index=True, axis=0)

In [14]:
twitter_train.shape, twitter_test.shape, twitter_dev.shape

((1557, 3), (520, 3), (485, 3))

## Data Preparation

In [18]:
LABELS = twitter_train['label'].unique().tolist()
train_label = twitter_train['label'].replace(LABELS, [0, 1, 2, 3]).tolist()

  train_label = twitter_train['label'].replace(LABELS, [0, 1, 2, 3]).tolist()


In [15]:
train_data = twitter_train['text'].tolist()

In [20]:
dev_data = twitter_dev['text'].tolist()
dev_label = twitter_dev['label'].replace(LABELS, [0, 1, 2, 3]).tolist()

  dev_label = twitter_dev['label'].replace(LABELS, [0, 1, 2, 3]).tolist()


In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class ClassificationDataset(Dataset):
    def __init__(self, texts: list[str], labels: list[int], tokenizer, max_length: int):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=self.max_length, 
            return_token_type_ids=False, 
            padding="max_length",
            truncation=True, 
            return_attention_mask=True, 
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

## Model Preparation