In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stance-detection-dataset/fnc-1-master/scorer.py
/kaggle/input/stance-detection-dataset/fnc-1-master/README.md
/kaggle/input/stance-detection-dataset/fnc-1-master/train_stances.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/test_stances_unlabeled.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/train_bodies.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_stances.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/train_stances.random.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_bodies.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_stances_unlabeled.csv
/kaggle/input/stance-detection-dataset/fnc-1-master/test_bodies.csv


In [2]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetModel, XLNetTokenizer

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'



In [3]:
#load data
df_body_train = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/train_bodies.csv")
df_stance_train = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/train_stances.csv")
df_body_test = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_bodies.csv")
df_stance_test = pd.read_csv("/kaggle/input/stance-detection-dataset/fnc-1-master/competition_test_stances.csv")


# merge the tables by Body ID
train_df = pd.merge(df_body_train, df_stance_train, on='Body ID', how='inner')
test_df = pd.merge(df_body_test, df_stance_test, on='Body ID', how='inner')

# null_counts_train = train_df.isnull().sum() #no nulls
# null_counts_test = test_df.isnull().sum()  #no nulls

total_rows_train = len(train_df)
total_rows_test = len(test_df)

unique_body_ids_train = train_df['Body ID'].nunique()
unique_body_ids_test = test_df['Body ID'].nunique()

print("TRAIN: Total number of rows: ",total_rows_train,", Unique Body IDs:",unique_body_ids_train)
print("TEST: Total number of rows: ",total_rows_test,", Unique Body IDs:",unique_body_ids_test)

# print(train_df.head())
# print(test_df.head())

# convert the last column i.e. the categorical column to a one hot encoded list. 
train_df['list'] = pd.get_dummies(train_df['Stance'],columns=train_df.columns).astype(int).values.tolist()
new_df_train = train_df[['articleBody','Headline', 'list']].copy()
# Passing colums as train.columns so that the encoding is consistent among train and test
test_df['list'] = pd.get_dummies(test_df['Stance'],columns=train_df.columns).astype(int).values.tolist()
new_df_test = test_df[['articleBody','Headline', 'list']].copy()

TRAIN: Total number of rows:  49972 , Unique Body IDs: 1683
TEST: Total number of rows:  25413 , Unique Body IDs: 904


In [4]:
new_df_train.head()

Unnamed: 0,articleBody,Headline,list
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...","[0, 0, 0, 1]"
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,"[0, 0, 0, 1]"
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,"[0, 0, 0, 1]"
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,"[0, 0, 0, 1]"
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,"[0, 0, 0, 1]"


In [5]:
new_df_test.head()

Unnamed: 0,articleBody,Headline,list
0,Al-Sisi has denied Israeli reports stating tha...,Apple installing safes in-store to protect gol...,"[0, 0, 0, 1]"
1,Al-Sisi has denied Israeli reports stating tha...,El-Sisi denies claims he'll give Sinai land to...,"[1, 0, 0, 0]"
2,Al-Sisi has denied Israeli reports stating tha...,Apple to keep gold Watch Editions in special i...,"[0, 0, 0, 1]"
3,Al-Sisi has denied Israeli reports stating tha...,Apple Stores to Keep Gold “Edition” Apple Watc...,"[0, 0, 0, 1]"
4,Al-Sisi has denied Israeli reports stating tha...,South Korean woman's hair 'eaten' by robot vac...,"[0, 0, 0, 1]"


In [6]:
new_df_train['articleBody'].apply(lambda x: len(str(x).split())).max()

4788

In [7]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 4
LEARNING_RATE = 1e-05
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

Downloading spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [8]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.article_body = self.data["articleBody"]
        self.headline = self.data["Headline"]
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.article_body)

    def __getitem__(self, index):
        article_body = str(self.article_body[index])
        article_body = " ".join(article_body.split())
        headline = str(self.headline[index])
        headline = " ".join(headline.split())


        inputs = self.tokenizer(
            article_body, 
            headline,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation='only_first', 
            return_overflowing_tokens=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
# Creating the dataset and dataloader for the neural network

train_dataset=new_df_train.sample(frac=1,random_state=200).reset_index(drop=True)
test_dataset=new_df_test.sample(frac=1,random_state=200).reset_index(drop=True)


# print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (49972, 3)
TEST Dataset: (25413, 3)


In [10]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [11]:
class XLNetClass(torch.nn.Module):
    def __init__(self):
        super(XLNetClass, self).__init__()
        self.l1 = XLNetModel.from_pretrained("xlnet-base-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    
model = XLNetClass()
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

XLNetClass(
  (l1): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (classifier): Linear(in_features=768, out_features=4, bias=True)
)

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [13]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [15]:
accuracy_tracker = []
f1_micro_tracker = []
f1_macro_tracker = []
precision_tracker = []
recall_tracker = []
mcc_tracker = []
logloss_tracker = []
hammingloss_tracker = []

for epoch in range(EPOCHS):
    print("Epoch ",epoch)
    train(epoch)
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    precision_score = metrics.precision_score(targets, outputs, average = 'samples', zero_division = 0)
    recall_score = metrics.recall_score(targets, outputs, average = 'samples')
    # MCC not supported for multiclass
#     mcc_score = metrics.matthews_corrcoef(targets,outputs)
    logloss_score = metrics.log_loss(targets, outputs)
    hammingloss_score = metrics.hamming_loss(targets, outputs)
    accuracy_tracker.append(accuracy)
    f1_micro_tracker.append(f1_score_micro)
    f1_macro_tracker.append(f1_score_macro)
    precision_tracker.append(precision_score)
    recall_tracker.append(recall_score)
#     mcc_tracker.append(mcc_score)
    logloss_tracker.append(logloss_score)
    hammingloss_tracker.append(hammingloss_score)
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Precision = {precision_score}")
    print(f"Recall = {recall_score}")
#     print(f"MCC = {mcc_score}")
    print(f"LogLoss = {logloss_score}")
    print(f"Hamming Loss = {hammingloss_score}")

Epoch  0
Epoch: 0, Loss:  0.971224844455719
Epoch: 0, Loss:  0.014055445790290833
Accuracy Score = 0.9030811002242947
F1 Score (Micro) = 0.9072997734213378
F1 Score (Macro) = 0.6615435731854005
Precision = 0.9045567229370794
Recall = 0.9060323456498642
LogLoss = 3.177845118917059
Hamming Loss = 0.04628536575768308
Epoch  1
Epoch: 1, Loss:  0.00027071835938841105
Epoch: 1, Loss:  0.1473044455051422
Accuracy Score = 0.9176405776571046
F1 Score (Micro) = 0.9239319620253165
F1 Score (Macro) = 0.7396948730848475
Precision = 0.9183685515287451
Recall = 0.9190965254003857
LogLoss = 2.503845198450495
Hamming Loss = 0.03783496635580215
Epoch  2
Epoch: 2, Loss:  0.00010973168537020683
Epoch: 2, Loss:  0.013426825404167175
Accuracy Score = 0.9224019202770236
F1 Score (Micro) = 0.925365488434409
F1 Score (Macro) = 0.7286605206843175
Precision = 0.923228268996183
Recall = 0.9240546177153426
LogLoss = 2.579015923261545
Hamming Loss = 0.037264392240192026
Epoch  3
Epoch: 3, Loss:  0.00010826947982423

KeyboardInterrupt: 