# Transfer Learning - Fake News Detection
Seminar Aktuelle Themen der KI (SS 2023)

Gruppenmitglieder:
- Miguel Meindl
- Simon Wolf
- Tim Staudinger

![](static/steps.png)

### Setup

In [None]:
!pip3 install -r requirements.txt

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModel, BertTokenizerFast, BertTokenizer, BertForNextSentencePrediction, pipeline, AdamW

## 1. Daten
### 1.1 Laden, Zusammenfügen, Labeln

![](static/steps1.png)

In [None]:
# Read the two raw CSV files into DataFrames.
fake_data = pd.read_csv('data/Fake.csv')
true_data = pd.read_csv('data/True.csv')

# Print the first rows of the True.csv frame as a quick sanity check.
print(true_data.head())

In [None]:
# Tag every row with the class of the file it came from; a scalar is
# broadcast to all rows of the frame.
true_data['Target'] = 'True'
fake_data['Target'] = 'Fake'

# Concatenate both frames and shuffle the rows. The shuffle is seeded so that
# the seeded train/validation/test split further down produces the same
# partitions on every run (an unseeded shuffle would change the row order and
# therefore the split each time the notebook is executed).
data = (
    pd.concat([true_data, fake_data])
    .sample(frac=1, random_state=2018)
    .reset_index(drop=True)
)

print(data.shape)
data.head()

In [None]:
# Binary class label: 1 for rows tagged 'Fake', 0 for rows tagged 'True'.
# The comparison is cast to int explicitly: pd.get_dummies returns boolean
# columns by default on pandas >= 2.0, which would later yield boolean label
# tensors that nn.NLLLoss does not accept as class-index targets.
data['label'] = (data['Target'] == 'Fake').astype(int)
data.head()

In [None]:
# Class counts: label == 1 marks fake news, so the column sum is the fake count.
fake_count = data['label'].sum()
true_count = len(data['label']) - fake_count
label_size = [fake_count, true_count]

# Pie chart of the class balance (slice order matches `labels`).
plt.pie(
    label_size,
    explode=[0.1, 0.1],
    colors=['firebrick', 'navy'],
    startangle=90,
    shadow=True,
    labels=['Fake', 'True'],
    autopct='%1.1f%%',
)

### 1.2 Formatieren der Daten für Torch Modell

In [None]:
# First split: 70 % of the titles for training, 30 % held out.
# Stratifying on the class column keeps the class ratio in both parts.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    data['title'],
    data['label'],
    test_size=0.3,
    random_state=2018,
    stratify=data['Target'],
)

# Second split: the held-out part is halved into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=2018,
    stratify=temp_labels,
)

#### 1.2.1 Tokenizer

In [None]:
# Load the fast tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Two short example sentences of different length.
sample_data = [
    "Build fake news model.",
    "Using bert.",
]

# Encode both sentences as one batch, with padding enabled, and show the result.
tokenized_sample_data = tokenizer.batch_encode_plus(sample_data, padding=True)
print(tokenized_sample_data)

#### 1.2.2 Formatieren der Daten

In [None]:
# Number of whitespace-separated words in every training title.
seq_len = [len(headline.split()) for headline in train_text]

# Histogram of the title lengths; used to pick a maximum sequence length.
title_lengths = pd.Series(seq_len)
title_lengths.hist(bins=40, color='firebrick')
plt.xlabel('Number of Words')
plt.ylabel('Number of texts')

In [None]:
MAX_LENGTH = 15


def _encode_split(texts):
    """Tokenize and encode one split (a pandas Series of titles).

    All splits share the same settings: truncation to MAX_LENGTH tokens and
    padding enabled.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding=True,
        truncation=True,
    )


# Encode the train, validation and test titles with identical settings.
tokens_train = _encode_split(train_text)
tokens_val = _encode_split(val_text)
tokens_test = _encode_split(test_text)

# Show the first ten encoded training samples: token ids first, then masks.
for field in ('input_ids', 'attention_mask'):
    for row in tokens_train[field][:10]:
        print(row)

In [None]:
# Turn the encoded splits (Python lists) and their labels into tensors.
_TENSOR_FIELDS = ('input_ids', 'attention_mask')

# Training split
train_seq, train_mask = (torch.tensor(tokens_train[field]) for field in _TENSOR_FIELDS)
train_y = torch.tensor(train_labels.tolist())

# Validation split
val_seq, val_mask = (torch.tensor(tokens_val[field]) for field in _TENSOR_FIELDS)
val_y = torch.tensor(val_labels.tolist())

# Test split
test_seq, test_mask = (torch.tensor(tokens_test[field]) for field in _TENSOR_FIELDS)
test_y = torch.tensor(test_labels.tolist())

In [None]:
# Datasets, samplers and loaders for the training and validation splits.
batch_size = 32

# Each dataset item is an (input ids, attention mask, label) triple.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order, validation batches in fixed order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)

## 2. BERT (Bidirectional Encoder Representations from Transformers)

![](static/steps2.png)

- Veröffentlicht von Google im Jahr 2018
- Basiert auf einer Transformer-Architektur
- Vortrainiert auf einem großen Korpus nicht gelabelter Texte, inklusive der gesamten Wikipedia (2.500 Millionen Wörter) und des BookCorpus
- BookCorpus: Text von ca. 11.000 nicht veröffentlichten Büchern (ca. 985 Millionen Wörter)

Wie wurde BERT trainiert?
- Masked Language Model:
    - 15 % der Wörter wurden während des Trainings zufällig verdeckt
    - BERT muss die verdeckten Wörter vorhersagen
    - Ziel: Erlernen der englischen Sprache und der verwendeten Wörter

- Next Sentence Prediction:
    - BERT muss klassifizieren, ob zwei Sätze aufeinander folgen
    - Ziel: Beziehungen zwischen Sätzen verstehen

### 2.1 Masked Language Model

In [None]:
# Sentence with one [MASK] placeholder for the model to fill in.
text = "Artificial Intelligence [MASK] will take over the world."

# Fill-mask pipeline backed by the 'bert-base-uncased' checkpoint.
unmasker = pipeline('fill-mask', model='bert-base-uncased')

# The cell output shows the candidates returned for the masked position.
unmasker(text)

### 2.2 Next Sentence Prediction

In [None]:
# BERT with its next-sentence-prediction head, plus the matching tokenizer.
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

sentences = [
    "Elon Musk lives in California.",
    "You can't buy anything on sundays in germany.",
    "You are not supposed to work on sundays in germany.",
]

# Check every pair of consecutive sentences.
for first, second in zip(sentences, sentences[1:]):
    inputs = tokenizer(first, second, return_tensors="pt")
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits)

    # Logit index 0 is reported as "the second sentence follows the first".
    verdict = (
        "The sentences belong together."
        if prediction == 0
        else "The sentences do not belong together."
    )
    print(verdict)

## 3. Base Modell

![](static/steps3.png)

In [None]:
class BERT_Arch(nn.Module):
    """Two-class classifier head stacked on top of a BERT encoder.

    The encoder's ``pooler_output`` is passed through a 768 -> 512 linear
    layer, ReLU, dropout, and a 512 -> 2 linear layer. The result is returned
    as log-probabilities (``LogSoftmax`` over dim 1).
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and build the classification head."""
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        # Hidden layer of the head; 768 is the expected width of the pooled output.
        self.fc1 = nn.Linear(768, 512)
        # Output layer: one score per class.
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        ``sent_id`` is passed to the encoder as its input and ``mask`` as its
        ``attention_mask``.
        """
        encoder_out = self.bert(sent_id, attention_mask=mask)
        pooled = encoder_out['pooler_output']
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        return self.softmax(self.fc2(hidden))

In [None]:
# Pretrained BERT encoder used as the feature extractor.
bert = AutoModel.from_pretrained('bert-base-uncased')

# Full model: encoder plus the classification head defined in BERT_Arch.
model = BERT_Arch(bert)

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the defaults of the deprecated implementation so the optimizer configuration
# stays as close as possible to the previous one.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# NLLLoss pairs with the LogSoftmax output of BERT_Arch.
cross_entropy = nn.NLLLoss()
epochs = 2

## 4. Freeze Layers

![](static/steps4.png)

In [None]:
# Exclude every encoder weight from gradient computation, so that only the
# layers added on top of `bert` are updated during training.
for encoder_weight in bert.parameters():
    encoder_weight.requires_grad = False

## 5. Neues Modell trainieren

![](static/steps5.png)

### 5.1 Train- und Evaluierungsfunktion

In [None]:
# Defining training and evaluation functions
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``cross_entropy`` and ``optimizer``.
    Prints a progress line every 50 batches (not for the first batch).

    Returns:
        The mean loss over all batches of the epoch.
    """
    model.train()
    total_loss = 0.0
    num_batches = len(train_dataloader)

    for step, (sent_id, mask, labels) in enumerate(train_dataloader):
        if step % 50 == 0 and step != 0:
            print(f'  Batch {step:>5,}  of  {num_batches:>5,}.')

        # Reset gradients left over from the previous step.
        model.zero_grad()
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return total_loss / num_batches

def evaluate():
    """Compute the mean loss of ``model`` over ``val_dataloader``.

    Puts the notebook-level ``model`` into evaluation mode and runs every
    batch without gradient tracking. Prints a progress line every 50 batches
    (not for the first batch).

    Returns:
        The mean ``cross_entropy`` loss over all validation batches.
    """
    print("\nEvaluating...")
    model.eval()
    total_loss = 0.0
    num_batches = len(val_dataloader)

    # No gradients are needed for validation.
    with torch.no_grad():
        for step, (sent_id, mask, labels) in enumerate(val_dataloader):
            if step % 50 == 0 and step != 0:
                print(f'  Batch {step:>5,}  of  {num_batches:>5,}.')

            preds = model(sent_id, mask)
            total_loss += cross_entropy(preds, labels).item()

    return total_loss / num_batches

### 5.2 Training

In [None]:
# Train and predict
import os

best_valid_loss = float('inf')
train_losses = []
valid_losses = []

# Make sure the checkpoint directory exists; saving the weights below fails
# if the parent directory of the target file is missing.
os.makedirs('models', exist_ok=True)

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss = train()
    valid_loss = evaluate()

    # Keep only the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'models/model_weights.pt')

    # Record both losses for every epoch.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

### 5.3 Performance auf Test-Datensatz

In [None]:
# Load the weights of the best model saved during training.
path = 'models/model_weights.pt'
model.load_state_dict(torch.load(path))

# Switch to evaluation mode explicitly so dropout is disabled even when this
# cell is run on a freshly constructed model (modules start in training mode).
model.eval()

# Run the whole test split through the model in one pass, without gradients.
with torch.no_grad():
    preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()

# The predicted class is the index of the largest log-probability per row.
preds = np.argmax(preds, axis=1)
print(classification_report(test_y, preds))

## 6. Fake News Detection

![](static/steps6.png)

In [None]:
unseen_news_text = ["Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing",     # Fake
                    "WATCH: George W. Bush Calls Out Trump For Supporting White Supremacy",               # Fake
                    "U.S. lawmakers question businessman at 2016 Trump Tower meeting: sources",           # True
                    "Trump administration issues new rules on U.S. visa waivers"                          # True
                    ]

# Tokenize and encode the unseen headlines.
MAX_LENGTH = 15
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokens_unseen = tokenizer.batch_encode_plus(
    unseen_news_text,
    max_length=MAX_LENGTH,
    # padding='max_length' is the replacement for the deprecated
    # pad_to_max_length=True flag and pads every sequence to MAX_LENGTH.
    padding='max_length',
    truncation=True,
)

unseen_seq = torch.tensor(tokens_unseen['input_ids'])
unseen_mask = torch.tensor(tokens_unseen['attention_mask'])

# Switch to evaluation mode explicitly so dropout is disabled during inference,
# regardless of which cells were executed before this one.
model.eval()

with torch.no_grad():
    preds = model(unseen_seq, unseen_mask)
    preds = preds.detach().cpu().numpy()

# The predicted class is the index of the largest log-probability per row.
preds = np.argmax(preds, axis=1)

# Class 0 is "True" and class 1 is "Fake" (the label column marks fake rows with 1).
for pred in preds:
    print("True" if pred == 0 else "Fake")