In [1]:
import numpy as np 
import pandas as pd

real_data = pd.read_csv('True.csv')
fake_data = pd.read_csv('Fake.csv') 

In [2]:
fake_data.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [3]:
# fake데이터와 true데이터의 개수 동일하게 설정
nb_articles = min(len(real_data), len(fake_data))
real_data = real_data[:nb_articles]
fake_data = fake_data[:nb_articles]

In [4]:
# row에 'is_fake' row 추가하여 값 초기화
real_data['is_fake'] = False
fake_data['is_fake'] = True

In [5]:
real_data.head()

Unnamed: 0,title,text,subject,date,is_fake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",False
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",False
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",False
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",False
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",False


In [6]:
fake_data.head()

Unnamed: 0,title,text,subject,date,is_fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",True
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",True
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",True
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",True
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",True


In [7]:
from sklearn.utils import shuffle

data = pd.concat([real_data, fake_data])

# fake, true 데이터를 합치고, 무작위 셔플
data = shuffle(data).reset_index(drop=True)
data.head()

Unnamed: 0,title,text,subject,date,is_fake
0,Five killed in sectarian attack in Pakistan,"QUETTA, Pakistan (Reuters) - (This October 9 s...",worldnews,"October 9, 2017",False
1,May's party suspends two EU lawmakers over Bre...,BRUSSELS (Reuters) - Britain s ruling Conserva...,worldnews,"October 8, 2017",False
2,Factbox: Trump on Twitter (Sept 20) - Graham-C...,The following statements were posted to the ve...,politicsNews,"September 20, 2017",False
3,Hostility grows towards Syrian refugees in Leb...,BEIRUT (Reuters) - Abu Yazan has rarely steppe...,worldnews,"August 28, 2017",False
4,Atlantic City mayor terms N.J. takeover plan '...,"ATLANTIC CITY, N.J. (Reuters) - Atlantic City’...",politicsNews,"February 22, 2016",False


In [8]:
# 학습데이터 60%, 검증데이터 20%, 테스트데이터 20% 로 분리
train_data, validate_data, test_data = np.split(data.sample(frac=1), [int(.6*len(data)), int(.8*len(data))])

# reset_index하여 각 데이터별 인덱스 정렬
train_data = train_data.reset_index(drop=True)
validate_data = validate_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

del real_data
del fake_data

print("Size of training set : {}".format(len(train_data)))
print("Size of validation set: {}".format(len(validate_data)))
print("Size of testing set: {}".format(len(test_data)))

Size of training set : 25700
Size of validation set: 8567
Size of testing set: 8567


In [9]:
!conda install -y pytorch torchvision cudatoolkit=10.1 -c pytorch
!pip install transformers

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.cuda.is_available()
# 쿠다없으면 cpu 사용
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model.config.num_labels = 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
# Freeze the pre trained parameters
for param in model.parameters():
    param.requires_grad = False

model = model.to(device)

In [12]:
criterion = nn.MSELoss().to(device)
optimizer = optim.SGD(model.classifier.parameters(), lr=0.01)

In [13]:
def preprocess_text(text):
    parts = []

    text_len = len(text.split(' '))
    delta = 300
    max_parts = 5
    nb_cuts = int(text_len / delta)
    nb_cuts = min(nb_cuts, max_parts)
    
    
    for i in range(nb_cuts + 1):
        text_part = ' '.join(text.split(' ')[i * delta: (i + 1) * delta])
        parts.append(tokenizer.encode(text_part, return_tensors="pt", max_length=500).to(device))

    return parts

In [15]:
print_every = 300
total_loss = 0
all_losses = []
CUDA_LAUNCH_BLOCKING=1
model.train()

for idx, row in train_data.iterrows():
    text_parts = preprocess_text(str(row['text']))
    label = torch.tensor([row['is_fake']]).long().to(device)

    optimizer.zero_grad()

    overall_output = torch.zeros((1, 2)).float().to(device)
    for part in text_parts:
        if len(part) > 0:
            try:
                input = part.reshape(-1)[:512].reshape(1, -1)
                # print(input.shape)
                overall_output += model(input, labels=label)[1].float().to(device)
            except Exception as e:
                print(str(e))

#     overall_output /= len(text_parts)
    overall_output = F.softmax(overall_output[0], dim=-1)

    if label == 0:
        label = torch.tensor([1.0, 0.0]).float().to(device)
    elif label == 1:
        label = torch.tensor([0.0, 1.0]).float().to(device)

    # print(overall_output, label)

    loss = criterion(overall_output, label)
    total_loss += loss.item()

    optimizer.step()

    if idx % print_every == 0 and idx > 0:
        average_loss = total_loss / print_every
        print("{}/{}. Average loss: {}".format(idx, len(train_data), average_loss))
        all_losses.append(average_loss)
        total_loss = 0

300/25700. Average loss: 0.2574516536295414
600/25700. Average loss: 0.2625815826530258
900/25700. Average loss: 0.2609694382486244
1200/25700. Average loss: 0.27644607910265523
1500/25700. Average loss: 0.26010702732329566
1800/25700. Average loss: 0.2695156510019054
2100/25700. Average loss: 0.2676220900642996
2400/25700. Average loss: 0.26177932251865665
2700/25700. Average loss: 0.2726197820405165
3000/25700. Average loss: 0.2647109723463654
3300/25700. Average loss: 0.2547325244049231
3600/25700. Average loss: 0.2635251714165012
3900/25700. Average loss: 0.2631588336452842
4200/25700. Average loss: 0.27215076972730456
4500/25700. Average loss: 0.2591610004007816
4800/25700. Average loss: 0.2656474148730437
5100/25700. Average loss: 0.2678726223980387
5400/25700. Average loss: 0.2655232570009927
5700/25700. Average loss: 0.2728103396855295
6000/25700. Average loss: 0.25621355017026265
6300/25700. Average loss: 0.2680250924422095
6600/25700. Average loss: 0.2600965336461862
6900/257

In [16]:
total = len(test_data)
number_right = 0
model.eval()
# 서버의 메모리를 아끼기 위해
# 예측을 실행하기 전에 그라디언트를 정보를 저장하지 않는다.
# torch.no_grad()
with torch.no_grad():
    for idx, row in test_data.iterrows():
        text_parts = preprocess_text(str(row['text']))
        label = torch.tensor([row['is_fake']]).float().to(device)
        
        overall_output = torch.zeros((1,2)).to(device)
        try:
            for part in text_parts:
                if len(part) > 0:
                    overall_output += model(part.reshape(1, -1))[0]
        except RuntimeError:
            print("GPU out of memory, skipping this entry.")
            continue
            
        overall_output = F.softmax(overall_output[0], dim=-1)
            
        result = overall_output.max(0)[1].float().item()
 
        if result == label.item():
            number_right += 1
            
        if idx % print_every == 0 and idx > 0:
            print("{}/{}. Current accuracy: {}".format(idx, total, number_right / idx))
            
print("Accuracy on test data: {}".format(number_right / total))

300/8567. Current accuracy: 0.5133333333333333
600/8567. Current accuracy: 0.49833333333333335
900/8567. Current accuracy: 0.4922222222222222
1200/8567. Current accuracy: 0.49666666666666665
1500/8567. Current accuracy: 0.49466666666666664
1800/8567. Current accuracy: 0.49444444444444446
2100/8567. Current accuracy: 0.48857142857142855
2400/8567. Current accuracy: 0.49083333333333334
2700/8567. Current accuracy: 0.4862962962962963
3000/8567. Current accuracy: 0.49166666666666664
3300/8567. Current accuracy: 0.4896969696969697
3600/8567. Current accuracy: 0.4938888888888889
3900/8567. Current accuracy: 0.4958974358974359
4200/8567. Current accuracy: 0.49547619047619046
4500/8567. Current accuracy: 0.4955555555555556
4800/8567. Current accuracy: 0.49395833333333333
5100/8567. Current accuracy: 0.49745098039215685
5400/8567. Current accuracy: 0.4998148148148148
5700/8567. Current accuracy: 0.4992982456140351
6000/8567. Current accuracy: 0.499
6300/8567. Current accuracy: 0.500317460317460

In [17]:
model_softmax = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model_softmax.config.num_labels = 1

model_softmax.classifier = nn.Sequential(
    nn.Softmax(dim=1)
)

model_softmax = model_softmax.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
print_every = 300
total_loss = 0
all_losses_softmax = []
CUDA_LAUNCH_BLOCKING=1
model_softmax.train()

for idx, row in train_data.iterrows():
    text_parts = preprocess_text(str(row['text']))
    label = torch.tensor([row['is_fake']]).long().to(device)

    optimizer.zero_grad()

    overall_output = torch.zeros((1, 2)).float().to(device)
    for part in text_parts:
        if len(part) > 0:
            try:
                input = part.reshape(-1)[:512].reshape(1, -1)
                # print(input.shape)
                overall_output += model(input, labels=label)[1].float().to(device)
            except Exception as e:
                print(str(e))

#     overall_output /= len(text_parts)
    overall_output = F.softmax(overall_output[0], dim=-1)

    if label == 0:
        label = torch.tensor([1.0, 0.0]).float().to(device)
    elif label == 1:
        label = torch.tensor([0.0, 1.0]).float().to(device)

    # print(overall_output, label)

    loss = criterion(overall_output, label)
    total_loss += loss.item()

    optimizer.step()

    if idx % print_every == 0 and idx > 0:
        average_loss = total_loss / print_every
        print("{}/{}. Average loss: {}".format(idx, len(train_data), average_loss))
        all_losses.append(average_loss)
        total_loss = 0

300/25700. Average loss: 0.2631356908380985
600/25700. Average loss: 0.27407419759159285
900/25700. Average loss: 0.265136612476781
1200/25700. Average loss: 0.28613990544651946
1500/25700. Average loss: 0.26613716809699933
1800/25700. Average loss: 0.2824543362048765
2100/25700. Average loss: 0.2808614862306664
2400/25700. Average loss: 0.26764260774788756
2700/25700. Average loss: 0.2842236883038034
3000/25700. Average loss: 0.27545256599163015
3300/25700. Average loss: 0.25777502354234455
3600/25700. Average loss: 0.2715592804861565
3900/25700. Average loss: 0.26904851710423827
4200/25700. Average loss: 0.28042380169034004
4500/25700. Average loss: 0.2626433078447978
4800/25700. Average loss: 0.26775248725588124
5100/25700. Average loss: 0.27470680040307344
5400/25700. Average loss: 0.2691542482453709
5700/25700. Average loss: 0.27795392227359117
6000/25700. Average loss: 0.2630182924432059
6300/25700. Average loss: 0.27478826384991406
6600/25700. Average loss: 0.2676382445109387
69

In [21]:
total = len(test_data)
number_right = 0
model_softmax.eval()
# 서버의 메모리를 아끼기 위해
# 예측을 실행하기 전에 그라디언트를 정보를 저장하지 않는다.
# torch.no_grad()
with torch.no_grad():
    for idx, row in test_data.iterrows():
        text_parts = preprocess_text(str(row['text']))
        label = torch.tensor([row['is_fake']]).float().to(device)
        
        overall_output = torch.zeros((1,2)).to(device)
        try:
            for part in text_parts:
                if len(part) > 0:
                    overall_output += model(part.reshape(1, -1))[0]
        except RuntimeError:
            print("GPU out of memory, skipping this entry.")
            continue
            
        overall_output = F.softmax(overall_output[0], dim=-1)
            
        result = overall_output.max(0)[1].float().item()
 
        if result == label.item():
            number_right += 1
            
        if idx % print_every == 0 and idx > 0:
            print("{}/{}. Current accuracy: {}".format(idx, total, number_right / idx))
            
print("Accuracy on test data: {}".format(number_right / total))

300/8567. Current accuracy: 0.5133333333333333
600/8567. Current accuracy: 0.49833333333333335


KeyboardInterrupt: 