In [2]:
!pip install simpletransformers



In [3]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import numpy as np
import logging
import torch
from sklearn.model_selection import train_test_split

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
cuda_available = torch.cuda.is_available()

In [2]:
#from google.colab import files
#uploaded = files.upload()

In [10]:
# load the data
df_true = pd.read_csv("True.csv")
df_fake = pd.read_csv("Fake.csv")

In [11]:
# add a target class column to indicate whether the news is real or fake
df_true['isfake'] = 1
df_true.head()

Unnamed: 0,title,text,subject,date,isfake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [12]:
df_fake['isfake'] = 0
df_fake.head()

Unnamed: 0,title,text,subject,date,isfake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [13]:
# Concatenate Real and Fake News
df = pd.concat([df_true, df_fake]).reset_index(drop = True)
df

Unnamed: 0,title,text,subject,date,isfake
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [14]:
df.drop(columns = ['date'], inplace = True)

In [15]:
# combine title and text together
df['original'] = df['title'] + ' ' + df['text']
df.head()

Unnamed: 0,title,text,subject,isfake,original
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,1,"As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,1,U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,1,FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,1,Trump wants Postal Service to charge 'much mor...


In [None]:
columns = ['original','isfake']
X = df[columns]
X = X.dropna(subset = ['isfake'])
y = X['isfake']

In [20]:

#filter only 'isfake' related and then reset the index.
data = df[df['isfake']== 1]
data = df.reset_index(drop = True)

In [21]:
#this will be applied to the DF to pre-process text
def ProcessText3(text):
    """tokenizes, remove stopwords and punct and recreate text """
    tokens = nltk.word_tokenize(text.lower())
    remove_punct =[word for word in tokens if word.isalpha()]
    remove_stop_words =[word for word in remove_punct if not word in stopword_list]
    lemmatizer = WordNetLemmatizer()
    stemmed_text = []
    for word in remove_stop_words:
        stemmed_text.append(lemmatizer.lemmatize(word))
    filtered_text = (" ").join(stemmed_text)
    return filtered_text

In [22]:
 # Adding additional stopwords to stopword_list
meaningless_words = ['words', 'business wire', 'bwr english', 'copyright', 'businesswire.com',
                         'dow jones newswires','djdn english','dow jones institutional news', 
                         'all rights reserved', 'dow jones company inc', 'pr Newswire',
                         'prn english', 'the wall street journal',  'dow jones & company inc',
                         'j b4 english']    

stopword_list.extend(meaningless_words) 
print(stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [23]:
# Process the full article column
data['clean_article']= data['original'].apply(lambda text: ProcessText3(str(text)))

In [24]:
#check shape, confirm the process is working as expected.
data.shape


(44898, 6)

Preparing Train/Test Split 

In [25]:
#For most of the data, this is used to split test and train
# randomly sample 80% of th dataframe
Train = data.sample(frac = 0.80, random_state=2).reset_index(drop=True)
  
# Creating test dataframe with the remaining 20%
Test = data.drop(Train.index).reset_index(drop=True)

In [26]:
!pip install imbalanced-learn



In [27]:
#this is for if we want to stratify the data on the target column
from sklearn.model_selection import train_test_split
#set the column of interest - 
col=['clean_article','isfake']
data2 = data[col]
X_train, X_test, Y_train, Y_test = train_test_split(data2,
                                                    data2['isfake'],
                                                    test_size= 0.2,
                                                    stratify = data2['isfake'],
                                                    random_state = 1)

In [28]:
#this is the oversampling used - 
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_os, y_os = ros.fit_resample(X_train,Y_train)

In [29]:
#not used.
from imblearn.over_sampling import SMOTEN
sm = SMOTEN()
X_os, y_os = sm.fit_resample(X_train,Y_train)

tcmalloc: large alloc 2348318720 bytes == 0x55574689e000 @  0x7f813bacb001 0x7f812d23e1af 0x7f812d294c23 0x7f812d295a87 0x7f812d337823 0x55571eba011c 0x55571eb9fef0 0x55571ec1464d 0x55571ec0ea2e 0x55571eba188a 0x55571ec0fb4f 0x55571eba17aa 0x55571ec0fb4f 0x55571eba17aa 0x55571ec0fb4f 0x55571ec0ea2e 0x55571ec0e723 0x55571ec0cbd0 0x55571eb9fff9 0x55571eb9fef0 0x55571ec139a3 0x55571ec93ce5 0x55571ec10c03 0x55571ec93ce5 0x55571ec10c03 0x55571ec93ce5 0x55571eb9ff9d 0x55571ec91d4d 0x55571ec13ec8 0x55571eba17aa 0x55571ec0f8f6
tcmalloc: large alloc 2348318720 bytes == 0x5557d2826000 @  0x7f813bac91e7 0x7f812d23e0ce 0x7f812d294cf5 0x7f812d294f4f 0x7f812d337673 0x55571eba011c 0x55571eb9fef0 0x55571ec1464d 0x55571ec0ea2e 0x55571eba188a 0x55571ec10719 0x55571ec0ea2e 0x55571eba188a 0x55571ec0fb4f 0x55571eba17aa 0x55571ec0fb4f 0x55571eba17aa 0x55571ec0fb4f 0x55571ec0ea2e 0x55571ec0e723 0x55571ec0cbd0 0x55571eb9fff9 0x55571eb9fef0 0x55571ec139a3 0x55571ec93ce5 0x55571ec10c03 0x55571ec93ce5 0x55571ec1

In [30]:
X_os

Unnamed: 0,clean_article,isfake
0,sander appear going nominee washington reuters...,1
1,illinois budget uncertainty extends new fiscal...,1
2,slain sergeant widow say trump call cry even w...,1
3,trump scotland sturgeon spoke briefly friday s...,1
4,kkk leader medium said endorsed hillary proof ...,0
...,...,...
37565,expands central american refugee screening pro...,1
37566,expands central american refugee screening pro...,1
37567,expands central american refugee screening pro...,1
37568,expands central american refugee screening pro...,1


In [31]:
# Preparing train data - 
train_df = pd.DataFrame(Train[['clean_article','isfake']])
train_df.columns = ["text", "labels"]
# Preparing train data- 
train_df = pd.DataFrame(X_os[['clean_article','isfake']])
train_df.columns = ["text", "labels"]

In [32]:
train_df.head()

Unnamed: 0,text,labels
0,sander appear going nominee washington reuters...,1
1,illinois budget uncertainty extends new fiscal...,1
2,slain sergeant widow say trump call cry even w...,1
3,trump scotland sturgeon spoke briefly friday s...,1
4,kkk leader medium said endorsed hillary proof ...,0


In [33]:
# Preparing eval data- 
eval_df = pd.DataFrame(Test[['clean_article','isfake']])
eval_df.columns = ["text", "labels"]
# Preparing eval data
eval_df = pd.DataFrame(X_test[['clean_article','isfake']])
eval_df.columns = ["text", "labels"]

In [34]:
X_test

Unnamed: 0,clean_article,isfake
38096,gm taxpayer bailout billion forgiven made chin...,0
43940,dallas attack dialectic summer uncle sam jay d...,0
1351,turkey summons consulate worker questioning an...,1
11225,trump cruz tension may provide liveliest repub...,1
16812,south korea president say continue phasing nuc...,1
...,...,...
28568,watch president obama hilariously destroy trum...,0
27260,twitter relentlessly mock rnc chair pretending...,0
38067,see george stephanopoulos reaction hillary say...,0
31848,lockhimup former oversight chair jason chaffet...,0


In [40]:
eval_df.head()

Unnamed: 0,text,labels
38096,gm taxpayer bailout billion forgiven made chin...,0
43940,dallas attack dialectic summer uncle sam jay d...,0
1351,turkey summons consulate worker questioning an...,1
11225,trump cruz tension may provide liveliest repub...,1
16812,south korea president say continue phasing nuc...,1


In [41]:
# Create a ClassificationModel

model = ClassificationModel(
    "bert", "bert-base-cased", 
    use_cuda = cuda_available,
    args={'num_train_epochs': 4, 'learning_rate': 3e-5}
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [42]:
# Train the model
model.train_model(train_df)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/37570 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_128_2_2


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/4697 [00:00<?, ?it/s]



Running Epoch 1 of 4:   0%|          | 0/4697 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/4697 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/4697 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.


(18788, 0.020769243302093)

In [43]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df)


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/8980 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_128_2_2


Running Evaluation:   0%|          | 0/1123 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.9970990420500918, 'tp': 4275, 'tn': 4692, 'fp': 4, 'fn': 9, 'auroc': 0.9990223019929153, 'auprc': 0.998962283107248, 'eval_loss': 0.01212017825959733}


In [44]:
print(result)

{'mcc': 0.9970990420500918, 'tp': 4275, 'tn': 4692, 'fp': 4, 'fn': 9, 'auroc': 0.9990223019929153, 'auprc': 0.998962283107248, 'eval_loss': 0.01212017825959733}


In [45]:
tp =result['tp']
tn= result['tn']
fp= result['fp']
fn= result['fn']
print('Accuracy: %.3f' %((tp+tn)/(tp+tn+fp+fn)))
print('Precision: %.3f' %(tp/(tp+fp)))
print('Recall %.3f' %(tp/(tp+fn)))
print('F1-Score: %.3f' %(tp/(tp + .5*(fp+fn))))
print('Misclassification Rate: %.3f' %((fp+fn)/(tp+tn+fp+fn)))
print("False Positve: %.3f" %(fp/tn))
print("AUC-ROC: %.3f" %(result['auroc']))

Accuracy: 0.999
Precision: 0.999
Recall 0.998
F1-Score: 0.998
Misclassification Rate: 0.001
False Positve: 0.001
AUC-ROC: 0.999


In [46]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

In [47]:
predictions, raw_outputs = model.predict(eval_df['text'].values.tolist())

print(predictions)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/8980 [00:00<?, ?it/s]

  0%|          | 0/1123 [00:00<?, ?it/s]

[0 0 1 ... 0 0 0]


In [48]:
#this added the predicted value to the eval DF as well as creating a NumPy Array to 
#generate accuracy socre and Classification Report to evaluate the model further
eval_df['Predicted'] = predictions.tolist()
ethics = eval_df['labels'].to_numpy()

In [49]:
print('Accuracy Report: ', accuracy_score(ethics, predictions)) 
print('Classification Report: ', classification_report(ethics, predictions)) 

Accuracy Report:  0.9985523385300669
Classification Report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

