In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from spacy.util import minibatch, compounding
import random

# Seed Python's RNG so the shuffle (and therefore the train/dev split) done by
# load_data is repeatable between runs of the notebook.
random.seed(0)

nlp = spacy.load('en_core_web_sm')
# Alternative dataset: pd.read_csv('../data/greek_fake_news.csv')
# Replace every newline, carriage return and tab in the frame with a space.
df = pd.read_csv('../data/train.csv').replace(
    to_replace='[\n\r\t]', value=' ', regex=True
)

In [2]:
def load_data(train_data, limit=0, split=0.8):
    """Shuffle labelled examples and partition them into train and dev sets.

    Args:
        train_data: sequence of ``(text, label)`` pairs; a truthy label
            marks the text as fake. The sequence itself is not modified.
        limit: if non-zero, keep only the last ``limit`` examples of the
            shuffled data; 0 keeps everything.
        split: fraction of the kept examples placed in the first (training)
            partition.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where the
        texts are tuples of strings and the cats are lists of
        ``{"REAL": bool, "FAKE": bool}`` dicts. Empty input yields empty
        partitions.
    """
    # Shuffle a copy: the caller's list keeps its order, and any sequence
    # (not only a mutable list) is accepted.
    examples = list(train_data)
    random.shuffle(examples)
    if limit:
        examples = examples[-limit:]
    if not examples:
        # zip(*[]) cannot be unpacked into two names, so short-circuit.
        return ((), []), ((), [])

    texts, labels = zip(*examples)
    cats = [{"REAL": not bool(y), "FAKE": bool(y)} for y in labels]
    cut = int(len(examples) * split)

    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

def evaluate(tokenizer, textcat, texts, cats):
    """Score the classifier on held-out texts for the "REAL" label.

    Each text is run through ``tokenizer`` and then ``textcat.pipe``; the
    predicted score of every label is compared with the gold value in
    ``cats`` using a 0.5 threshold. Labels missing from the gold dict and
    the "FAKE" label are ignored.

    Args:
        tokenizer: callable turning a text into a doc.
        textcat: object whose ``pipe(docs)`` yields docs with a ``cats``
            mapping of label -> score.
        texts: iterable of raw texts.
        cats: gold label dicts, indexable in the same order as ``texts``.

    Returns:
        Dict with keys ``textcat_p``, ``textcat_r`` and ``textcat_f``
        (precision, recall, F-score).
    """
    # fp / fn start at a tiny non-zero value so the ratios below never
    # divide by zero when there are no positive predictions or golds.
    true_pos, false_pos = 0.0, 1e-8
    true_neg, false_neg = 0.0, 1e-8

    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[index]
        for label, score in doc.cats.items():
            if label not in expected or label == "FAKE":
                continue
            truth = expected[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


In [3]:
# Summarise column dtypes and non-null counts to spot missing values before training.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    99 non-null     object
 1   text     99 non-null     object
 2   source   94 non-null     object
 3   url      99 non-null     object
 4   is_fake  99 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


In [4]:
# Add the text classifier exactly once. The guard keeps this cell re-runnable:
# adding a second component named "textcat" to the same pipeline raises.
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "simple_cnn"},
    )
    nlp.add_pipe(textcat, last=True)
nlp.pipe_names


['tagger', 'parser', 'ner', 'textcat']

In [5]:
# Register the two classes; the names match the keys of the cats dicts
# that load_data builds ("REAL" / "FAKE").
textcat.add_label("REAL")
textcat.add_label("FAKE")

1

In [6]:
# Pair each article body with its is_fake flag; zipping the two columns
# avoids a Python-level call per row.
df['tuples'] = list(zip(df['text'], df['is_fake']))
train = df['tuples'].tolist()

In [7]:
# Keep 90% of the shuffled examples for training and the rest as a dev set.
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(train, split=0.9)

# Pair every training text with its annotation dict ({'cats': {...}}),
# the form later handed to nlp.update.
train_data = [
    (text, {'cats': cats}) for text, cats in zip(train_texts, train_cats)
]


In [8]:
n_iter = 20
# Disable every other component so that only textcat is trained.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(n_iter):
        losses = {}
        # Reshuffle each epoch so the minibatches are not identical on
        # every pass over the data.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Score the dev set with the optimizer's averaged parameters and
        # print one row per epoch.
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
0.932	0.625	1.000	0.769
0.440	0.625	1.000	0.769
0.141	1.000	1.000	1.000
0.039	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.002	1.000	1.000	1.000
0.005	1.000	1.000	1.000
0.018	1.000	1.000	1.000
0.147	1.000	1.000	1.000
0.064	1.000	1.000	1.000
0.044	1.000	1.000	1.000
0.001	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.000	1.000	1.000	1.000


In [9]:
# Hand-picked sample article used as a quick sanity check of the trained classifier.
test = '''
The 12 members of the breakaway Super League are meeting to discuss the future of the exclusive competition, according to multiple reports on Tuesday.

TalkSPORT was the first to report news of the meeting. CNN has reached out to the Super League for comment but has not heard back.
Earlier on Tuesday, a number of clubs, including Chelsea and Manchester City, were reportedly preparing to leave the newly announced European Super League competition
According to multiple reports on Tuesday, including The Athletic and Sky News, Chelsea was preparing documents to formally withdraw from the exclusive group of 12 clubs.
Later on Tuesday, The Sun newspaper reported Manchester City was also pulling out.
CNN Sport has reached out to Chelsea and Manchester City about the reports but has not heard back.
On Sunday, six English clubs -- Arsenal, Chelsea, Liverpool, Manchester City, Manchester United, and Tottenham Hotspur -- alongside three teams from Italy -- AC Milan, Inter Milan and Juventus -- and three from Spain -- Atlético Madrid, Barcelona and Real Madrid -- had laid out plans to form the breakaway competition.
'''

# Run the full pipeline on the sample and display the per-label scores.
doc = nlp(test)
doc.cats

{'REAL': 0.9876824021339417, 'FAKE': 0.012317556887865067}

In [10]:
# Write the pipeline to disk while the optimizer's averaged parameters are
# swapped in, so the saved model carries those weights.
averaged_params = optimizer.averages
with nlp.use_params(averaged_params):
    nlp.to_disk('../model')
        