In [2]:
import pandas as pd
import spacy
import numpy as np

data = pd.read_csv('fnn_train.csv')

In [3]:
# Create an empty model
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

# Add labels to text classifier
textcat.add_label("real")
textcat.add_label("fake")

1

In [4]:
train_texts = data['paragraph_based_content'].values
train_labels = [{'cats': {'real': label == 'real',
                          'fake': label == 'fake'}} 
                for label in data['label_fnn']]

train_data = list(zip(train_texts, train_labels))

In [5]:
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 14.425886104354035}
{'textcat': 25.676457950253678}
{'textcat': 35.43259564392383}
{'textcat': 43.84034698335879}
{'textcat': 51.29761679176484}
{'textcat': 57.54454548172815}
{'textcat': 63.27015254911553}
{'textcat': 68.2732394357406}
{'textcat': 72.43486978944189}
{'textcat': 76.4809805360849}


In [6]:
def predict(nlp, texts):
    # Use the tokenizer to tokenize each input text example
    docs = [nlp.tokenizer(text) for text in texts]

    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores, _ = textcat.predict(docs)

    # From the scores, find the class with the highest score/probability
    predicted_class = scores.argmax(axis=1)

    return predicted_class

In [8]:
def evaluate(model, texts, labels):
    # Get predictions from textcat model
    predicted_class = predict(model, texts)

    # From labels, get the true class as a list of integers (POSITIVE -> 1, NEGATIVE -> 0)
    true_class = [0 if label == 'real' else 1 for label in labels]

    # A boolean or int array indicating correct predictions
    correct_predictions = predicted_class == true_class

    # The accuracy, number of correct predictions divided by all predictions
    accuracy = correct_predictions.mean()

    return accuracy

In [9]:
test_data = pd.read_csv('fnn_dev.csv')
test_texts = data.iloc[:, -2]
test_labels = data.iloc[:, -1]

accuracy = evaluate(nlp, test_texts, test_labels)
print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f}")

Loss: 76.481 	 Accuracy: 0.854


In [15]:
texts = test_texts.iloc[454:464]
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[1.0000000e+00 6.5678621e-12]
 [1.0000000e+00 1.9356144e-12]
 [9.3412977e-12 1.0000000e+00]
 [1.8524821e-06 9.9999809e-01]
 [1.0000000e+00 1.4809468e-14]
 [1.0000000e+00 3.6570100e-08]
 [2.1528252e-04 9.9978477e-01]
 [9.9693632e-01 3.0637255e-03]
 [1.0000000e+00 1.2133555e-08]
 [1.0000000e+00 6.4267618e-13]]


In [16]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])
print(test_labels.iloc[454:464])

['real', 'real', 'fake', 'fake', 'real', 'real', 'fake', 'real', 'real', 'real']
454    real
455    real
456    fake
457    fake
458    real
459    real
460    fake
461    real
462    real
463    real
Name: label_fnn, dtype: object


In [18]:
ranked_articles = pd.read_csv('article_ranking_data.csv')

texts = ranked_articles.iloc[:, 1]
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

[[0.58112293 0.41887707]
 [0.53663796 0.463362  ]]
0      Skip to:Main ContentSite NavigationSite foot...
1    Skip to main content\n  The Guardian - Back to...
Name: article_text, dtype: object


In [66]:
import datetime
import numpy
import math

class Article:
    def __init__(self, url, publishedAt, readability):
        self.url = url
        self.publishedAt = publishedAt

        if readability <= 55:
            self.readabilityScore = 15 - (55 - readability)
        elif readability > 55 and readability <= 75:
            self.readabilityScore = 15
        elif readability > 75:
            self.readabilityScore = 15 - (readability - 75)

        now = datetime.datetime.utcnow()
        year = int(publishedAt[0:4])
        month = int(publishedAt[5:7])
        day = int(publishedAt[8:10])
        hour = int(publishedAt[11:13])
        minute = int(publishedAt[14:16])
        second = int(publishedAt[17:19])
        publishDateTime = datetime.datetime(year, month, day, hour, minute, second) #2020-10-17T20:40:13Z
        self.timeSincePublished = (now - publishDateTime).total_seconds() / 3600
        self.realProb = 0
        self.rankScore = 0

    def setRealProb(self, realProbability):
        self.realProb = realProbability

    def equateRank(self):
        real = self.realProb
        readability = self.readabilityScore
        time = self.timeSincePublished
        self.rankScore = real * real * readability * 3 * ((numpy.arctan(-1.2 * (time/24 - 3))) + 3 * math.pi/2)

In [69]:
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

import datetime
import readability
import json
from newsapi import NewsApiClient
import requests
from urllib.request import urlopen, Request
from urllib import error
from bs4 import BeautifulSoup
from sortedcontainers import SortedDict
from inscriptis import get_text
import pandas as pd
#import Article

# Init
newsapi = NewsApiClient(api_key='fb11d84c123e491983028590e5bdd0e6')

# /v2/top-headlines
health_headlines = newsapi.get_top_headlines(country='au',
                                          page_size=70, page=1, category='health')
general_headlines = newsapi.get_top_headlines(country='au',
                                          page_size=70, page=1, category='general')


# /v2/everything
# all_articles = newsapi.get_everything(q='politics',
#                                       from_param=date.today(),
#                                       language='en', page_size=10, page=2)
#
# # /v2/sources
# sources = newsapi.get_sources()
#print(json.dumps(all_articles, indent=6))


sd = SortedDict()
newsList = []    #actual unordered return list
textList = []


print("# Health Articles: " + str(health_headlines['totalResults']))
print("# General Articles: " + str(general_headlines['totalResults']))

all_headlines = health_headlines['articles'] + general_headlines['articles']
i = 0

for article in all_headlines:
    i += 1
    print(str(i) + ". " + article['url'])
    url = Request(article['url'], headers={'User-Agent': 'Mozilla/5.0'})

    html = None
    try:
        html = urlopen(url)
    except OSError:
        continue

    if "nytimes" in article['url'] or "wsj" in article['url'] or "news.google.com" in article['url'] or "subscribe" in article['url']:
        continue

    soup = BeautifulSoup(html, "html.parser")
    decoding = soup.original_encoding

    decoded = urlopen(url).read().decode(decoding)
    text = get_text(decoded)

    if len(text) < 100:
        continue

    newText = text.replace('*', '').replace('+', '')
    if len(text) > 10000:
        newText = newText[4000:len(newText) - 4000]
    elif len(text) > 5000:
        newText = newText[2000:len(newText) - 2000]
    elif len(text) > 2000:
        newText = newText[800:len(newText) - 800]
    elif len(text) > 1000:
        newText = newText[400:len(newText) - 400]

    textList.append(newText)

    readingStats = readability.getmeasures(text, lang='en')
    fleschScore = readingStats['readability grades']['FleschReadingEase']

    sd[fleschScore] = article['url']

    newArticle = Article(article['url'], article['publishedAt'], fleschScore)

    if newArticle not in newsList:
        newsList.append(newArticle)
    else:
        continue


i = 1

#print("\nlength: " + str(len(sd.keys())))
#for key in sd:
    #print(str(i) + ". " + sd[key] + ": " + str(key))
    #i += 1


#print("result lengths:")
#print(len(all_headlines))
# print(len(newsList))
#for news in newsList:
    #print("URL: " + news.url + " |Date: " + news.publishedAt + " |Readability: " + str(news.readabilityScore))

# url_list = pd.Series(urlList, name='url')
# text_list = pd.Series(textList, name='article_text')
#
# df = pd.merge(url_list, text_list, left_index=True, right_index=True)
# df.to_csv('article_ranking_data.csv')

# print(textList[1])
# print(datetime.datetime.utcnow())
# print(newsList[0].timeSincePublished)
# print(newsList[1].timeSincePublished)

#url_list = pd.Series(urlList, name='url')
ranked_articles = pd.DataFrame(textList)

#ranked_articles = pd.merge(url_list, text_list, left_index=True, right_index=True)
#ranked_articles.to_csv('article_ranking_data_2.csv')

texts = ranked_articles.iloc[:, 0]
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)
#print(scores)

# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

for i in range(len(newsList)):
    newsList[i].setRealProb(scores[i][0])

for news in newsList:
    if news.realProb < .001:
        newsList.remove(news)

for news in newsList:
    news.equateRank()
    
newsList.sort(key=lambda newsArticle: newsArticle.rankScore, reverse=True)
j = 1
for news in newsList:
    print(str(j) + '. URL: ' + news.url + '\nScore: ' + str(news.rankScore))
    j = j + 1


# Health Articles: 70
# General Articles: 38
1. https://amp.theguardian.comworld/2020/oct/18/coronavirus-test-results-must-come-in-24-hours-says-sage-scientist
2. https://greekcitytimes.com/2020/10/18/80844/
3. https://amp.theguardian.comlifeandstyle/2020/oct/18/masked-monsters-and-socially-distanced-spooks-celebrating-halloween-at-home
4. https://news.google.com/__i/rss/rd/articles/CBMiK2h0dHBzOi8vd3d3LnlvdXR1YmUuY29tL3dhdGNoP3Y9dFUxNXFnM0dNZGfSAQA?oc=5
5. https://alkhaleejtoday.co/international/5124720/Covid-19-could-also-cause-sudden-and-permanent-hearing-loss.html
6. https://www.news.com.au/lifestyle/health/health-problems/mum-reveals-innocent-symptom-she-ignored-that-turned-out-to-be-lung-cancer/news-story/b31c822d7d4ac4a74c704611488f28f7
7. https://www.theaustralian.com.au/subscribe/news/1/
8. https://au.news.yahoo.com/nsw-womans-shock-diagnosis-after-losing-six-babies-024419790.html
9. https://alkhaleejtoday.co/international/5123468/The-flu-vaccine-suffers-from-corona-and-misinf

76. https://www.perthnow.com.au/entertainment/tv/actor-hugh-sheridan-opens-up-about-sexuality-says-he-has-been-with-men-and-women-ng-b881696492z
77. https://www.abc.net.au/news/2020-10-18/regional-victoria-coronavirus-restrictions-change-whats-allowed/12776468
78. https://www.news.com.au/world/coronavirus/closures/qld-election-2020-border-closure-decision-hated-by-20-million-aussies/news-story/c1c2aa884c29f3bbceeed5575fb0144c
79. https://www.techradar.com/news/why-the-iphone-11-is-a-better-buy-than-the-iphone-12
80. https://www.news.com.au/entertainment/celebrity-life/home-improvement-star-zachery-ty-bryan-arrested-for-allegedly-choking-his-girlfriend/news-story/846f9ef591c4f65dbe4a220653536200
81. https://amp.theguardian.comsport/2020/oct/18/bledisloe-cup-game-two-all-blacks-wallabies-match-report
82. https://www.theaustralian.com.au/subscribe/news/1/
83. https://www.theage.com.au/national/victoria/easing-of-restrictions-offers-mixed-fare-for-restaurants-20201018-p5667y.html
84. https