In [27]:
from os import listdir
from os.path import join
from datetime import date, time, datetime

import calendar
import gzip
import json

In [28]:
# Utils
import numpy as np
import random

def random_number(numberFrom, numberTo, exlude):
    found = False
    while(found == False):
        rand = random.randint(numberFrom, numberTo)
        try:
            exlude.index(rand)
        except:
            found = True
    return rand
    
# Returns only 9
print("Should only return 9: ", "returns: ", random_number(0, 10, [0,1,2,3,4,5,6,7,8,10]))

def sections_to_analyze():
    return ["title_standardized", "description_standardized"]

print(sections_to_analyze())

Should only return 9:  returns:  9
['title_standardized', 'description_standardized']


In [29]:
# Global variables

# Leave empty if all domains should be processed
_domain = ""#["wsj.com"]
_folder = "release"
_dateFrom = "2020"

_filter = ['covid-19', 'covid']
_testThreshold = 0
_debug = False

In [30]:
# Get all domains if no domain is specified
def get_domains():
    if(_domain == ""):
        domains = []
        for name in listdir(_folder+"/."):
            domains.append(name)
        return domains
    else:
        return _domain

def convert_to_datetime(_date):
    if(len(_date) <= 3 or len(_date) == 5 or len(_date) == 7 or len(_date) > 8):
        raise Exception(f"Input date cannot include {len(_date)} digits - it must contain 4,6 or 8")
    if(len(_date) == 4):
        return date(year=int(_date[0:4]), month=1, day=1)
    if(len(_date) == 6):
        return date(year=int(_date[0:4]), month=int(_date[4:6]), day=1)
    if(len(_date) == 8):
        return date(year=int(_date[0:4]), month=int(_date[4:6]), day=int(_date[6:8]))
    
def compare_date_to_inputdate(_date, dateFrom, dateTo):
    if(dateFrom == "" and dateTo == ""):
        return True
    
    criteria = convert_to_datetime(_date)
    _dateFrom = convert_to_datetime("19000101")
    _dateTo = date.today()

    if(dateFrom != ""):
        _dateFrom = convert_to_datetime(dateFrom) 
    if(dateTo != ""):
        _dateTo = convert_to_datetime(dateTo)     
        
    if(_dateFrom <= criteria and criteria <= _dateTo):
        return True

    return False

""" Creates an array with all necessary information for eah article

Parameters
----------------
domains: list
    Example: ['france24.com', 'bbc.com']

Returns
----------------
list
    List of articles content with
    {id, domain, title, description}

"""
def get_articles(domains, dateFrom="", dateTo=""):
    if(dateFrom == ""):
        returnDateFrom = convert_to_datetime("19000101")
    else:
        returnDateFrom = convert_to_datetime(dateFrom)

    if(dateTo == ""):
        returnDateTo = date.today()
    else:
        returnDateTo = convert_to_datetime(dateTo)

    articles = []
    article = {}
    alreadyProcessed = []
    getAll = False
    if(dateFrom == "" and dateTo == ""):
        getAll = True

    for domain in domains:
        for f in listdir(join(_folder+"/"+domain, "per_day")):
            # Takes first 4 numbers from filename (e.g. filename: 20190104.gz)
            if(getAll or compare_date_to_inputdate(f[0:8], dateFrom, dateTo)):
                try:
                    d = json.load(gzip.open(join(_folder+"/"+domain, "per_day", f)))
                except:
                    continue
                for i in d:
                    # Prevent articles to be added more than once
                    if i not in alreadyProcessed:
                        alreadyProcessed.append(i)         
                        articles.append({
                            "id": i,
                            "domain": domain,
                            "title": d[i]["title"],
                            "description": d[i]["description"],
                            "date": f[0:8]
                        })
                    else:
                        continue
    return articles, returnDateFrom, returnDateTo, 

def get_articles_by_month(domains, year, month):
    start, dayTo = calendar.monthrange(int(year), int(month))

    if(dayTo < 10):
        dayTo = "0"+str(dayTo)
    else:
        dayTo = str(dayTo)

    # Adding in front of single digit
    if(len(month) == 1):
        month = "0"+month
                
    dateFrom = year+month+"01"
    dateTo = year+month+dayTo
    return get_articles(domains, dateFrom, dateTo)

#articles = get_articles_by_month(get_domains(), "2018", "1")
#get_articles_by_month(["wsj.com"], "2020", "10")

In [31]:
#NLTK encapsulation

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tree import Tree
from nltk.stem.porter import *

_stemmer = PorterStemmer()
_wordnetLemmatizer = WordNetLemmatizer()

def _sentence_tokenize(text):
     return sent_tokenize(text)

def _word_tokenize(text):
    return word_tokenize(text)

# {able, possible + -ity → ability, possibility}
def _stem(text):
    return _stemmer.stem(text)

# {playing, plays, played = play}, {am, are, is = be}
def _lemmatize(text):
    return _wordnetLemmatizer.lemmatize(text)

def _speech_tag(wordList):
    return nltk.pos_tag(wordList)

def _named_entities_chunk(taggedList):
    return nltk.ne_chunk(taggedList)

In [32]:
# Word standadization

def sentence_tokenize(text):
    return _sentence_tokenize(text)

# Expects a list from sentence tokenizing
def word_list_tokenize(sentenceList):
    tokenized = []
    for bulk in sentenceList:
        for w in _word_tokenize(bulk):
            tokenized.append(w)
    return tokenized

def stemming(textList):
    stemmedList = []
    for w in textList:
        stemmedList.append(_stem(w))
    return stemmedList

def lemmatization(stemmedList):
    lemmaList = []
    for w in stemmedList:
        lemmaList.append(_lemmatize(w))
    return lemmaList

def remove_stopwords(wordList):
    returnList = []
    stopWords = set(stopwords.words("english"))
    for w in wordList:
        if w not in stopWords:  
            returnList.append(w)
    return returnList

def remove_punctuations(wordList):
    punctuations = ".?:!,;‘-’|"
    returnList = []
    for w in wordList:
        if w not in punctuations:
            returnList.append(w)
    return returnList

def entity_extraction(text):
    extracted = []
    sentTokenized = sentence_tokenize(text)
    wordTokenized = word_list_tokenize(sentTokenized)
    
    # Tagging each word
    tagged = _speech_tag(wordTokenized)
    chunk = _named_entities_chunk(tagged)

    for c in chunk:
        if type(c) == Tree:
            newItems = " ".join([token for token, pos in c.leaves()])
            if newItems not in extracted:
                extracted.append(newItems)
    return extracted

def standardize(text):
    sentTokenized = sentence_tokenize(text)
    wordTokenized = word_list_tokenize(sentTokenized)
    stemmed = stemming(wordTokenized)
    lemma = lemmatization(stemmed)
    noStopwords = remove_stopwords(lemma)
    noPunctuations = remove_punctuations(noStopwords)
    return noPunctuations

""" Standardize the model

Parameters
----------------
classified: list
    Example: {  'id': 'fb5e74aaa23103c9e97af27fee1a6be3'
                'domain': 'aljazeera.com'
                'title': ...
                'description': ...
             }

Returns
----------------
same list as in the input with additional attributes:
    Example: {
                'title_standardized': ['pfizer', 'covid-19'..]
                'description_standardized': ['report', 'news'..]
             }

"""    
def standardizeList(articles):
    for article in articles:
        article["title_standardized"] = standardize(article["title"])
        article["title_entities"] = entity_extraction(article["title"])
        article["description_standardized"] = standardize(article["description"])
        article["description_entities"] = entity_extraction(article["description"])
    return articles

#articles = get_articles(get_domains())
#print(standardizeList(articles[0]))

In [33]:
# Classifier
from sklearn.model_selection import train_test_split
import random

# Set if we want maximum number of samples, used for debugging
_samplesThreshold = 0

def array_similarity(arr1, arr2):
    for a1 in arr1:
        for a2 in arr2:
            if(a1 == a2):
                return True
    return False

def create_testing_data(samples, articles):
    return [articles[i] for i in range(0, len(articles)) if i in samples]

def create_training_data(samples, articles):
    return [articles[i] for i in range(0, len(articles)) if i not in samples]

def create_testing_and_training_data(articles):
    # The threshold cannot be higher than the actual data
    if(_testThreshold == 0 or len(articles) < _testThreshold):
        testThreshold = len(articles)
    else:
        testThreshold = _testThreshold

    samples = random.sample(range(0,len(articles)), testThreshold)
    
    train, test = train_test_split(articles, test_size=0.2)
    
    return train, test
    
# Positive (articles that include '_filter' in title or description) will have index = 0
# Negative will have index = 1
def split_to_positive_and_negative(articles):
    positiveResults = []
    negativeResults = []
    processedIndexes = []
    
    for a in articles:
        if(array_similarity(a[sections_to_analyze()[0]], _filter) == True):
            positiveResults.append(a)
        elif(array_similarity(a[sections_to_analyze()[1]], _filter) == True):
            positiveResults.append(a)
        else:
            negativeResults.append(a)

        if(_samplesThreshold > 0):
            if(len(positiveResults) > _samplesThreshold):
                break;

    return positiveResults, negativeResults


""" Convert to object for classification

Parameters
----------------
classified: list
    Example: {  'id': 'fb5e74aaa23103c9e97af27fee1a6be3'
                'domain': 'aljazeera.com'
                'title': ...
                'description': ...
                'title_standardized': ['pfizer', 'covid-19'..]
                'description_standardized': ['report', 'news'..]
             }

Returns
----------------
classified: object
    Example: {  "testing":  {"positive": [{article1},{article2}]},
                            {"negative": [{article1},{article2}}},
                "training": {"positive": [{article1},{article2}]},
                            {"negative": [{article1},{article2}]}
             }
"""
def convert_to_classification_object(articles):
    testAndTrain = create_testing_and_training_data(articles)
    
    if(_debug):
        print("All test:", len(testAndTrain[0]))
        print("All training: ", len(testAndTrain[1]))
        print("==")

    test = split_to_positive_and_negative(testAndTrain[0])
    train = split_to_positive_and_negative(testAndTrain[1])

    if(_debug):
        print("Test, positive: ", len(test[0]))
        print("Test, negative: ",len(test[1]))
        print("Train, positive: ", len(train[0]))
        print("Train, negative: ", len(train[1]))

    testing = {"positive":{}, "negative":{}}
    testing["positive"] = test[0]
    testing["negative"] = test[1]

    training = {"positive":{}, "negative":{}}
    training["positive"] = train[0]
    training["negative"] = train[1]

    classified = {"testing": {}, "training": {}}
    classified["testing"] = testing
    classified["training"] = training
    
    return classified

def count_frequencies(data, _class, freq):
    for word in data:
        try:
            freq[_class][word] += 1
        except:
            freq[_class][word] = 1
    return freq

def count_total_frequencies(frequencies):
    count = 0
    for _class in frequencies:
        for w in frequencies[_class]:
            count += frequencies[_class][w]
    return count

# Input: Frequency object (described below)
# Output: {'positive': {'word':'likelihood', 'word2':'likelihood'}, 'negative': {...} }
def calculate_likelihood(frequencies):
    p_w = {"positive": {}, "negative": {}}
    for _class in frequencies:
        for w in frequencies[_class]:
            p_w[_class][w] = float(frequencies[_class][w]) / float(len(frequencies[_class]))
            #p_w[_class][w] = float(frequencies[_class][w]) / count_total_frequencies(frequencies)
    return p_w

""" Calculate frequencies from the classification

Parameters
----------------
Returns
----------------
frequencies: object
    Example: {  "positive": {"word", frequency},{"word", frequency}...,
                "negative": {"word", frequency},{"word", frequency}...,
             }
"""
def word_counter(trainingData):
    frequencies = {"positive": {}, "negative": {}}
    for _class in trainingData:
        for article in trainingData[_class]:
            for section in sections_to_analyze():
                count_frequencies(article[section], _class, frequencies)
    return frequencies

# Gets the prior probability of P(type1)
def calculate_prior_propabilities(frequencies, type1, type2):
    return float(len(frequencies[type1])) / float(len(frequencies[type1]) + len(frequencies[type2]))

def add_to_predictiveValues(predictiveValues, _type):
    try:
        predictiveValues[_type] += 1
    except:
        predictiveValues[_type] = 1

def find_predictive_parameter(testClass, articleWeight):
    pWeight = articleWeight["positive"]
    nWeight = articleWeight["negative"]

    if(pWeight >= nWeight):
        if(testClass == "positive"):
            return "TP"
        else:
            return "FP"

    if(pWeight < nWeight):
        if(testClass == "positive"):
            return "FN"
        if(testClass == "negative"):
            return "TN"

def calculate_weigh_by_article(article, trainingDataLikelihood):
    articleWeight = {}
    for _trainingClass in trainingDataLikelihood:
        for section in sections_to_analyze():
            weight = 1
            for word in article[section]:
                try:
                    weight *= trainingDataLikelihood[_trainingClass][word]
                except:
                    weight *= 0.0001
        articleWeight[_trainingClass] = weight

    return articleWeight #find_predictive_parameter(_class, combinedWeight)

def calculate_accuracy(TP, FP, FN, TN):
    numerator = TP + TN
    denominator = TP + TN + FP + FN
    return float(numerator) / float(denominator)

def calculate_positive_vs_negative(articlesWeight):
    countPositive = 0
    countNegative = 0
    for article in articlesWeight:
        if(article['positive'] >= article['negative']):
            countPositive += 1
        else:
            countNegative += 1
    
    return countPositive, countNegative

def display_classification_results(accuracy, positive, negative, displayInfo):
    dateFrom = displayInfo["dateFrom"]
    dateTo = displayInfo["dateTo"]
    domains = displayInfo["domains"]
    
    print("========")
    print(f"Finished processing articles:")
    print(f"DateFrom: {dateFrom}")
    print(f"DateTo: {dateTo}")
    print(f"Domains: {domains}")
    print(f"Accuracy: {accuracy}")
    print(f"Portion of positive: {float(positive)/(float(negative)+float(positive)) * 100} % ")
    print("========")

'''
testingData:{'positive': [  'id': '..', 'domain': '..', 'title': '..', 'title_standardized': '..',
                            'description': '...', 'description_standardized': '...']}
trainingDataLikelihood: {'positive': {'word': likelihood}, {'word2', likelihood}...,
                         'negative': {'word': likelihood}, {'word2', likelihood}...}
pPropability: {'positive': float_value, 'negative': float_value}
'''
def classification_result(testingData, trainingDataLikelihood, pPropability):
    predictiveValues = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
    articlesWeight = []
    
    for _class in testingData:
        for article in testingData[_class]:
            articleWeight = calculate_weigh_by_article(article,trainingDataLikelihood)

            predictiveParameter = find_predictive_parameter(_class, articleWeight)
            add_to_predictiveValues(predictiveValues, predictiveParameter)
            
            articlesWeight.append(articleWeight)

    positive, negative = calculate_positive_vs_negative(articlesWeight)
    accuracy = calculate_accuracy(predictiveValues["TP"],
                       predictiveValues["FP"],
                       predictiveValues["FN"],
                       predictiveValues["TN"])

    return accuracy, positive, negative


# Input: Standardized article object
def process_classification(articles, displayInfo):
    classification_object = convert_to_classification_object(articles)
    frequencies = word_counter(classification_object["training"])

    # Prior propabilities for positive and negative
    pPropability = {'positive': 0.0, 'negative': 0.0}
    pPropability['positive'] = calculate_prior_propabilities(frequencies, "positive", "negative")
    pPropability['negative'] = calculate_prior_propabilities(frequencies, "negative", "positive")
    
    accuracy, positive, negative = classification_result(classification_object["testing"],
                                                          calculate_likelihood(frequencies),
                                                          pPropability)
    
    display_classification_results(accuracy, positive, negative, displayInfo)
    

#articles = get_articles(get_domains())
#standardized = standardizeList(articles)
#process_classification(standardized)

In [34]:
def create_display_info(domains, dateFrom, dateTo):
    return {
        "domains": domains,
        "dateFrom": dateFrom,
        "dateTo": dateTo
    }
    

def standardize_and_classify(articles, displayInfo):
    standardized = standardizeList(articles)
    process_classification(standardized, displayInfo)

def text_mine_articles(domains, dateFrom="", dateTo=""):
    articles, dateFrom, dateTo = get_articles(domains,"20200101")

    if(articles is None or len(articles) == 0):
        print(f"No articles found from {dateFrom} to {dateTo}, for domains {domains}")
    else:
        standardize_and_classify(articles, create_display_info(domains, dateFrom, dateTo))

def text_mine_articles_by_month(domains, year, month):
    articles, dateFrom, dateTo = get_articles_by_month(get_domains(), year, month)

    if(articles is None or len(articles) == 0):
        print(f"No articles found for year: {year}, month: {month}, for domains {domains}")
    else:
        standardize_and_classify(articles, create_display_info(domains, dateFrom, dateTo))


# Mine all articles from 2020
#text_mine_articles(get_domains(), "20200101")

# Mine all articles for each month 2020
for a in list(range(1, 13)):
    text_mine_articles_by_month(get_domains(), "2020", str(a))

# Mine all artiles by an outlet
#text_mine_articles(["wsj.com"])

#for m in [{"20200101", "20200131"}]
#text_mine_articles("20190101")

No articles found for year: 2020, month: 1, for domains ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
No articles found for year: 2020, month: 2, for domains ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
No articles found for year: 2020, month: 3, for domains ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
No articles found for year: 2020, month: 4, for domains ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
No articles found for year: 2020, month: 5, for domains ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
Finished processing articles:
DateFrom: 2020-06-01
DateTo: 2020-06-30
Domains: ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
Accuracy: 0.7468354430379747
Portion of positive: 22.151898734177212 % 
Finished processing articles:
DateFrom: 2020-07-01
DateTo: 2020-07-31
Domains: ['wsj.com', 'aljazeera.com', 'bfmtv.com', 'france24.com']
Accuracy: 0.8386411889596603
Portion of positive: 12.738853503184714 % 
Finished pro

In [35]:
nltk.help.upenn_tagset("NNP")

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
