In [None]:
import numpy as np
from collections import Counter
import json
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def json_load(filename):
    """Read one article JSON file and return its title, text and source.

    Args:
        filename: Path of a JSON file whose top-level object contains the
            keys ``'title'``, ``'text'`` and ``'source'``.

    Returns:
        list: ``[title, text, source]``, in that order.

    Raises:
        KeyError: If one of the three keys is missing from the file.
    """
    # Decode explicitly as UTF-8: without ``encoding`` open() falls back to
    # the platform locale, which can mis-decode non-ASCII article text.
    with open(filename, encoding="utf-8") as open_file:
        parsed_json = json.load(open_file)

    return [parsed_json[key] for key in ('title', 'text', 'source')]


def _clean(word):
    messy_symbols = r"~!@#$%^&*()_+1234567890-=|}{[]\":;'/.,<>?’`"
    for symbol in messy_symbols:
        word = word.replace(symbol, "")

    return word


def count_words(text):
    """Count lemma occurrences in *text*.

    The text is lower-cased and tokenised, each token is stripped with
    ``_clean`` and lemmatised, and lemmas of length 0 or 1 are discarded.

    Returns:
        dict: Maps each remaining lemma to the number of times it occurs.
    """
    raw_tokens = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()

    frequencies = Counter()
    for raw_token in raw_tokens:
        lemma = lemmatizer.lemmatize(_clean(raw_token))
        # Skip empty strings and single characters left over after cleaning.
        if len(lemma) > 1:
            frequencies[lemma] += 1

    return dict(frequencies)


def scrap_data(url):
    """Download the page at *url* and extract its domain, title and body.

    Args:
        url: Address of the article page to fetch.

    Returns:
        tuple: ``(domain, title, body)`` where ``title`` may be ``None`` if
        no heading is found inside the article container.

    Raises:
        ConnectionError: If the response is falsy or no article container
            can be located in the returned HTML.
    """
    # NOTE(review): `requests` and `BeautifulSoup` are not imported in the
    # visible part of this file -- confirm they are in scope before calling.
    source = requests.get(url, timeout=20)
    if not source:
        # Report the URL; formatting the response object only printed its repr.
        raise ConnectionError('Couldn\'t get a source from {}'.format(url))

    soup = BeautifulSoup(source.text, 'html.parser')
    article_container = _extract_article_container(soup)
    if not article_container:
        raise ConnectionError('Couldn\'t find any article at {}'.format(url))

    domain = _extract_domain(url)
    title = _extract_title(article_container)
    body = _extract_body(article_container)
    return domain, title, body


def _extract_domain(url):
    parsed = urlparse(url)
    domain = '{uri.netloc}'.format(uri=parsed)
    return domain


def _extract_article_container(soup):
    if soup.find('div', {'class': 'post'}):
        return soup.find('div', {'class': 'post'})

    if soup.find('div', {'class': 'article-container'}):
        return soup.find('div', {'class': 'article-container'})

    if soup.find('div', {'class': 'article-text'}):
        return soup.find('div', {'class': 'article-text'})

    if soup.find('article', {'class': 'a-main'}):
        return soup.find('article', {'class': 'a-main'})

    if soup.find('div', {'class': 'js-article-inner'}):
        return soup.find('div', {'class': 'js-article-inner'})

    if soup.article:
        return soup.article

    if soup.find('div', {'class': 'story-body'}):
        return soup.find('div', {'class': 'story-body'})

    if soup.find('div', {'id': 'content-start'}):
        return soup.find('div', {'id': 'content-start'})

    if soup.find('div', {'class': 'entry-content'}):
        return soup.find('div', {'class': 'entry-content'})

    if soup.find('div', {'class': 'td-post-content'}):
        return soup.find('div', {'class': 'td-post-content'})

    if soup.find('div', {'class': 'theiaPostSlider_slides'}):
        return soup.find('div', {'class': 'theiaPostSlider_slides'})

    return None


def _extract_title(article):
    if article.h1:
        return article.h1.getText().strip()

    if article.h2:
        return article.h2.getText().strip()

    if article.header:
        return article.header.getText().strip()

    return None


def _extract_body(article):
    return ' '.join([p.getText().strip() for p in(article.findAll('p') + article.findAll('span'))])

def _dataset_file_names(prefix, count):
    """Return the names ``<prefix>_<i>-Webpage.json`` for i = 1..count."""
    return ["{}_{}-Webpage.json".format(prefix, i) for i in range(1, count + 1)]


def constructDataSet():
    """Build the bag-of-words training matrix for the news articles.

    Reads 91 BuzzFeed and 120 PolitiFact article files for each of the
    "Real" and "Fake" classes from the current working directory, counts
    the lemmas of every article text and lays the counts out as one row
    per article.

    Returns:
        numpy.ndarray: Float array of shape
        ``(number of articles, vocabulary size + 3)``. Column ``j`` (for
        ``j < vocabulary size``) holds the count of the j-th vocabulary
        word, with words ordered by first appearance (real files first,
        then fake files). The last column is the label: 1 for real, 0 for
        fake. The two columns before it are reserved for url and title and
        are left at 0, since a numeric array cannot store those strings.
    """
    real_files = (_dataset_file_names("BuzzFeed_Real", 91)
                  + _dataset_file_names("PolitiFact_Real", 120))
    fake_files = (_dataset_file_names("BuzzFeed_Fake", 91)
                  + _dataset_file_names("PolitiFact_Fake", 120))

    # Parse and tokenise every article exactly once, keeping its label
    # (1 = real, 0 = fake) next to its word counts.
    documents = [(count_words(json_load(name)[1]), 1) for name in real_files]
    documents += [(count_words(json_load(name)[1]), 0) for name in fake_files]

    # Accumulate the vocabulary in place; keys keep first-appearance order.
    all_words = Counter()
    for word_freq_dict, _ in documents:
        all_words.update(word_freq_dict)

    word_index_dict = {word: index for index, word in enumerate(all_words)}

    # num of row = num of docs(news)
    # num of column = num of word features + 1 (url) + 1 (title) + 1 (label fake or real)
    num_of_word_features = len(word_index_dict)
    index_label = num_of_word_features + 2

    # Zero-initialise: words absent from an article must count as 0, and
    # np.empty would leave arbitrary values in those cells.
    train_data = np.zeros((len(documents), num_of_word_features + 3))

    for row, (word_freq_dict, label) in enumerate(documents):
        for word, freq in word_freq_dict.items():
            train_data[row, word_index_dict[word]] = freq
        # Set the label once per row so articles without any words get one too.
        train_data[row, index_label] = label

    return train_data

In [None]:
# Run the dataset construction; the returned array is not bound to a name here.
constructDataSet()