In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
import sys,os
import json
import random
import altair as alt

from scipy import stats
from pymongo import MongoClient
from tqdm import tqdm

In [2]:
%matplotlib inline
sns.set(style="darkgrid")

In [3]:
client = MongoClient('localhost', 27017)
db = client.TFE
collection = db.news_cleaned

In [4]:
res = collection.aggregate(
    [
        { 
            "$match" : {
                "type" : {
                    "$in" : [
                        "fake", 
                        "reliable"
                    ]
                },
                'domain' : {'$nin' : ['nytimes.com', 'beforeitsnews.com']}
            }
        }, 
        { 
            "$group" : {
                "_id" : {
                    "type" : "$type"
                }, 
                "newsCount" : {
                    "$sum" : 1.0
                }
            }
        }
    ]
)

In [5]:
types = []
count = []
for r in res:
    types.append(r['_id']['type'])
    count.append(r['newsCount'])

In [6]:
data = pd.DataFrame({'x' : types, 'y' : count})

In [7]:
chart1 = alt.Chart(data).mark_bar().encode(
    x = alt.X('x', axis=alt.Axis(title="type")),
    y = alt.Y('y', axis=alt.Axis(title="Count"))
)

chart1.save('out/downsampled_news_count.svg', webdriver='firefox')

In [8]:
avgSentenceLength = {}
for t in types:
    data = collection.find({'type' : t, 'domain' : {'$nin' : ['nytimes.com', 'beforeitsnews.com']}}, {'avgSentenceLength' : True, '_id' : False})
    data = [x['avgSentenceLength'] for x in data]
    avgSentenceLength[t] = data

In [9]:
data = pd.DataFrame(columns=['l', 'q1', 'q2', 'q3', 'h', 'type'])

In [11]:
p = []
for t in types:
    percentile = np.percentile(avgSentenceLength[t], [25, 50, 75], interpolation='linear')
    l = percentile[0] - (percentile[2] - percentile[0]) * 1.5
    h = percentile[2] + (percentile[2] - percentile[0]) * 1.5
    if h > np.max(avgSentenceLength[t]):
        h = np.max(avgSentenceLength[t])
    if l < np.min(avgSentenceLength[t]):
        l = np.min(avgSentenceLength[t])
    p.append({'l': l, 'q1' : percentile[0], 'q2' : percentile[1], 'q3' : percentile[2], 'h' : h,  'type' : t})

In [12]:
data = data.append(p)

In [14]:
base = alt.Chart(data)

lower_plot = base.mark_rule().encode(
    y=alt.Y('l', axis=alt.Axis(title="Average Sentence Length")),
    y2='q1',
    x='type'
)

middle_plot = base.mark_bar(size=5.0).encode(
    y='q1',
    y2='q3',
    x='type'
)
upper_plot = base.mark_rule().encode(
    y='h',
    y2='q3',
    x='type'
)

middle_tick = base.mark_tick(
    color='white',
    size=5.0
).encode(
    y='q2',
    x='type',
)

chart2 = lower_plot + middle_plot + upper_plot + middle_tick
chart2.save('out/downsampled_boxplot.svg', webdriver='firefox')

In [15]:
numSentences = {}
for t in types:
    data = collection.find({'type' : t}, {'numSentences' : True, '_id' : False})
    data = [x['numSentences'] for x in data]
    numSentences[t] = data
    
p = []
for t in types:
    percentile = np.percentile(numSentences[t], [25, 50, 75], interpolation='linear')
    l = percentile[0] - (percentile[2] - percentile[0]) * 1.5
    h = percentile[2] + (percentile[2] - percentile[0]) * 1.5
    if h > np.max(avgSentenceLength[t]):
        h = np.max(avgSentenceLength[t])
    if l < np.min(avgSentenceLength[t]):
        l = np.min(avgSentenceLength[t])
    p.append({'l': l, 'q1' : percentile[0], 'q2' : percentile[1], 'q3' : percentile[2], 'h' : h,  'type' : t})

In [16]:
data = pd.DataFrame(columns=['l', 'q1', 'q2', 'q3', 'h', 'type'])
data = data.append(p)

base = alt.Chart(data)

lower_plot = base.mark_rule().encode(
    y=alt.Y('l', axis=alt.Axis(title="Number of sentences")),
    y2='q1',
    x='type'
)

middle_plot = base.mark_bar(size=5.0).encode(
    y='q1',
    y2='q3',
    x='type'
)
upper_plot = base.mark_rule().encode(
    y='h',
    y2='q3',
    x='type'
)

middle_tick = base.mark_tick(
    color='white',
    size=5.0
).encode(
    y='q2',
    x='type',
)

chart3 = lower_plot + middle_plot + upper_plot + middle_tick
chart3.save('out/downsampled__numSentences_boxplot.svg', webdriver='firefox')

In [20]:
chart = chart1 | chart2 | chart3
chart.save('out/downsampled.svg', webdriver='firefox')
chart = alt.vconcat(chart1, chart2, chart3)
chart.save('out/downsampled_vconcat.svg', webdriver='firefox')

In [16]:
domain = []
count = []
for t in types:
    domain = []
    count = []
    for res in collection.aggregate([
        {'$match': {'type' : t}},
        {
            '$group': {
                    '_id' : {'domain' : '$domain'},
                    'newsCount' : {'$sum' : 1}
                }
        }
            ]):
        domain.append(res['_id']['domain'])
        count.append(res['newsCount'])
    data = pd.DataFrame({'x' : domain, 'y' : count})
    chart = alt.Chart(data).mark_bar().encode(
        x = alt.X('x', axis=alt.Axis(title="domains")),
        y = alt.Y('y', axis=alt.Axis(title="Count")))
    chart.save('out/'+t+'.svg')

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [None]:
metadata = db.metadata

In [None]:
for i in range(0, len(domains)):
    metadata.insert_one({'_id' : i, 'domain' : domains[i], 'metadata' : {'count' : {'total' : count[i]}}})

In [None]:
# Looking if some domains have multiple kind of news
for res in collection.aggregate([
    {
        '$group': {
                '_id' : {'domain' : '$domain', 'type' : '$type'},
                'newsCount' : {'$sum' : 1}
            }
    },
    {
        '$group' : 
        {
            '_id' : '$_id.domain',
            'meta' : 
            {
                '$push' : 
                {
                    'type' : '$_id.type',
                    'count' : '$newsCount'
                }
            }
        }
     }
        ]):
    if len(res['meta']) > 1:
        print(res)