In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
import sys,os
import json
import random
import altair as alt

from scipy import stats
from pymongo import MongoClient
from tqdm import tqdm

In [18]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "darkgrid" theme for subsequent matplotlib/seaborn figures.
sns.set(style="darkgrid")

In [19]:
# Connect to the local MongoDB server and select the `news_cleaned`
# collection of the `TFE` database; every query below goes through it.
client = MongoClient(host='localhost', port=27017)
db = client['TFE']
collection = db['news_cleaned']

In [20]:
# Count the number of articles for each value of the `type` field.
# The result is a cursor: it can only be iterated once.
type_count_pipeline = [
    {'$group': {'_id': {'type': '$type'}, 'newsCount': {'$sum': 1}}},
]
res = collection.aggregate(type_count_pipeline)

In [21]:
# Drain the aggregation cursor once, then split the rows into two parallel
# lists: the news types and the number of articles of each type.
rows = list(res)
types = [row['_id']['type'] for row in rows]
count = [row['newsCount'] for row in rows]

In [22]:
# Two-column frame for plotting: x = news type, y = number of articles.
data = pd.DataFrame(dict(x=types, y=count))

In [23]:
# Bar chart of the number of articles per news type.
type_axis = alt.X('x', axis=alt.Axis(title="type"))
count_axis = alt.Y('y', axis=alt.Axis(title="Count"))
alt.Chart(data).mark_bar().encode(x=type_axis, y=count_axis)

<VegaLite 3 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [25]:
# Map each news type to the list of `avgSentenceLength` values of its articles.
# A comprehension is used so that the `data` DataFrame built earlier is not
# overwritten by a cursor and then a plain list.
avgSentenceLength = {
    t: [
        doc['avgSentenceLength']
        for doc in collection.find({'type': t}, {'avgSentenceLength': True, '_id': False})
    ]
    for t in types
}

In [31]:
# Empty frame holding the box-plot summary columns:
# lower whisker, quartiles 1-3, upper whisker and the news type.
box_columns = ['l', 'q1', 'q2', 'q3', 'h', 'type']
data = pd.DataFrame(columns=box_columns)

In [32]:
# Box-plot summary of the average sentence length for each news type:
# quartiles plus whiskers at 1.5 * IQR, clamped to the observed min / max.
p = []
for t in types:
    values = avgSentenceLength[t]
    # 'linear' is np.percentile's default; the explicit `interpolation=`
    # keyword is deprecated since NumPy 1.22, so it is simply omitted.
    q1, q2, q3 = np.percentile(values, [25, 50, 75])
    iqr = q3 - q1
    # Whiskers never extend past the data actually observed.
    low = max(q1 - iqr * 1.5, np.min(values))
    high = min(q3 + iqr * 1.5, np.max(values))
    p.append({'l': low, 'q1': q1, 'q2': q2, 'q3': q3, 'h': high, 'type': t})

In [33]:
# Build the summary frame directly from the list of per-type records.
# `DataFrame.append` was removed in pandas 2.0, and `data = data.append(p)`
# duplicated every row each time the cell was re-run; constructing the frame
# in one step is idempotent and keeps the same column order.
data = pd.DataFrame(p, columns=['l', 'q1', 'q2', 'q3', 'h', 'type'])

In [34]:
# Display the per-type box-plot summary table (one row per news type).
data

Unnamed: 0,l,q1,q2,q3,h,type
0,0.5,9.0,11.586667,14.717647,23.294118,unknown
1,1.166667,9.458333,12.242819,15.882353,25.518382,junksci
2,0.457143,8.0,11.181818,15.1,25.75,conspiracy
3,0.333333,8.39603,10.823529,13.777778,21.8504,fake
4,3.459302,9.883721,11.777778,14.166667,20.591085,unreliable
5,3.5,9.818182,11.818182,14.030303,20.348485,hate
6,0.986735,8.734694,11.0,13.9,21.647959,clickbait
7,3.125,10.25,12.5,15.0,22.125,rumor
8,2.375,8.75,10.627907,13.0,19.375,reliable
9,2.102941,10.066176,12.5,15.375,23.338235,bias


In [35]:
# Box plot of the average sentence length per news type, assembled from four
# layers that all read the summary frame `data`.
base = alt.Chart(data)

# Lower whisker: rule from the lower whisker value up to the first quartile.
# This layer also carries the title of the shared y axis.
lower_plot = base.mark_rule().encode(
    y=alt.Y('l', axis=alt.Axis(title="Average Sentence Length")),
    y2='q1',
    x='type'
)

# Box: bar spanning the first to the third quartile.
middle_plot = base.mark_bar(size=5.0).encode(
    y='q1',
    y2='q3',
    x='type'
)

# Upper whisker: rule from the third quartile up to the upper whisker value.
upper_plot = base.mark_rule().encode(
    y='h',
    y2='q3',
    x='type'
)

# Median: white tick drawn on top of the box.
middle_tick = base.mark_tick(
    color='white',
    size=5.0
).encode(
    y='q2',
    x='type',
)

chart = lower_plot + middle_plot + upper_plot + middle_tick

# Saving fails with FileNotFoundError when the target directory is missing
# (e.g. on a fresh checkout), so make sure it exists first.
os.makedirs('out', exist_ok=True)
chart.save('out/boxplot.svg', webdriver='firefox')

In [36]:
# Map each news type to the list of `numSentences` values of its articles.
# A comprehension is used so the `data` DataFrame is not overwritten by a
# cursor / list.
numSentences = {
    t: [
        doc['numSentences']
        for doc in collection.find({'type': t}, {'numSentences': True, '_id': False})
    ]
    for t in types
}

# Box-plot summary of the number of sentences for each news type:
# quartiles plus whiskers at 1.5 * IQR, clamped to the observed min / max.
p = []
for t in types:
    values = numSentences[t]
    # 'linear' is np.percentile's default; the explicit `interpolation=`
    # keyword is deprecated since NumPy 1.22, so it is simply omitted.
    q1, q2, q3 = np.percentile(values, [25, 50, 75])
    iqr = q3 - q1
    # Clamp against the sentence counts themselves. The previous version
    # clamped against `avgSentenceLength[t]`, a different metric, which
    # produced whiskers unrelated to this data.
    low = max(q1 - iqr * 1.5, np.min(values))
    high = min(q3 + iqr * 1.5, np.max(values))
    p.append({'l': low, 'q1': q1, 'q2': q2, 'q3': q3, 'h': high, 'type': t})

In [38]:
# Build the summary frame directly from the list of per-type records.
# `DataFrame.append` was removed in pandas 2.0; constructing the frame in one
# step works on every pandas version and keeps the same column order.
data = pd.DataFrame(p, columns=['l', 'q1', 'q2', 'q3', 'h', 'type'])

# Box plot of the number of sentences per news type, assembled from four
# layers that all read the summary frame `data`.
base = alt.Chart(data)

# Lower whisker: rule from the lower whisker value up to the first quartile.
# This layer also carries the title of the shared y axis.
lower_plot = base.mark_rule().encode(
    y=alt.Y('l', axis=alt.Axis(title="Number of sentences")),
    y2='q1',
    x='type'
)

# Box: bar spanning the first to the third quartile.
middle_plot = base.mark_bar(size=5.0).encode(
    y='q1',
    y2='q3',
    x='type'
)

# Upper whisker: rule from the third quartile up to the upper whisker value.
upper_plot = base.mark_rule().encode(
    y='h',
    y2='q3',
    x='type'
)

# Median: white tick drawn on top of the box.
middle_tick = base.mark_tick(
    color='white',
    size=5.0
).encode(
    y='q2',
    x='type',
)

chart = lower_plot + middle_plot + upper_plot + middle_tick

# Saving fails with FileNotFoundError when the target directory is missing
# (e.g. on a fresh checkout), so make sure it exists first.
os.makedirs('out', exist_ok=True)
chart.save('out/boxplot_full_numSentences.svg', webdriver='firefox')

In [None]:
# For every news type, draw a bar chart of how many articles each domain
# contributes and save it as out/<type>.svg.
# Loop-local names are used so that the notebook-level `res`, `count` and
# `data` variables are not overwritten by per-type intermediates.
os.makedirs('out', exist_ok=True)  # chart.save() does not create directories
for t in types:
    domain_rows = list(collection.aggregate([
        {'$match': {'type': t}},
        {'$group': {'_id': {'domain': '$domain'}, 'newsCount': {'$sum': 1}}},
    ]))
    domain_counts = pd.DataFrame({
        'x': [row['_id']['domain'] for row in domain_rows],
        'y': [row['newsCount'] for row in domain_rows],
    })
    chart = alt.Chart(domain_counts).mark_bar().encode(
        x=alt.X('x', axis=alt.Axis(title="domains")),
        y=alt.Y('y', axis=alt.Axis(title="Count")))
    chart.save('out/' + t + '.svg', webdriver='firefox')

In [None]:
# Handle on the `metadata` collection of the same database.
metadata = db['metadata']

In [None]:
# Store one metadata document per domain with its total article count.
# NOTE(review): the previous version indexed a `domains` list that is never
# defined in this notebook (NameError) and a `count` list that is not a
# per-domain total. The totals are therefore computed here directly, across
# all news types — confirm this matches the intended meaning of `total`.
domain_totals = collection.aggregate([
    {'$group': {'_id': '$domain', 'newsCount': {'$sum': 1}}},
    # Sort so that the integer `_id` given to each domain is reproducible.
    {'$sort': {'_id': 1}},
])
# Re-running against an already populated `metadata` collection raises
# DuplicateKeyError, because the same integer `_id` values are reused.
for i, row in enumerate(domain_totals):
    metadata.insert_one({'_id' : i, 'domain' : row['_id'], 'metadata' : {'count' : {'total' : row['newsCount']}}})

In [None]:
# Check whether some domains publish more than one kind of news.
domain_type_pipeline = [
    # Stage 1: one row per (domain, type) pair with its article count.
    {'$group': {'_id': {'domain': '$domain', 'type': '$type'},
                'newsCount': {'$sum': 1}}},
    # Stage 2: regroup by domain, collecting its (type, count) pairs in `meta`.
    {'$group': {'_id': '$_id.domain',
                'meta': {'$push': {'type': '$_id.type', 'count': '$newsCount'}}}},
]
for entry in collection.aggregate(domain_type_pipeline):
    # Domains with a single type are not interesting here.
    if len(entry['meta']) <= 1:
        continue
    print(entry)