In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
import sys,os
import json
import random
import altair as alt

from scipy import stats
from pymongo import MongoClient
from tqdm import tqdm

In [47]:
# Open a connection to the MongoDB server on localhost:27017 and select the
# database and collection that every query in this notebook reads from.
client = MongoClient(host='localhost', port=27017)
db = client['TFE']
collection = db['liar_liar']

In [48]:
# Count the documents of each type, keeping only the "fake" and "reliable" types.
# Each result row looks like {'_id': {'type': <type>}, 'newsCount': <count>}.
type_count_pipeline = [
    {"$match": {"type": {"$in": ["fake", "reliable"]}}},
    {"$group": {"_id": {"type": "$type"}, "newsCount": {"$sum": 1.0}}},
]

# aggregate() hands back a cursor that can only be iterated once. Materialise it
# into a list so that cells iterating over `res` can be re-run without silently
# getting an empty result the second time.
res = list(collection.aggregate(type_count_pipeline))

In [49]:
# Split the aggregation rows into two parallel lists. `types` is also used by
# the cells below to decide which document types to query.
rows = list(res)
types = [row['_id']['type'] for row in rows]
count = [row['newsCount'] for row in rows]

# Bar chart: one bar per type, bar height = number of documents of that type.
data = pd.DataFrame({'x': types, 'y': count})
chart1 = alt.Chart(data).mark_bar().encode(
    x=alt.X('x', axis=alt.Axis(title="type")),
    y=alt.Y('y', axis=alt.Axis(title="Count")),
)

# Saving fails if the target directory is missing, so create it first.
os.makedirs('out', exist_ok=True)
chart1.save('out/liar_news_count.png', webdriver='firefox')

In [50]:
# Gather the avgSentenceLength value of every document, keyed by document type.
avgSentenceLength = {}
for t in types:
    cursor = collection.find({'type': t}, {'avgSentenceLength': True, '_id': False})
    avgSentenceLength[t] = [doc['avgSentenceLength'] for doc in cursor]

# Box-plot statistics, one record per type:
#   q1 / q2 / q3 : 25th / 50th / 75th percentiles (linear interpolation, the
#                  numpy default)
#   l / h        : lower / upper whisker ends, placed 1.5 * IQR outside the box
#                  (IQR = q3 - q1). These are the fences themselves; they are
#                  not clamped to the most extreme observed value.
p = []
for t in types:
    percentile = np.percentile(avgSentenceLength[t], [25, 50, 75])
    q1, q2, q3 = percentile
    iqr = q3 - q1
    p.append({
        'l': q1 - 1.5 * iqr,
        'q1': q1,
        'q2': q2,
        'q3': q3,
        'h': q3 + 1.5 * iqr,
        'type': t,
    })

# Build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0).
data = pd.DataFrame(p, columns=['l', 'q1', 'q2', 'q3', 'h', 'type'])
base = alt.Chart(data)

# Lower whisker: rule from the lower whisker end up to the first quartile.
lower_plot = base.mark_rule().encode(
    y=alt.Y('l', axis=alt.Axis(title="Average Sentence Length")),
    y2='q1',
    x='type'
)

# Box: bar spanning the first to the third quartile.
middle_plot = base.mark_bar(size=5.0).encode(
    y='q1',
    y2='q3',
    x='type'
)

# Upper whisker: rule from the third quartile up to the upper whisker end.
upper_plot = base.mark_rule().encode(
    y='h',
    y2='q3',
    x='type'
)

# Median marker: white tick drawn on top of the box.
middle_tick = base.mark_tick(
    color='white',
    size=5.0
).encode(
    y='q2',
    x='type',
)

chart2 = lower_plot + middle_plot + upper_plot + middle_tick

# Saving fails if the target directory is missing, so create it first.
os.makedirs('out', exist_ok=True)
chart2.save('out/liar_boxplot_avgSentenceLength.png', webdriver='firefox')

In [51]:
# Gather the numSentences value of every document, keyed by document type.
numSentences = {}
for t in types:
    cursor = collection.find({'type': t}, {'numSentences': True, '_id': False})
    numSentences[t] = [doc['numSentences'] for doc in cursor]

# Box-plot statistics, one record per type:
#   q1 / q2 / q3 : 25th / 50th / 75th percentiles (linear interpolation, the
#                  numpy default)
#   l / h        : lower / upper whisker ends, placed 1.5 * IQR outside the box
#                  (IQR = q3 - q1). These are the fences themselves; they are
#                  not clamped to the most extreme observed value.
p = []
for t in types:
    percentile = np.percentile(numSentences[t], [25, 50, 75])
    q1, q2, q3 = percentile
    iqr = q3 - q1
    p.append({
        'l': q1 - 1.5 * iqr,
        'q1': q1,
        'q2': q2,
        'q3': q3,
        'h': q3 + 1.5 * iqr,
        'type': t,
    })

# Build the frame in one go (DataFrame.append no longer exists in pandas >= 2.0).
data = pd.DataFrame(p, columns=['l', 'q1', 'q2', 'q3', 'h', 'type'])

base = alt.Chart(data)

# Lower whisker: rule from the lower whisker end up to the first quartile.
lower_plot = base.mark_rule().encode(
    y=alt.Y('l', axis=alt.Axis(title="Number of sentences")),
    y2='q1',
    x='type'
)

# Box: bar spanning the first to the third quartile.
middle_plot = base.mark_bar(size=5.0).encode(
    y='q1',
    y2='q3',
    x='type'
)

# Upper whisker: rule from the third quartile up to the upper whisker end.
upper_plot = base.mark_rule().encode(
    y='h',
    y2='q3',
    x='type'
)

# Median marker: white tick drawn on top of the box.
middle_tick = base.mark_tick(
    color='white',
    size=5.0
).encode(
    y='q2',
    x='type',
)

chart3 = lower_plot + middle_plot + upper_plot + middle_tick

# Saving fails if the target directory is missing, so create it first.
os.makedirs('out', exist_ok=True)
chart3.save('out/liar_boxplot_numSentences.png', webdriver='firefox')