In [2]:
# Imports used throughout the notebook.
import argparse
import sqlite3
import pandas as pd
from tqdm import tqdm

In [3]:
# Location of the SQLite database file and the connection used by the queries below.
path = '/datasets/nela-gt/nela-eng-2019.db'
conn = sqlite3.connect(path)

In [4]:
def execute_query(conn, query, params=()):
    """Run a SQL statement and return every result row.

    Args:
        conn: An open ``sqlite3`` connection.
        query: SQL text. May contain ``?`` or ``:name`` placeholders.
        params: Optional sequence (for ``?``) or mapping (for ``:name``) of
            values bound to the placeholders. Defaults to no parameters, so
            existing two-argument calls behave as before.

    Returns:
        A list of row tuples as produced by ``fetchall()``; empty if the
        statement yields no rows.
    """
    cursor = conn.cursor()
    try:
        # Binding values through `params` avoids building SQL by string
        # concatenation, which breaks on values containing quotes.
        return cursor.execute(query, params).fetchall()
    finally:
        # Release the cursor even when the statement raises.
        cursor.close()

In [6]:
# Build `labels`: source name -> integer label read from the labels CSV.
# NOTE: this rebinds `path`, which previously held the database path.
path = '/datasets/nela-gt/labels.csv'
labels = dict()
with open(path) as fin:
    # Discard the header line of the label file.
    fin.readline()
    # For every remaining line, use the first column as the source name and
    # the column right after it (the aggregated label) as its value.
    for line in fin:
        fields = line.strip().split(",")
        # A blank or truncated row has no label column; skip it instead of
        # failing with IndexError.
        if len(fields) < 2:
            continue
        source = fields[0]
        label_text = fields[1].strip()
        if label_text == "":  # NODATA for this entry, skip it
            continue
        labels[source] = int(label_text)  # aggregated label (second column)


In [7]:
# Number of sources that have a non-empty aggregated label.
len(labels)

183

In [8]:
# Inspect the schema: list every table recorded in sqlite_master.
execute_query(conn, "SELECT * FROM sqlite_master WHERE type = 'table'")

[('table',
  'newsdata',
  'newsdata',
  2,
  'CREATE TABLE newsdata (id text primary key,date text,source text,title text,content text,author text,url text,published text,published_utc integer,collection_utc integer)')]

In [9]:
# One 1-tuple per distinct source in newsdata. Despite its name, the variable
# holds the rows themselves; the count is the length shown below.
total_no_sources = execute_query(conn, "SELECT distinct source FROM newsdata")
len(total_no_sources)

261

In [10]:
# Display the distinct-source rows (this prints the whole list).
total_no_sources

[('thenewyorktimes',),
 ('businessinsider',),
 ('skynewsus',),
 ('themichellemalkinblog',),
 ('cnbc',),
 ('channel4uk',),
 ('msnbc',),
 ('thedailyblog',),
 ('delawareliberal',),
 ('theindependent',),
 ('foxnews',),
 ('newsweek',),
 ('realnewsrightnow',),
 ('tass',),
 ('thinkprogress',),
 ('thedcclothesline',),
 ('sluggerotoole',),
 ('spiegel',),
 ('sottnet',),
 ('motherjones',),
 ('theweekuk',),
 ('bbcuk',),
 ('eveningstandard',),
 ('aljazeera',),
 ('prisonplanet',),
 ('adobochronicles',),
 ('newsbiscuit',),
 ('conservativetribune',),
 ('liveaction',),
 ('powerlineblog',),
 ('themoscowtimes',),
 ('veteranstoday',),
 ('hullabalooblog',),
 ('thedailymirror',),
 ('talkingpointsmemo',),
 ('activistpost',),
 ('renegadetribune',),
 ('washingtonmonthly',),
 ('theblaze',),
 ('collectiveevolution',),
 ('crooksandliars',),
 ('thehuffingtonpost',),
 ('prepareforchange',),
 ('mintpressnews',),
 ('fusion',),
 ('infowars',),
 ('frontpagemagazine',),
 ('ipolitics',),
 ('wingsoverscotland',),
 ('thede

In [11]:
# (source, article count) pairs: one row per source, counting its non-NULL
# `source` values in newsdata.
each_source_counts = execute_query(conn, "SELECT source, count(source) FROM newsdata group by source")


In [12]:
# Display the per-source article counts (this prints the whole list).
each_source_counts

[('21stcenturywire', 791),
 ('abcnews', 2861),
 ('activistpost', 2780),
 ('addictinginfo', 12),
 ('adobochronicles', 417),
 ('ageofautism', 646),
 ('airwars', 16),
 ('aljazeera', 11611),
 ('americablognews', 104),
 ('anonnews', 508),
 ('anonymousconservative', 386),
 ('anti-impreialist', 11),
 ('antiimpreialist', 23),
 ('baltimoregazette', 1),
 ('bbc', 14522),
 ('bbcuk', 12211),
 ('bearingarms', 1902),
 ('bigleaguepolitics', 131),
 ('bipartisanreport', 4175),
 ('birminghammail', 16864),
 ('bizstandardnews', 149),
 ('bonginoreport', 221),
 ('breaking911', 638),
 ('breitbart', 7431),
 ('businessinsider', 737),
 ('buzzfeed', 1741),
 ('camelotdaily', 17),
 ('cbsnews', 6758),
 ('channel4uk', 2591),
 ('charlotteobserver', 1793),
 ('chicagosuntimes', 2795),
 ('chicagotribune', 1887),
 ('citizenfreepress', 318),
 ('civictribune', 1),
 ('clashdaily', 1078),
 ('cnbc', 2962),
 ('cnn', 7766),
 ('cnsnews', 7204),
 ('collectiveevolution', 707),
 ('coloradopeakpolitics', 296),
 ('conservativehome', 2

In [13]:
# Number of labelled sources (a dict's length is its number of keys).
len(labels)

183

In [14]:
# Total article count per label, summed over every labelled source.
# Sources that have no entry in `labels` are printed and left out of the totals.
counts_dict = {}
for source_name, article_count in each_source_counts:
    if source_name not in labels:
        print(source_name)
        continue
    label = labels[source_name]
    counts_dict[label] = counts_dict.get(label, 0) + article_count
counts_dict

adobochronicles
anonnews
anonymousconservative
anti-impreialist
antiimpreialist
baltimoregazette
bizstandardnews
camelotdaily
channel4uk
civictribune
coloradopeakpolitics
conservativehome
conservativetribune
crikey
delawareliberal
deneenborelli
dickmorrisblog
disrn
environmentdepth
fortruss
freedombunker
freedomoutpost
ftwestminsterblog
glossynews
greenwichtime
hitandrun
hullabalooblog
humortimes
informnapalm
inquisitr
instapundit
intellectualconservative
jewworldorder
labourlist
liberaldemocratvoice
lisahaven
mail
nationalreport
newsbiscuit
newslo
newsnetscotland
newsyoucantuse
now8news
obamawatcher
osce
politicalite
politicscouk
politicsuk
prepareforchange
realnewsrightnow
renegadetribune
saraacater
sluggerotoole
thebeaverton
theborowitzreport
thechaser
thedailyblog
thedailyecho
thehuffingtonpostpoliticalsatire
themanchestereveningnews
themichellemalkinblog
theonion
thepoke
therussophileorg
theshovel
thespoof
thevaccinereaction
thevalleyreport
thewashingtonstandard
theweekuk
trumptim

{2: 126009, 0: 446251, 1: 331746}

In [15]:
# Decide how many articles to draw from each source.
# For labels 0 and 2 only (label 1 is not sampled), take `percent` % of the
# label's total article count as a budget and split it across that label's
# sources in proportion to their share of articles. Quotas are truncated to
# integers, and sources whose quota truncates to zero are dropped.
percent = 10
sample_sizes = []
for label in [0, 2]:
    label_total = counts_dict[label]
    label_budget = int(label_total * (percent / 100))
    # Fraction of the label's articles contributed by each of its sources.
    share_by_source = {}
    for name, n_articles in each_source_counts:
        if name in labels and labels[name] == label:
            share_by_source[name] = n_articles / label_total
    for name, share in share_by_source.items():
        quota = int(share * label_budget)
        if quota != 0:
            sample_sizes.append((name, quota))

In [16]:
# Fetch up to `quota` rows for every (source, quota) pair in `sample_sizes`.
# NOTE: LIMIT without ORDER BY returns whichever rows SQLite yields first,
# so this is not a random draw within a source.
sampled_data = []
for source_name, quota in tqdm(sample_sizes):
    # Bind the values as parameters instead of concatenating them into the
    # SQL text, so a source name containing a quote cannot break the query.
    rows = conn.execute(
        "select * from newsdata where source = ? limit ?",
        (source_name, quota),
    ).fetchall()
    # extend() appends in place; `a = a + b` recopies the whole list each time.
    sampled_data.extend(rows)

 19%|█▉        | 25/129 [00:23<01:08,  1.52it/s]

100%|██████████| 129/129 [01:23<00:00,  1.54it/s]


In [17]:
# Wrap the sampled rows in a DataFrame. The column names are listed in the
# same order as the columns of the newsdata table, since the rows come from
# `select *`.
sampled_df = pd.DataFrame(
    sampled_data,
    columns=[
        "id",
        "date",
        "source",
        "title",
        "content",
        "author",
        "url",
        "published",
        "published_utc",
        "collection_utc",
    ],
)

In [18]:
# Human-readable names for the integer label codes.
# NOTE(review): no later cell visible here uses this mapping; the
# "Reliability" column is filled with the integer codes. Confirm whether the
# names were meant to be applied.
label_mapping = {0:"Reliable", 1 : "Mixed", 2: "Unreliable"}

In [19]:
# Sanity check: number of sampled articles per label, obtained by mapping
# each row's source through `labels`.
sampled_df["source"].map(labels).value_counts()

0    44582
2    12575
Name: source, dtype: int64

In [20]:
# Add a "Reliability" column holding the integer label of each row's source.
sampled_df["Reliability"] = sampled_df["source"].map(labels)

In [21]:
# Preview 10 randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
sampled_df.sample(10)

In [22]:
# Save the labelled sample as CSV, omitting the DataFrame index. The path is
# relative to the notebook's working directory.
sampled_df.to_csv(path_or_buf='../datasets/nela10.csv', index=False)