In [1]:
import ast
import os
import random

import numpy as np
import pandas as pd
import psycopg2

In [2]:
# df_rand = pd.read_csv(
#    "1mio-raw.csv", 
#    delimiter = ",",
#    header = 0, 
#    skiprows = lambda x: x > 0 and random.random() > 0.001
# )

In [3]:
# df_rand.to_csv('1krandFNC.csv', index = False)

# df_rand.shape[0]

In [4]:
# Splits a string into list
def string_splitter(string):
    #if type(string) != str: 
    lst = str(string).split(", ")
    filter_obj = filter(lambda x: x != "", lst)
    return list(filter_obj)

# Strip a string representation of list of strings
def string_stripper(string):
    lst = [i.strip() for i in string[1:-1].replace('\'',"").split(',')]
    filter_obj = filter(lambda x: x != "", lst)
    return list(filter_obj)

def string_filter(lst):
    filters = [lambda x: not x.isdigit(), lambda x: x != ""]
    filter_obj = filter(lambda x: all([f(x) for f in filters]), lst)
    return list(filter_obj)

In [5]:
df_fnc = pd.read_csv('10krandFNC.csv')

# Dropping columns (setting new ID column later)
df_fnc = df_fnc.drop(columns = ['Unnamed: 0', 'id', 'source'])

# Set new ID column
df_fnc = df_fnc.rename_axis('id').reset_index()
df_fnc.set_index('id')

df_fnc = df_fnc.astype({'domain':str, 'type':str, 'url':str, 'content':str, 'scraped_at':str, 'inserted_at':str,
        'updated_at':str, 'title':str, 'authors':str, 'keywords':str, 'meta_keywords':str,
        'meta_description':str, 'tags':str, 'summary':str}, copy = False)

# Convert blank fields into NaN
#df = df.replace(r'^\s*$', np.nan, regex=True)

# Replace 'nan' strings with NaN
df_fnc = df_fnc.replace("nan", np.nan)

# Convert all strings into lower case:
df_fnc = df_fnc.applymap(lambda s: s.lower() if type(s) == str else s)

# Clean types
type_set = ['fake', 'satire', 'bias', 'conspiracy', 'state', 'junksci', 'hate', 'clickbait', 'unreliable', 'political', 'reliable','rumor']
df_fnc['type'] = df_fnc['type'].apply(lambda x: np.nan if x not in type_set else x)

# Clean timestamps
for column in ['scraped_at','inserted_at','updated_at']:
    df_fnc[column] = df_fnc[column].apply(lambda x: pd.to_datetime(x, errors='coerce'))
    df_fnc[column] = df_fnc[column].replace({np.NaN: None})

# Clean auhtors - separate into list of strings
df_fnc['authors'] = df_fnc['authors'].apply(lambda x: string_splitter(x) if pd.notnull(x) else x)

# Clean metakeywords - strip a string representation of list of strings
df_fnc['meta_keywords'] = df_fnc['meta_keywords'].apply(lambda x: string_stripper(x) if pd.notnull(x) else x)

# Clean tags
df_fnc['tags'] = df_fnc['tags'].apply(lambda x: string_splitter(x) if pd.notnull(x) else x)
df_fnc['tags'] = df_fnc['tags'].apply(lambda x: string_filter(x) if isinstance(x, list) else x)

# Replace NaN into empty lists
for column in ['authors', 'keywords', 'meta_keywords', 'tags']:
    df_fnc[column] = df_fnc[column].fillna("").apply(list)

# Remove empty rows
df_fnc = df_fnc.dropna(subset = ['title', 'content', 'type'], how = 'all')

In [6]:
# Write the cleaned frame to 'test.csv' (without the index column) and render it in the notebook.
df_fnc.to_csv('test.csv', index=False)
display(df_fnc)

Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary
0,0,awm.com,unreliable,http://awm.com/woman-waves-hand-in-front-of-li...,most people’s pets are a member of the family ...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"woman waves hand in front of lizard’s tank, ca...",[wendy michaels],[],[],,[],
1,1,canadafreepress.com,conspiracy,http://canadafreepress.com/print_friendly/ugan...,subscribe to canada free press for free\n\ntha...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,uganda’s president: i love trump because ‘he t...,"[dan calabrese, because without america, there...",[],"[conservative news, conservative newspaper]","news, politics, editorials, commentary, canada...",[],
2,2,awarenessact.com,conspiracy,http://awarenessact.com/tag/chicagotribune/,have you ever seen something or someone you ju...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,chicagotribune – awareness act,[gerald sinclair],[],[],,[],
3,3,awarenessact.com,conspiracy,http://awarenessact.com/how-spirituality-could...,with a number of people now identifying as spi...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,how spirituality could be the key to coping wi...,[gerald sinclair],[],[],,"[psychiatry, psychology, crystals, mental heal...",
4,4,awarenessact.com,conspiracy,http://awarenessact.com/bomb-cyclone-hits-east...,a winter storm hitting the east coast could ea...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"‘bomb cyclone’ hits east coast, florida reache...",[gerald sinclair],[],[],,"[frozen, extreme weather, winter hurricane, re...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9796,9796,express.co.uk,rumor,https://www.express.co.uk/sport/football/42551...,alex buttner is close to joining besiktas\n\ne...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,man utd make room for leighton baines arrival ...,"[ben jefferson, besiktas vice-president ahmet ...",[],[],alexander butter is set to join turkish side b...,[],
9797,9797,wikileaks.org,unreliable,https://www.wikileaks.org/plusd/cables/1975lis...,tor\n\ntor is an encrypted anonymising network...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,cable: 1975lisbon04838,[],[],[],,[view tags],
9798,9798,express.co.uk,rumor,https://www.express.co.uk/sport/football/51219...,daily express: chelsea were left to reflect on...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"paper round-up: arteta's arsenal plea, balotel...",[charles perrin],[],[],mikel arteta has urged arsenal they need to dr...,"[kop hero, reflect on a missed opportunity, bu...",
9799,9799,express.co.uk,rumor,https://www.express.co.uk/sport/f1-autosport/3...,unfortunately the veteran racing driver is now...,2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,striling moss: 'women lack mental toughness fo...,"[julie carpenter, stirling moss]",[],[],when it comes to putting his foot down on the ...,[],


In [7]:
#df_fnc[['scraped_at','inserted_at','updated_at']].to_csv('test3.csv')

# df_fnc.iloc[11615]

In [8]:
# Generates the tables of our database

# Names of the FakeNewsCorpus tables: the article table first, then the tables that reference it.
fnc_tables = ["fnc_article", "authors", "tags", "keywords", "metakeywords"]

# One CREATE TABLE statement per entry of fnc_tables, in the same order.
create_fnctables = [
    # Article table: one row per article, keyed by id.
    # NOTE(review): title is limited to 256 characters -- confirm no title in the data is longer.
    """
    CREATE TABLE fnc_article (
        id INT, 
        domain VARCHAR, 
        type VARCHAR, 
        url VARCHAR, 
        content VARCHAR, 
        scraped_at TIMESTAMP, 
        inserted_at TIMESTAMP,
        updated_at TIMESTAMP, 
        title VARCHAR (256), 
        meta_description VARCHAR, 
        summary VARCHAR,

        PRIMARY KEY (id) 
    );
    """
    ,
    # Author table: composite key (a_id, authors); a_id references fnc_article(id) with cascading update/delete.
    """
    CREATE TABLE authors (
        a_id INT,
        authors VARCHAR,
        PRIMARY KEY (a_id, authors),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # Tag table: composite key (a_id, tag); a_id references fnc_article(id) with cascading update/delete.
    """
    CREATE TABLE tags (
        a_id INT, 
        tag VARCHAR, 
        PRIMARY KEY (a_id, tag),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # Keyword table: composite key (a_id, keyword); a_id references fnc_article(id) with cascading update/delete.
    """
    CREATE TABLE keywords (
        a_id INT, 
        keyword VARCHAR,
        PRIMARY KEY (a_id, keyword),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # Meta-keyword table: composite key (a_id, mkeyword); a_id references fnc_article(id) with cascading update/delete.
    """
    CREATE TABLE metakeywords (
        a_id INT, 
        mkeyword VARCHAR,
        PRIMARY KEY (a_id, mkeyword),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
]

In [9]:
# Names of the second table group: the wn_article table first, then the tables that reference it.
wn_tables = ["wn_article", "sources", "categories"]

# One CREATE TABLE statement per entry of wn_tables, in the same order.
create_wntables = [
    # Article table: one row per article, keyed by id.
    # NOTE(review): title is limited to 256 characters -- confirm no title in the data is longer.
    """
    CREATE TABLE wn_article (
        id INT, 
        content VARCHAR, 
        publish_date TIMESTAMP, 
        modified_date TIMESTAMP,
        title VARCHAR (256), 

        PRIMARY KEY (id) 
    );
    """
    ,
    # Source table: composite key (a_id, sources); a_id references wn_article(id) with cascading update/delete.
    """
    CREATE TABLE sources (
        a_id INT,
        sources VARCHAR,
        PRIMARY KEY (a_id, sources),
        FOREIGN KEY (a_id)
            REFERENCES wn_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # Category table: composite key (a_id, categories); a_id references wn_article(id) with cascading update/delete.
    """
    CREATE TABLE categories (
        a_id INT, 
        categories VARCHAR, 
        PRIMARY KEY (a_id, categories),
        FOREIGN KEY (a_id)
            REFERENCES wn_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
]

In [22]:
# Open the PostgreSQL connection used by the cells below.
# Settings are read from the standard libpq environment variables so that credentials
# do not have to live in the notebook; the literal fallbacks keep the previous
# local-development behaviour when the variables are unset.
# NOTE(review): the fallback password is still in source -- set PGPASSWORD and drop it when possible.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "fakenewsdb"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "1234"),
)

cursor = conn.cursor()

In [24]:
# Drop and re-create every table in one transaction.
# `with conn:` commits when the block finishes and rolls back if any statement raises,
# so a failed statement no longer leaves the connection stuck in an aborted transaction.
# (It does not close the connection.)
with conn:
    # Table names come from the fixed lists defined in this notebook, not from user input.
    for table in (fnc_tables + wn_tables):
        cursor.execute("DROP TABLE IF EXISTS " + table + " CASCADE;")

    for sql in (create_fnctables + create_wntables):
        cursor.execute(sql)

In [25]:
# Helper functions that load the data from the pandas DataFrame into the right tables. DataFrame columns that hold
# lists of strings are exploded into separate rows, each with its own key (a_id, string).

def projectrow2tuple(fields, row):
    """Look up each name in ``fields`` on ``row`` and return the values as a tuple.

    Args:
        fields: iterable of keys.
        row: any object supporting ``row[key]`` for those keys.

    Returns:
        tuple: the looked-up values, in the order of ``fields``.
    """
    return tuple(row[name] for name in fields)

def insertstring(table, n):
    """Build an INSERT statement for ``table`` with ``n`` ``%s`` placeholders.

    The statement ends with ``ON CONFLICT DO NOTHING``.

    Args:
        table: table name, inserted into the statement text as-is.
        n: number of values per row.

    Returns:
        str: the SQL text with ``%s`` placeholders.
    """
    placeholders = ", ".join(["%s"] * n)
    return "INSERT INTO {} VALUES ({}) ON CONFLICT DO NOTHING".format(table, placeholders)

def multi_insert(server, a_id, insert, xs):
    """Execute ``insert`` once per element of ``xs``, pairing each with ``a_id``.

    Args:
        server: object with an ``execute(sql, params)`` method.
        a_id: value used as the first parameter of every statement.
        insert: SQL text taking two parameters.
        xs: iterable of values used as the second parameter.
    """
    for params in ((a_id, value) for value in xs):
        server.execute(insert, params)

In [26]:
def insert_rows_fnc(server, csv_row):
    """Insert one cleaned article row into fnc_article and its child tables.

    The scalar article fields go into ``fnc_article``; every element of the
    list-valued fields becomes one ``(id, element)`` row in the matching
    child table (tags, authors, keywords, metakeywords, in that order).

    Missing scalar values (NaN / NaT / None) are converted to ``None`` before
    the insert so they are stored as SQL NULL rather than being passed to the
    driver as a float NaN.

    Args:
        server: object with an ``execute(sql, params)`` method (a DB cursor).
        csv_row: mapping-like row providing the article columns by name.
    """
    A_domain = ['id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'meta_description', 'summary']
    # (child table, row field holding the list of values for that table)
    list_targets = [
        ("tags", 'tags'),
        ("authors", 'authors'),
        ("keywords", 'keywords'),
        ("metakeywords", 'meta_keywords'),
    ]

    a_id = csv_row['id']

    # All A_domain fields are scalars, so pd.isna returns a plain bool for each of them.
    Atuple = tuple(None if pd.isna(value) else value
                   for value in projectrow2tuple(A_domain, csv_row))
    server.execute(insertstring("fnc_article", len(Atuple)), Atuple)

    for table, field in list_targets:
        multi_insert(server, a_id, insertstring(table, 2), csv_row[field])

# Insert every cleaned row in one transaction.
# `with conn:` commits when the loop finishes and rolls back if any insert raises,
# so a failed row neither leaves a half-loaded database nor an aborted transaction.
# (It does not close the connection.)
with conn:
    for _, row in df_fnc.iterrows():
        insert_rows_fnc(cursor, row)

In [16]:
# Close the database connection; run this only after the insert cell has committed.
conn.close()