In [1]:
import getpass
import os
import random

import numpy as np
import pandas as pd
import psycopg2

In [2]:
# Load a reproducible random sample of the raw corpus.
# A dedicated, seeded generator is re-created every time this cell runs, so
# "Restart Kernel -> Run All" (or simply re-running the cell) always selects
# the same rows, and the global `random` state is left untouched.
SAMPLE_FRACTION = 0.2  # probability with which each data row is kept
SAMPLE_SEED = 42
_sample_rng = random.Random(SAMPLE_SEED)

df_rand = pd.read_csv(
    '1mio-raw.csv',
    delimiter=',',
    header=0,
    # Row 0 is the header and is never skipped; every other row is skipped
    # when its random draw exceeds SAMPLE_FRACTION.
    skiprows=lambda i: i > 0 and _sample_rng.random() > SAMPLE_FRACTION,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Split a ", "-separated string into a list of its non-empty parts
def string_splitter(string):
    """Split ``string`` on the literal separator ``", "`` and drop empty pieces.

    The argument is passed through ``str()`` first, so non-string input is
    split on its string representation.

    Returns:
        list[str]: the non-empty pieces, in their original order.
    """
    return [piece for piece in str(string).split(", ") if piece != ""]

# Parse the string representation of a list of strings
def string_stripper(string):
    """Turn text such as ``"['a', 'b']"`` into ``['a', 'b']``.

    The first and last characters (the brackets) are discarded, every single
    quote is removed, the remainder is split on ``,`` and each piece is
    stripped of surrounding whitespace. Empty pieces are dropped.

    Returns:
        list[str]: the cleaned, non-empty items in their original order.
    """
    inner = string[1:-1]
    unquoted = inner.replace("'", "")

    items = []
    for raw in unquoted.split(","):
        item = raw.strip()
        if item != "":
            items.append(item)
    return items

def string_filter(lst):
    """Return the items of ``lst`` that are neither all-digit nor empty strings.

    The order of the surviving items is preserved.
    """
    return [item for item in lst if not item.isdigit() and item != ""]

In [6]:
# Clean the sampled FakeNewsCorpus frame. Every step below produces a new
# frame, so df_rand is left untouched and this cell can be re-run safely.
df_fnc = df_rand

# Dropping columns (setting new ID column later)
df_fnc = df_fnc.drop(columns = ['Unnamed: 0', 'id', 'source'])

# Set new ID column: the positional index becomes a regular 'id' column.
# It is deliberately kept as a column (not moved into the index).
df_fnc = df_fnc.rename_axis('id').reset_index()

df_fnc = df_fnc.astype({'domain':str, 'type':str, 'url':str, 'content':str, 'scraped_at':str, 'inserted_at':str,
        'updated_at':str, 'title':str, 'authors':str, 'keywords':str, 'meta_keywords':str,
        'meta_description':str, 'tags':str, 'summary':str}, copy = False)

# The str cast above turns missing values into the literal string 'nan';
# turn those back into real NaN.
df_fnc = df_fnc.replace("nan", np.nan)

# Fix types: anything outside the known label set becomes NaN
type_set = ['fake', 'satire', 'bias', 'conspiracy', 'state', 'junksci', 'hate', 'clickbait', 'unreliable', 'political', 'reliable','rumor']
df_fnc['type'] = df_fnc['type'].where(df_fnc['type'].isin(type_set))

# Fix timestamps: unparseable values are coerced to NaT, then missing values
# are replaced by None.
for column in ['scraped_at','inserted_at','updated_at']:
    df_fnc[column] = df_fnc[column].apply(lambda x: pd.to_datetime(x, errors='coerce'))
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    df_fnc[column] = df_fnc[column].replace({np.nan: None})

# Fix authors - separate into list of strings
df_fnc['authors'] = df_fnc['authors'].apply(lambda x: string_splitter(x) if pd.notnull(x) else x)

# Fix keywords - separate into list of strings. Without this step the raw
# string would later be converted with list() into single characters.
# NOTE(review): assumes keywords are ", "-separated like authors/tags - confirm.
df_fnc['keywords'] = df_fnc['keywords'].apply(lambda x: string_splitter(x) if pd.notnull(x) else x)

# Fix metakeywords - strip a string representation of list of strings
df_fnc['meta_keywords'] = df_fnc['meta_keywords'].apply(lambda x: string_stripper(x) if pd.notnull(x) else x)

# Fix tags - split, then drop purely numeric and empty entries
df_fnc['tags'] = df_fnc['tags'].apply(lambda x: string_filter(string_splitter(x)) if pd.notnull(x) else x)

# Replace NaN with empty lists; values that are already lists are kept as is
for column in ['authors', 'keywords', 'meta_keywords', 'tags']:
    df_fnc[column] = df_fnc[column].apply(lambda x: x if isinstance(x, list) else [])

# Remove empty rows (rows where title, content and type are all missing)
df_fnc = df_fnc.dropna(subset = ['title', 'content', 'type'], how = 'all')

In [4]:
# Persist the cleaned sample (without the pandas index) and preview only the
# first rows rather than rendering the whole frame.
df_fnc.to_csv('test.csv', index=False)
display(df_fnc.head())

NameError: name 'df_fnc' is not defined

In [7]:
# Load and clean the Wikinews articles.
df_wn = pd.read_csv('wikinews.csv')

# Dropping columns
df_wn = df_wn.drop(columns = ['Unnamed: 0'])

# Set new ID column: the positional index becomes a regular 'id' column.
# It is deliberately kept as a column (not moved into the index).
df_wn = df_wn.rename_axis('id').reset_index()

# Fix timestamps: unparseable values are coerced to NaT and the time of day
# is dropped. `.dt.normalize()` replaces the former `astype('datetime64[D]')`
# cast, because day resolution is not a supported dtype on pandas >= 2.0.
# NOTE(review): assumes the parsed column is timezone-naive - confirm.
for column in ['publish_date', 'modified_date']:
    parsed = df_wn[column].apply(lambda x: pd.to_datetime(x, errors='coerce'))
    df_wn[column] = parsed.dt.normalize()
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    df_wn[column] = df_wn[column].replace({np.nan: None})

# Fix sources - parse the string representation of a list of strings
df_wn['sources'] = df_wn['sources'].apply(lambda x: string_stripper(x) if pd.notnull(x) else x)

# Fix categories - parse the string representation of a list of strings
df_wn['categories'] = df_wn['categories'].apply(lambda x: string_stripper(x) if pd.notnull(x) else x)

# Replace NaN with empty lists
for column in ['sources', 'categories']:
    df_wn[column] = df_wn[column].fillna("").apply(list)

# Remove empty rows (rows where both title and content are missing)
df_wn = df_wn.dropna(subset = ['title', 'content'], how = 'all')

In [8]:
# SQL definitions for the FakeNewsCorpus tables of our database.
# This cell only builds the statements; it does not execute them.

# Names of the FakeNewsCorpus tables.
fnc_tables = ["fnc_article", "authors", "tags", "keywords", "metakeywords"]

# One CREATE TABLE statement per name in fnc_tables. The parent table
# fnc_article comes first; every other table references fnc_article (id)
# through its a_id foreign key.
create_fnctables = [
    # Article table, keyed by id.
    # NOTE(review): title is declared VARCHAR (256) while the other text
    # columns are unbounded - presumably longer titles fail on insert; confirm.
    """
    CREATE TABLE fnc_article (
        id INT, 
        domain VARCHAR, 
        type VARCHAR, 
        url VARCHAR, 
        content VARCHAR, 
        scraped_at TIMESTAMP, 
        inserted_at TIMESTAMP,
        updated_at TIMESTAMP, 
        title VARCHAR (256), 
        meta_description VARCHAR, 
        summary VARCHAR,

        PRIMARY KEY (id) 
    );
    """
    ,
    # (article id, author) pairs; the composite primary key makes each pair unique.
    """
    CREATE TABLE authors (
        a_id INT,
        authors VARCHAR,
        PRIMARY KEY (a_id, authors),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # (article id, tag) pairs.
    """
    CREATE TABLE tags (
        a_id INT, 
        tag VARCHAR, 
        PRIMARY KEY (a_id, tag),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # (article id, keyword) pairs.
    """
    CREATE TABLE keywords (
        a_id INT, 
        keyword VARCHAR,
        PRIMARY KEY (a_id, keyword),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # (article id, meta keyword) pairs.
    """
    CREATE TABLE metakeywords (
        a_id INT, 
        mkeyword VARCHAR,
        PRIMARY KEY (a_id, mkeyword),
        FOREIGN KEY (a_id)
            REFERENCES fnc_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
]

In [9]:
# Names of the Wikinews tables.
wn_tables = ["wn_article", "sources", "categories"]

# One CREATE TABLE statement per name in wn_tables. The parent table
# wn_article comes first; the other tables reference wn_article (id)
# through their a_id foreign key.
create_wntables = [
    # Article table, keyed by id.
    # NOTE(review): title is declared VARCHAR (256) while content is
    # unbounded - presumably longer titles fail on insert; confirm.
    """
    CREATE TABLE wn_article (
        id INT, 
        content VARCHAR, 
        publish_date TIMESTAMP, 
        modified_date TIMESTAMP,
        title VARCHAR (256), 

        PRIMARY KEY (id) 
    );
    """
    ,
    # (article id, source) pairs; the composite primary key makes each pair unique.
    """
    CREATE TABLE sources (
        a_id INT,
        sources VARCHAR,
        PRIMARY KEY (a_id, sources),
        FOREIGN KEY (a_id)
            REFERENCES wn_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
    ,
    # (article id, category) pairs.
    """
    CREATE TABLE categories (
        a_id INT, 
        categories VARCHAR, 
        PRIMARY KEY (a_id, categories),
        FOREIGN KEY (a_id)
            REFERENCES wn_article (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    );
    """
]

In [12]:
# Open the database connection without storing credentials in the notebook.
# Database name and user fall back to the previous defaults when the
# corresponding environment variable is unset; the password is read from
# PGPASSWORD or, failing that, prompted for interactively.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "fakenewsdb"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD") or getpass.getpass("PostgreSQL password: "),
)

cursor = conn.cursor()

In [13]:
# Rebuild the schema from scratch inside a single transaction. Using the
# connection as a context manager commits when the block succeeds and rolls
# back if any statement raises, so a failed run does not leave the session in
# an aborted transaction. The connection itself stays open.
with conn:
    for table in (fnc_tables + wn_tables):
        # Table names come from the constant lists defined in this notebook,
        # not from user input, so plain concatenation is acceptable here.
        cursor.execute("DROP TABLE IF EXISTS " + table + " CASCADE;")

    for sql in (create_fnctables + create_wntables):
        cursor.execute(sql)

In [None]:
# Helper functions that move the data from the pandas DataFrames into the right tables. DataFrame columns
# holding lists of strings are exploded into separate entries, each with its own key (a_id, string).

def projectrow2tuple(fields, row):
    """Return the values ``row[field]`` for each field in ``fields``, in order, as a tuple."""
    return tuple(row[field] for field in fields)

def insertstring(table, n):
    """Build an INSERT statement for ``table`` with ``n`` ``%s`` placeholders.

    The statement ends with ``ON CONFLICT DO NOTHING``. ``table`` is
    interpolated verbatim into the SQL text.
    """
    placeholders = ", ".join(["%s"] * n)
    return "INSERT INTO {} VALUES ({}) ON CONFLICT DO NOTHING".format(table, placeholders)

def multi_insert(server, a_id, insert, xs):
    """Execute ``insert`` once per element of ``xs`` with parameters ``(a_id, element)``.

    ``server`` only needs an ``execute(query, params)`` method.
    """
    parameter_pairs = ((a_id, value) for value in xs)
    for pair in parameter_pairs:
        server.execute(insert, pair)

In [None]:
def insert_rows_fnc(server, row):
    """Insert one FakeNewsCorpus row into fnc_article and its child tables.

    The scalar article columns go into ``fnc_article``; then every element of
    the row's ``tags``, ``authors``, ``keywords`` and ``meta_keywords`` values
    is inserted into the matching child table together with the row's ``id``.

    Args:
        server: object with an ``execute(query, params)`` method.
        row: mapping-like row indexable by column name.
    """
    article_columns = ['id', 'domain', 'type', 'url', 'content', 'scraped_at',
                       'inserted_at', 'updated_at', 'title', 'meta_description', 'summary']
    # (child table, row column) in the order the inserts are issued
    child_tables = [
        ("tags", 'tags'),
        ("authors", 'authors'),
        ("keywords", 'keywords'),
        ("metakeywords", 'meta_keywords'),
    ]

    # Read everything from the row before the first statement is executed.
    article_values = projectrow2tuple(article_columns, row)
    article_id = row['id']
    child_values = [(table, row[column]) for table, column in child_tables]

    server.execute(insertstring("fnc_article", len(article_values)), article_values)
    for table, values in child_values:
        multi_insert(server, article_id, insertstring(table, 2), values)

def insert_rows_wn(server, row):
    """Insert one Wikinews row into wn_article and its child tables.

    The scalar article columns go into ``wn_article``; then every element of
    the row's ``sources`` and ``categories`` values is inserted into the
    table of the same name together with the row's ``id``.

    Args:
        server: object with an ``execute(query, params)`` method.
        row: mapping-like row indexable by column name.
    """
    article_columns = ['id', 'content', 'publish_date', 'modified_date', 'title']
    # child table name doubles as the row column name
    child_tables = ["sources", "categories"]

    # Read everything from the row before the first statement is executed.
    article_values = projectrow2tuple(article_columns, row)
    article_id = row['id']
    child_values = [(table, row[table]) for table in child_tables]

    server.execute(insertstring("wn_article", len(article_values)), article_values)
    for table, values in child_values:
        multi_insert(server, article_id, insertstring(table, 2), values)

# Load both datasets inside one transaction. Using the connection as a
# context manager commits when every insert succeeded and rolls back if one
# raises, so a failed load neither leaves partial data behind nor keeps the
# session in an aborted transaction. The connection itself stays open.
with conn:
    for _, row in df_fnc.iterrows():
        insert_rows_fnc(cursor, row)

    for _, row in df_wn.iterrows():
        insert_rows_wn(cursor, row)

In [None]:
# Release the cursor explicitly, then close the connection.
cursor.close()
conn.close()