In [1]:
# libraries needed 
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# libraries we might not need
import csv

### Using the following code
For the time being, the whole notebook needs only one input file: the CSV file given by `filepath` in the next cell.

In [4]:
%%time
# Loads the csv-file into a pandas dataframe and draws a reproducible random
# sample of at most `s` rows from it.
# pandas using python 3.X uses utf-8 encoding

# usage: specify file location, sample size and seed (used by the sampler)
filepath = '1mioraw.csv'
#filepath = 'news_sample.csv' # <- overwrite for setup
s = 1000000                  # desired sample size (upper bound, see below)
seed = 1                     # seed used by the pseudorandom number generator

# Read the file first, using its first column as the row index.
df = pd.read_csv(filepath, index_col=[0])

# DataFrame.sample raises ValueError when n is larger than the number of rows
# (sampling without replacement), so cap the request at the rows available.
# This keeps the cell working when `filepath` points at a smaller file.
df = df.sample(n=min(s, len(df)), random_state=seed)

# visual output
#print(df.shape, '<- size of dataframe \n')
#df.head()

CPU times: user 15.1 ms, sys: 173 µs, total: 15.3 ms
Wall time: 14.4 ms


In [60]:
# Turn the id column into numbers; ids that cannot be parsed become NaN
# (errors='coerce'). Then keep only the first row seen for every id value.
df['id'] = pd.to_numeric(df['id'], downcast='integer', errors='coerce')
df = df.drop_duplicates(subset=['id'], keep='first')

In [61]:
# Discard rows whose id is missing, then use the id column as the row index.
has_id = df['id'].notna()
df = df[has_id].set_index('id')
df.shape

(999934, 15)

In [62]:
# Cleaning for values that are out of bounds for the database requirements.
# Length limits (in characters) applied to the text columns below.
MAX_AUTHORS_LEN = 255
MAX_TAGS_LEN = 1000
MAX_META_DESCRIPTION_LEN = 10000

df.index = df.index.astype(int)

# Missing values yield NaN from .str.len(); NaN > limit is False, so rows
# without a value are kept by each of the filters below.
longAuthors = df[df['authors'].str.len() > MAX_AUTHORS_LEN].index
df = df.drop(index=longAuthors)

# Drop EVERY row with an over-long tags string. De-duplicating on 'tags'
# before taking the index would only remove the first row per distinct string
# and leave the repeated over-long values in the frame.
longTags = df[df['tags'].str.len() > MAX_TAGS_LEN].index
df = df.drop(index=longTags)

longMetaD = df[df['meta_description'].str.len() > MAX_META_DESCRIPTION_LEN].index
df = df.drop(index=longMetaD)

# Articles without authors get an explicit placeholder name.
df['authors'] = df['authors'].fillna('NoAuthor')

In [None]:
%%time
# Replace e-mail addresses in the article text with the <EMAIL> placeholder.
# Local part: letters, digits and . _ + -
# Domain: one or more dot-separated labels (letters, digits, _ and -) ending
# in an alphabetic top-level label of at least two letters.
# Digits and dots are allowed so that addresses such as
# "john.doe99@mail.example.com" are replaced as a whole instead of being
# skipped or cut off in the middle of the domain.
regexEmail = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*\.[a-zA-Z]{2,}"
df.content = df.content.replace(to_replace=regexEmail, value='<EMAIL>', regex=True)


In [None]:
%%time
# Replace URL-like tokens in the article text with the <URL> placeholder.
# Pattern: optional scheme, optional "www.", a run without '@' or whitespace
# ending in ".<2-4 letters>", then everything up to the next whitespace.
regexURL = r"(?:https?:\/\/)?(?:www\.)?([^@\s]+\.[a-zA-Z]{2,4})[^\s]*"
df['content'] = df['content'].replace(regexURL, '<URL>', regex=True)

In [None]:
# Collapse every run of two or more whitespace characters, and every single
# newline, into one space.
regexDoubleSpace = r"(\s{2,})|\n"
df['content'] = df['content'].replace(regexDoubleSpace, ' ', regex=True)

In [None]:
# Replace date-like sequences in the article text with the <DATE> placeholder.
#
# A date is two or three components joined by single separator characters.
# One component is a year (1970-2099), a lowercase month name or abbreviation,
# or a day number (1-31, optional leading zero).
_date_token = (
    r"(?:(?:19[7-9]\d|20\d{2})"
    r"|(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?"
    r"|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|(?:nov|dec)(?:ember)?)"
    r"|(?:[12][0-9]|3[01]|0?[1-9]))"
)
_date_sep = r"[\/. \-,\n]"

# The separator after the last component is only looked at, not consumed.
# Consuming it would glue the placeholder to the next word
# ("2010-05-12 was" -> "<DATE>was"). Accepting end-of-text as well means a
# date that ends the article is replaced completely.
regexDate = (
    r"(?:" + _date_token + _date_sep + r"){1,2}"
    + _date_token
    + r"(?=" + _date_sep + r"|$)"
)
df.content = df.content.replace(to_replace=regexDate, value='<DATE>', regex=True)

In [None]:
%%time
# Replace numbers (optionally prefixed with '$') with the <NUM> placeholder.
#
# Group 1 is the whitespace character in front of the number. It is written
# back through the \1 back-reference so the placeholder stays separated from
# the previous word ("costs 500 dollars" -> "costs <NUM> dollars").
# The look-ahead demands at least one digit in the matched run, so free
# standing punctuation such as " - " or " ..." is left untouched.
regexNum = r"(\s)\$?(?=[\d,.-]*\d)[\d,.-]+"
df.content = df.content.replace(to_replace=regexNum, value=r'\1<NUM>', regex=True)

### Data tables: [name]-uniq / relational tables: [name]_in
The following cells create the CSV files that are loaded into the database.

In [68]:
# specify where to save all csv-files
path = 'Database_CSV_IN/'

# to_csv does not create missing directories, so make sure the target exists
# before any of the following cells writes into it.
os.makedirs(path, exist_ok=True)

# Temporary dataframe without columns that carries the article ids as its
# index (named 'id'); later cells attach per-article list columns to it.
out_df = pd.DataFrame({'id': df.index}).set_index('id')

In [69]:
### types_uniq - data-table ###
# Distinct article types; the position of a type in this array is its id.
type_array = df['type'].unique()
type_df = pd.DataFrame({
    'id': np.arange(type_array.size),
    'name': type_array,
})

# write the lookup table
type_df.to_csv(path + 'type_clean.csv', header=True, index=False)

# name -> id dictionary built from the lookup table
type_name_as_key_df = type_df.set_index('name')
type_dict = type_name_as_key_df['id'].to_dict()

# translate every article's type name into its id and store it as a new column
type_id = np.array(list(map(type_dict.__getitem__, df['type'].to_numpy())))
df['type_id'] = type_id

In [70]:
### tags_uniq - data-table ###

# Split every article's tag string into a list of tag names (a pd.Series of
# lists). Articles without tags are dropped first.
tags_series_of_lists = df.tags.dropna().str.split(', ') # -> ', ' not ','

# Keep the per-article lists on out_df for the tags_in relational table.
# The guard makes the cell re-runnable: insert() refuses an existing column.
if 'tags' not in out_df:
    out_df.insert(0, column='tags', value=tags_series_of_lists)

# Flatten to the unique tag names. Sorting gives every tag the same id on
# every run; the iteration order of a plain set of strings can change from
# one interpreter session to the next.
tags_list = sorted({tag for tag_list in tags_series_of_lists for tag in tag_list})

# create dataframe: id is the position in the sorted list
tags_df = pd.DataFrame({'id': np.arange(len(tags_list)), 'name': tags_list})

# write file and free memory
tags_df.to_csv(path + 'tags_clean.csv', index=False, header=True)
del tags_series_of_lists
del tags_list


In [71]:
### tags_in - relational-table ###

# One row per (article, tag): explode the per-article tag lists kept on out_df.
# No de-duplication happens at this point. drop_duplicates on the exploded
# Series compares the tag values only, so it would keep each tag for the first
# article that uses it and drop its link to every other article.
articles_id_tags_name_pairs_df = out_df.tags.dropna().explode()

# split tags_name and articles_id
articles_id_array = articles_id_tags_name_pairs_df.index.to_numpy()
tags_name_array = articles_id_tags_name_pairs_df.to_numpy()

# create dict with tag_name as key - [swap tags with tags_id]
tags_name_as_key_df = tags_df.set_index('name')
tags_dict = tags_name_as_key_df['id'].to_dict()

# replace tags with tag id
tags_id = np.array([tags_dict[key] for key in tags_name_array])

# Build the relation and remove repeated (tag, article) pairs only, e.g. when
# one article lists the same tag twice.
tags_in_df = (
    pd.DataFrame({'tags_id': tags_id, 'article_id': articles_id_array})
    .drop_duplicates(subset=['tags_id', 'article_id'], keep='first')
    .set_index('tags_id')
)

# write file (tags_id is written as the first column)
tags_in_df.to_csv(path + 'tags_in.csv', index=True, header=True)

In [72]:

### authors-uniq - data-table ###

# Split every article's author string into a list of author names (a
# pd.Series of lists).
authors_series_of_lists = df.authors.str.split(',') # -> ',' not ', '

# Keep the per-article lists on out_df for the authors_in relational table.
# The guard makes the cell re-runnable: insert() refuses an existing column.
if 'authors' not in out_df:
    out_df.insert(0, column='authors', value=authors_series_of_lists)

# Flatten to the unique author names. Sorting gives every author the same id
# on every run; the iteration order of a plain set of strings can change from
# one interpreter session to the next.
authors_list = sorted(
    {author for author_list in authors_series_of_lists for author in author_list}
)

# create dataframe: id is the position in the sorted list
authors_df = pd.DataFrame({'id': np.arange(len(authors_list)), 'name': authors_list})

# write file
authors_df.to_csv(path + 'authors_clean.csv', index=False, header=True)

In [73]:
### authors_in - relational-table ###

# get all pairs of article_id and author name (one row per pair, all articles)
articles_id_authors_name_pairs_df = out_df.authors.dropna().explode()

# split authors_name and articles_id
articles_id_array = articles_id_authors_name_pairs_df.index.to_numpy()
authors_name_array = articles_id_authors_name_pairs_df.to_numpy()

# create dict with author_name as key - [swap authors with authors_id]
authors_name_as_key_df = authors_df.set_index('name')
authors_dict = authors_name_as_key_df['id'].to_dict()

# replace authors with author id
authors_id = np.array([authors_dict[key] for key in authors_name_array])

# Build the relation and drop repeated (author, article) pairs, which occur
# when one article lists the same author more than once. This matches how the
# meta_keywords_in table is de-duplicated.
authors_in_df = (
    pd.DataFrame({'authors_id': authors_id, 'article_id': articles_id_array})
    .drop_duplicates(subset=['authors_id', 'article_id'], keep='first')
    .set_index('authors_id')
)

# write file (authors_id is written as the first column)
authors_in_df.to_csv(path + 'authors_in.csv', index=True, header=True)

In [74]:
### domains-uniq - data-table ###
# Distinct domains; the position of a domain in this array is its id.
domain_array = df['domain'].unique()
domain_ids = np.arange(domain_array.size)
domain_df = pd.DataFrame({'id': domain_ids, 'name': domain_array})

# write the lookup table
domain_df.to_csv(path + 'domain_name_clean.csv', header=True, index=False)

# name -> id dictionary built from the lookup table
domain_name_as_key_df = domain_df.set_index('name')
domain_dict = domain_name_as_key_df['id'].to_dict()

# translate every article's domain name into its id and store it as a new column
domain_id = np.array([domain_dict[name] for name in df['domain'].to_numpy()])
df['domain_id'] = domain_id

In [75]:
### meta_keywords_uniq - data-table ###

# Strip the list padding (quotes, square brackets and the spaces in front of
# them) from the raw string, then split it on ',' into a list of keywords.
regex = r" *['\"\[\]]+"
meta_keywords_series = df.meta_keywords.replace(to_replace=regex, value='', regex=True).str.split(',')

# Keep the per-article lists on out_df for the meta_keywords_in table.
# The guard makes the cell re-runnable: insert() refuses an existing column.
if 'meta_keywords' not in out_df:
    out_df.insert(0, column='meta_keywords', value=meta_keywords_series)

# Unique keywords in order of first appearance. Articles without keywords
# contribute NaN to the exploded series; dropping it keeps a nameless row out
# of the lookup table (the relational table never refers to it).
# Empty-string keywords are kept because the relational table does use them.
meta_keywords_set = meta_keywords_series.explode().dropna().unique()

# create dataframe: id is the position in the array of unique keywords
meta_keywords_df = pd.DataFrame({'id': np.arange(len(meta_keywords_set)), 'name': meta_keywords_set})

# write file
meta_keywords_df.to_csv(path + 'meta_keywords_clean.csv', index=False, header=True)

In [76]:
### meta_keywords_in - relational-table ###

# One row per (article, keyword): explode the per-article keyword lists.
articles_id_meta_keywords_name_pairs_df = out_df['meta_keywords'].dropna().explode()

# article ids come from the index, keyword names from the values
articles_id_array = articles_id_meta_keywords_name_pairs_df.index.to_numpy()
meta_keywords_name_array = articles_id_meta_keywords_name_pairs_df.to_numpy()

# name -> id dictionary built from the keyword lookup table
meta_keywords_name_as_key_df = meta_keywords_df.set_index('name')
meta_keywords_dict = meta_keywords_name_as_key_df['id'].to_dict()

# translate keyword names into keyword ids
meta_keywords_id = np.array([meta_keywords_dict[name] for name in meta_keywords_name_array])

# Assemble the relation with keyword id first, then drop repeated pairs.
meta_keywords_in_df = pd.DataFrame({
    'meta_keywords_id': meta_keywords_id,
    'article_id': articles_id_array,
})
meta_keywords_in_df = meta_keywords_in_df.drop_duplicates(
    subset=['meta_keywords_id', 'article_id'], keep='first'
)

# write file (both ids are ordinary columns, so no index is written)
meta_keywords_in_df.to_csv(path + 'meta_keywords_in.csv', header=True, index=False)

In [78]:
### article clean ###

# Columns exported for the article table. The frame's index is written too,
# as the first column of the file (to_csv default).
article_columns = [
    'domain_id', 'type_id',
    "url", "content", "title", "meta_description",
    "scraped_at", "updated_at", "inserted_at",
]
df[article_columns].to_csv(path + 'article_clean.csv', header=True)