From 554c8044050b4b181f920d8958725a680bf1e266 Mon Sep 17 00:00:00 2001 From: AD Date: Fri, 21 Dec 2018 12:27:19 +0100 Subject: [PATCH] Initial Load Data Class - currently, not working --- 00_Config/sourcemapping_lbsn.ini | 23 ++++++++++++ tagmaps/__main__.py | 61 ++++++++++++++++---------------- tagmaps/classes/load_data.py | 36 ++++++++++++------- tagmaps/config/config.py | 25 +++++++++++-- 4 files changed, 100 insertions(+), 45 deletions(-) create mode 100644 00_Config/sourcemapping_lbsn.ini diff --git a/00_Config/sourcemapping_lbsn.ini b/00_Config/sourcemapping_lbsn.ini new file mode 100644 index 0000000..cb066ff --- /dev/null +++ b/00_Config/sourcemapping_lbsn.ini @@ -0,0 +1,23 @@ +[Main] +name: lbsn +file_extension: csv +delimiter: , +array_separator: ; +quoting: QUOTE_MINIMAL +date_time_format: yyyy-MM-dd HH:mm:ss +[Columns] +post_guid_col: 1 +latitude_col: 2 +longitude_col: 3 +user_guid_col: 4 +post_create_date_col: 5 +post_publish_date_col: 6 +post_views_count_col: 8 +post_like_count_col: 9 +post_url_col: 10 +tags_col: 11 +emoji_col: 12 +post_title_col: 13 +post_body_col: 14 +post_geoaccuracy_col: 15 +place_guid_col: 19 diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py index 5e2b892..2c6f8a6 100644 --- a/tagmaps/__main__.py +++ b/tagmaps/__main__.py @@ -110,6 +110,7 @@ cleanedPhotoList = [] from tagmaps.classes.utils import Utils +from tagmaps.classes.load_data import LoadData def main(): """Main tag maps function for direct processing @@ -120,7 +121,7 @@ def main(): # initialize logger and config cfg, log = Utils.init_main() - + filelist = LoadData.read_local_files(cfg) # READ All JSON in Current Folder and join to list #partnum = 0 @@ -128,24 +129,24 @@ def main(): count_glob = 0 partcount = 0 #filenameprev = "" - if (cfg.d_source == "fromFlickr_CSV"): - filelist = glob('01_Input/*.txt') - GMTTimetransform = 0 - guid_columnNameID = 5 #guid - Sourcecode = 2 - quoting_opt = csv.QUOTE_NONE - elif (cfg.d_source == "fromInstagram_PGlbsnEmoji") or 
(cfg.d_source == "fromLBSN") or (cfg.d_source == "fromLBSN_old"): - filelist = glob('01_Input/*.csv') - guid_columnNameID = 1 #guid - quoting_opt = csv.QUOTE_MINIMAL - elif (cfg.d_source == "fromSensorData_InfWuerz"): - filelist = glob('01_Input/*.csv') - GMTTimetransform = 0 - guid_columnNameID = 1 #guid - Sourcecode = 11 - quoting_opt = csv.QUOTE_NONE - else: - sys.exit("Source not supported yet.") + #if (cfg.data_source == "fromFlickr_CSV"): + # filelist = glob('01_Input/*.txt') + # GMTTimetransform = 0 + # guid_columnNameID = 5 #guid + # Sourcecode = 2 + # quoting_opt = csv.QUOTE_NONE + #elif (cfg.data_source == "fromInstagram_PGlbsnEmoji") or (cfg.data_source == "fromLBSN") or (cfg.data_source == "fromLBSN_old"): + # filelist = glob('01_Input/*.csv') + # guid_columnNameID = 1 #guid + # quoting_opt = csv.QUOTE_MINIMAL + #elif (cfg.data_source == "fromSensorData_InfWuerz"): + # filelist = glob('01_Input/*.csv') + # GMTTimetransform = 0 + # guid_columnNameID = 1 #guid + # Sourcecode = 11 + # quoting_opt = csv.QUOTE_NONE + #else: + # sys.exit("Source not supported yet.") print('\n') log.info("########## STEP 1 of 6: Data Cleanup ##########") @@ -219,10 +220,10 @@ def is_number(s): # guid_list.clear() #duplicate detection only for last 500k items with open(file_name, newline='', encoding='utf8') as f: # On input, if newline is None, universal newlines mode is enabled. Lines in the input can end in '\n', '\r', or '\r\n', and these are translated into '\n' before being returned to the caller. 
partcount += 1 - if (cfg.d_source == "fromInstagram_LocMedia_CSV" or cfg.d_source == "fromLBSN" or cfg.d_source == "fromLBSN_old" or cfg.d_source == "fromInstagram_UserMedia_CSV" or cfg.d_source == "fromFlickr_CSV" or cfg.d_source == "fromInstagram_PGlbsnEmoji" or cfg.d_source == "fromSensorData_InfWuerz"): + if (cfg.data_source == "fromInstagram_LocMedia_CSV" or cfg.data_source == "fromLBSN" or cfg.data_source == "fromLBSN_old" or cfg.data_source == "fromInstagram_UserMedia_CSV" or cfg.data_source == "fromFlickr_CSV" or cfg.data_source == "fromInstagram_PGlbsnEmoji" or cfg.data_source == "fromSensorData_InfWuerz"): photolist = csv.reader(f, delimiter=',', quotechar='"', quoting=quoting_opt) #QUOTE_NONE is important because media saved from php/Flickr does not contain any " check; only ',' are replaced next(photolist, None) # skip headerline - elif (cfg.d_source == "fromInstagram_HashMedia_JSON"): + elif (cfg.data_source == "fromInstagram_HashMedia_JSON"): photolist = photolist + json.loads(f.read()) #PhotosPerDayLists = defaultdict(list) #keyCreatedHash = set() @@ -233,7 +234,7 @@ def is_number(s): continue else: photoIDHash.add(item[guid_columnNameID]) - if (cfg.d_source == "fromInstagram_LocMedia_CSV"): + if (cfg.data_source == "fromInstagram_LocMedia_CSV"): if len(item) < 15: #skip skippedCount += 1 @@ -290,7 +291,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = "" - elif cfg.d_source == "fromInstagram_UserMedia_CSV": + elif cfg.data_source == "fromInstagram_UserMedia_CSV": if len(item) < 15: #skip skippedCount += 1 @@ -342,7 +343,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = "" - elif cfg.d_source == "fromFlickr_CSV": + elif cfg.data_source == "fromFlickr_CSV": if len(item) < 12: #skip skippedCount += 1 @@ -387,7 +388,7 @@ def is_number(s): photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng photo_mTags = "" #not used currently but available photo_views = item[10] - 
elif (cfg.d_source == "fromInstagram_HashMedia_JSON"): + elif (cfg.data_source == "fromInstagram_HashMedia_JSON"): photo_source = Sourcecode #HashMediaCode if item.get('owner'): photo_userid = item["owner"]["id"] @@ -483,7 +484,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = "" - elif cfg.d_source == "fromInstagram_PGlbsnEmoji": + elif cfg.data_source == "fromInstagram_PGlbsnEmoji": if len(item) < 15: #skip skippedCount += 1 @@ -523,7 +524,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = 0 - elif cfg.d_source == "fromLBSN": + elif cfg.data_source == "fromLBSN": if len(item) < 15: #skip skippedCount += 1 @@ -609,7 +610,7 @@ def is_number(s): photo_views = int(item[8]) except TypeError: pass - elif cfg.d_source == "fromLBSN_old": + elif cfg.data_source == "fromLBSN_old": if len(item) < 15: #skip skippedCount += 1 @@ -697,7 +698,7 @@ def is_number(s): # photo_views = int(item[8]) # except TypeError: # pass - elif cfg.d_source == "fromSensorData_InfWuerz": + elif cfg.data_source == "fromSensorData_InfWuerz": if len(item) < 5: #skip skippedCount += 1 @@ -1402,7 +1403,7 @@ def delete(listbox): l.pack(padx=10, pady=10) l = tk.Label(canvas, text="Select all tags you wish to exclude from analysis \n and click on remove to proceed.", background="gray7",fg="gray80") l.pack(padx=10, pady=10) - #if cfg.d_source == "fromInstagram_PGlbsnEmoji": + #if cfg.data_source == "fromInstagram_PGlbsnEmoji": # listbox_font = ("twitter Color Emoji", 12, "bold") # #listbox_font = ("Symbola", 12, "bold") #else: diff --git a/tagmaps/classes/load_data.py b/tagmaps/classes/load_data.py index a075515..2127525 100644 --- a/tagmaps/classes/load_data.py +++ b/tagmaps/classes/load_data.py @@ -1,20 +1,22 @@ # -*- coding: utf-8 -*- +import sys import os import ntpath import csv +from pathlib import Path from _csv import QUOTE_MINIMAL from glob import glob from .utils import Utils class LoadData(): - """Main Class for ingesting data and building 
summary statistics - for tag maps clustering. + """Main Class for ingesting data and building summary statistics. - - will filter data, cleaned output can be stored - will process CSV data into dict/set structures - - generate statistics + - will filter data, cleaned output can be stored + - will generate statistics """ + def loop_input_records(records, transferlimit, import_mapper, config): """Loops input json or csv records, converts to ProtoBuf structure and adds to records_dict @@ -47,7 +49,7 @@ def loop_input_records(records, transferlimit, import_mapper, config): return processed_records, finished @staticmethod - def fetch_csv_data_from_file(loc_filelist, start_file_id=0): + def fetch_csv_data_from_file(source_config): """Read csv entries from file (either *.txt or *.csv). The actual CSV formatting is not setable in config yet. There are many specifics, e.g. @@ -64,12 +66,22 @@ def fetch_csv_data_from_file(loc_filelist, start_file_id=0): return None return records + @staticmethod + def read_local_files(config): + """Read local input files according to config parameters and return a list of file paths.""" + input_path = config.input_folder + filelist = list(input_path.glob(f'*.{config.source["Main"]["file_extension"]}')) + input_count = len(filelist) + if input_count == 0: + sys.exit("No input files found.") + else: + return filelist + + @staticmethod + def skip_empty_or_other(single_record): - """Detect Rate Limiting Notice or empty records - so they can be skipped. 
- """ - skip = False - if not single_record or (isinstance(single_record,dict) and single_record.get('limit')): - skip = True - return skip + """Return True if the record is empty and should be skipped.""" + if not single_record: + return True + return False + + diff --git a/tagmaps/config/config.py b/tagmaps/config/config.py index f64acdd..96f16d0 100644 --- a/tagmaps/config/config.py +++ b/tagmaps/config/config.py @@ -3,7 +3,8 @@ import argparse import os import sys - +from pathlib import Path +import configparser from shapely.geometry import Polygon from shapely.geometry import shape from shapely.geometry import Point @@ -12,7 +13,7 @@ class BaseConfig(): def __init__(self): ## Set Default Config options here ## or define options as input args - self.d_source = "fromLBSN" + self.data_source = "fromLBSN" self.cluster_tags = True self.cluster_photos = True self.epsg = True @@ -43,10 +44,15 @@ def __init__(self): self.shp_geom = None # initialization + self.pathname = Path.cwd() + self.config_folder = Path.cwd() / '00_Config' + self.input_folder = Path.cwd() / '01_Input' + self.output_folder = Path.cwd() / '02_Output' self.parse_args() self.load_filterlists() if self.shapefile_intersect: self.load_shapefile() + self.source = self.load_sourcemapping() def parse_args(self): """Parse init args and set default values @@ -74,7 +80,7 @@ def parse_args(self): args = parser.parse_args() if args.source: - self.d_source = args.source + self.data_source = args.source if args.clusterTags: self.cluster_tags = args.clusterTags if args.clusterPhotos: self.cluster_photos = args.clusterPhotos @@ -203,3 +209,16 @@ def load_custom_crs(self, override_crs): self.crs_proj = pyproj.Proj(init='epsg:{0}'.format(override_crs)) print("Custom CRS set: " + str(self.crs_proj.srs)) self.epsg_code = override_crs + + def load_sourcemapping(self): + """Load the source mapping config, if available. + + Returns None when no mapping file exists; reading the structure from the first CSV line is not yet implemented. 
+ """ + mapping_config_path = self.config_folder / f'sourcemapping_{self.data_source}.ini' + if not os.path.exists(mapping_config_path): + return + source_config = configparser.ConfigParser() + source_config.read(mapping_config_path) + return source_config +