Skip to content

Commit

Permalink
Initial Load Data Class
Browse files Browse the repository at this point in the history
- currently not working
  • Loading branch information
Sieboldianus committed Dec 21, 2018
1 parent d2804aa commit 554c804
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 45 deletions.
23 changes: 23 additions & 0 deletions 00_Config/sourcemapping_lbsn.ini
@@ -0,0 +1,23 @@
[Main]
name: lbsn
file_extension: csv
delimiter: ,
array_separator: ;
quoting: QUOTE_MINIMAL
date_time_format: yyyy-MM-dd HH:mm:ss
[Columns]
post_guid_col: 1
latitude_col: 2
longitude_col: 3
user_guid_col: 4
post_create_date_col: 5
post_publish_date_col: 6
post_views_count_col: 8
post_like_count_col: 9
post_url_col: 10
tags_col: 11
emoji_col: 12
post_title_col: 13
post_body_col: 14
post_geoaccuracy_col: 15
place_guid_col: 19
61 changes: 31 additions & 30 deletions tagmaps/__main__.py
Expand Up @@ -110,6 +110,7 @@
cleanedPhotoList = []

from tagmaps.classes.utils import Utils
from tagmaps.classes.load_data import LoadData

def main():
"""Main tag maps function for direct processing
Expand All @@ -120,32 +121,32 @@ def main():

# initialize logger and config
cfg, log = Utils.init_main()

filelist = LoadData.read_local_files(cfg)

# READ All JSON in Current Folder and join to list
#partnum = 0
guid_list = set() #global list of guids
count_glob = 0
partcount = 0
#filenameprev = ""
if (cfg.d_source == "fromFlickr_CSV"):
filelist = glob('01_Input/*.txt')
GMTTimetransform = 0
guid_columnNameID = 5 #guid
Sourcecode = 2
quoting_opt = csv.QUOTE_NONE
elif (cfg.d_source == "fromInstagram_PGlbsnEmoji") or (cfg.d_source == "fromLBSN") or (cfg.d_source == "fromLBSN_old"):
filelist = glob('01_Input/*.csv')
guid_columnNameID = 1 #guid
quoting_opt = csv.QUOTE_MINIMAL
elif (cfg.d_source == "fromSensorData_InfWuerz"):
filelist = glob('01_Input/*.csv')
GMTTimetransform = 0
guid_columnNameID = 1 #guid
Sourcecode = 11
quoting_opt = csv.QUOTE_NONE
else:
sys.exit("Source not supported yet.")
#if (cfg.data_source == "fromFlickr_CSV"):
# filelist = glob('01_Input/*.txt')
# GMTTimetransform = 0
# guid_columnNameID = 5 #guid
# Sourcecode = 2
# quoting_opt = csv.QUOTE_NONE
#elif (cfg.data_source == "fromInstagram_PGlbsnEmoji") or (cfg.data_source == "fromLBSN") or (cfg.data_source == "fromLBSN_old"):
# filelist = glob('01_Input/*.csv')
# guid_columnNameID = 1 #guid
# quoting_opt = csv.QUOTE_MINIMAL
#elif (cfg.data_source == "fromSensorData_InfWuerz"):
# filelist = glob('01_Input/*.csv')
# GMTTimetransform = 0
# guid_columnNameID = 1 #guid
# Sourcecode = 11
# quoting_opt = csv.QUOTE_NONE
#else:
# sys.exit("Source not supported yet.")

print('\n')
log.info("########## STEP 1 of 6: Data Cleanup ##########")
Expand Down Expand Up @@ -219,10 +220,10 @@ def is_number(s):
# guid_list.clear() #duplicate detection only for last 500k items
with open(file_name, newline='', encoding='utf8') as f: # On input, if newline is None, universal newlines mode is enabled. Lines in the input can end in '\n', '\r', or '\r\n', and these are translated into '\n' before being returned to the caller.
partcount += 1
if (cfg.d_source == "fromInstagram_LocMedia_CSV" or cfg.d_source == "fromLBSN" or cfg.d_source == "fromLBSN_old" or cfg.d_source == "fromInstagram_UserMedia_CSV" or cfg.d_source == "fromFlickr_CSV" or cfg.d_source == "fromInstagram_PGlbsnEmoji" or cfg.d_source == "fromSensorData_InfWuerz"):
if (cfg.data_source == "fromInstagram_LocMedia_CSV" or cfg.data_source == "fromLBSN" or cfg.data_source == "fromLBSN_old" or cfg.data_source == "fromInstagram_UserMedia_CSV" or cfg.data_source == "fromFlickr_CSV" or cfg.data_source == "fromInstagram_PGlbsnEmoji" or cfg.data_source == "fromSensorData_InfWuerz"):
photolist = csv.reader(f, delimiter=',', quotechar='"', quoting=quoting_opt) #QUOTE_NONE is important because media saved from php/Flickr does not contain any " check; only ',' are replaced
next(photolist, None) # skip headerline
elif (cfg.d_source == "fromInstagram_HashMedia_JSON"):
elif (cfg.data_source == "fromInstagram_HashMedia_JSON"):
photolist = photolist + json.loads(f.read())
#PhotosPerDayLists = defaultdict(list)
#keyCreatedHash = set()
Expand All @@ -233,7 +234,7 @@ def is_number(s):
continue
else:
photoIDHash.add(item[guid_columnNameID])
if (cfg.d_source == "fromInstagram_LocMedia_CSV"):
if (cfg.data_source == "fromInstagram_LocMedia_CSV"):
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -290,7 +291,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = ""
elif cfg.d_source == "fromInstagram_UserMedia_CSV":
elif cfg.data_source == "fromInstagram_UserMedia_CSV":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -342,7 +343,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = ""
elif cfg.d_source == "fromFlickr_CSV":
elif cfg.data_source == "fromFlickr_CSV":
if len(item) < 12:
#skip
skippedCount += 1
Expand Down Expand Up @@ -387,7 +388,7 @@ def is_number(s):
photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng
photo_mTags = "" #not used currently but available
photo_views = item[10]
elif (cfg.d_source == "fromInstagram_HashMedia_JSON"):
elif (cfg.data_source == "fromInstagram_HashMedia_JSON"):
photo_source = Sourcecode #HashMediaCode
if item.get('owner'):
photo_userid = item["owner"]["id"]
Expand Down Expand Up @@ -483,7 +484,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = ""
elif cfg.d_source == "fromInstagram_PGlbsnEmoji":
elif cfg.data_source == "fromInstagram_PGlbsnEmoji":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -523,7 +524,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = 0
elif cfg.d_source == "fromLBSN":
elif cfg.data_source == "fromLBSN":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -609,7 +610,7 @@ def is_number(s):
photo_views = int(item[8])
except TypeError:
pass
elif cfg.d_source == "fromLBSN_old":
elif cfg.data_source == "fromLBSN_old":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -697,7 +698,7 @@ def is_number(s):
# photo_views = int(item[8])
# except TypeError:
# pass
elif cfg.d_source == "fromSensorData_InfWuerz":
elif cfg.data_source == "fromSensorData_InfWuerz":
if len(item) < 5:
#skip
skippedCount += 1
Expand Down Expand Up @@ -1402,7 +1403,7 @@ def delete(listbox):
l.pack(padx=10, pady=10)
l = tk.Label(canvas, text="Select all tags you wish to exclude from analysis \n and click on remove to proceed.", background="gray7",fg="gray80")
l.pack(padx=10, pady=10)
#if cfg.d_source == "fromInstagram_PGlbsnEmoji":
#if cfg.data_source == "fromInstagram_PGlbsnEmoji":
# listbox_font = ("twitter Color Emoji", 12, "bold")
# #listbox_font = ("Symbola", 12, "bold")
#else:
Expand Down
36 changes: 24 additions & 12 deletions tagmaps/classes/load_data.py
@@ -1,20 +1,22 @@
# -*- coding: utf-8 -*-

import sys
import os
import ntpath
import csv
from pathlib import Path
from _csv import QUOTE_MINIMAL
from glob import glob
from .utils import Utils

class LoadData():
"""Main Class for ingesting data and building summary statistics
for tag maps clustering.
"""Main Class for ingesting data and building summary statistics.
- will filter data, cleaned output can be stored
- will process CSV data into dict/set structures
- generate statistics
- will filter data, cleaned output can be stored
- will generate statistics
"""

def loop_input_records(records, transferlimit, import_mapper, config):
"""Loops input json or csv records, converts to ProtoBuf structure and adds to records_dict
Expand Down Expand Up @@ -47,7 +49,7 @@ def loop_input_records(records, transferlimit, import_mapper, config):
return processed_records, finished

@staticmethod
def fetch_csv_data_from_file(loc_filelist, start_file_id=0):
def fetch_csv_data_from_file(source_config):
"""Read csv entries from file (either *.txt or *.csv).
The actual CSV formatting is not setable in config yet. There are many specifics, e.g.
Expand All @@ -64,12 +66,22 @@ def fetch_csv_data_from_file(loc_filelist, start_file_id=0):
return None
return records

@staticmethod
def read_local_files(config):
"""Read Local Files according to config parameters and returns list of file-paths"""
input_path = config.input_folder
filelist = list(input_path.glob(f'*.{config.source["Main"]["file_extension"]}'))
input_count = len(filelist)
if input_count == 0:
sys.exit("No input files found.")
else:
return filelist

@staticmethod
def skip_empty_or_other(single_record):
"""Detect Rate Limiting Notice or empty records
so they can be skipped.
"""
skip = False
if not single_record or (isinstance(single_record,dict) and single_record.get('limit')):
skip = True
return skip
"""Detect empty records"""
if not single_record:
return False
return True


25 changes: 22 additions & 3 deletions tagmaps/config/config.py
Expand Up @@ -3,7 +3,8 @@
import argparse
import os
import sys

from pathlib import Path
import configparser
from shapely.geometry import Polygon
from shapely.geometry import shape
from shapely.geometry import Point
Expand All @@ -12,7 +13,7 @@ class BaseConfig():
def __init__(self):
## Set Default Config options here
## or define options as input args
self.d_source = "fromLBSN"
self.data_source = "fromLBSN"
self.cluster_tags = True
self.cluster_photos = True
self.epsg = True
Expand Down Expand Up @@ -43,10 +44,15 @@ def __init__(self):
self.shp_geom = None

# initialization
self.pathname = Path.cwd()
self.config_folder = Path.cwd() / '00_Config'
self.input_folder = Path.cwd() / '01_Input'
self.output_folder = Path.cwd() / '02_Output'
self.parse_args()
self.load_filterlists()
if self.shapefile_intersect:
self.load_shapefile()
self.source = self.load_sourcemapping()

def parse_args(self):
"""Parse init args and set default values
Expand Down Expand Up @@ -74,7 +80,7 @@ def parse_args(self):

args = parser.parse_args()
if args.source:
self.d_source = args.source
self.data_source = args.source
if args.clusterTags:
self.cluster_tags = args.clusterTags
if args.clusterPhotos:
Expand Down Expand Up @@ -203,3 +209,16 @@ def load_custom_crs(self, override_crs):
self.crs_proj = pyproj.Proj(init='epsg:{0}'.format(override_crs))
print("Custom CRS set: " + str(self.crs_proj.srs))
self.epsg_code = override_crs

def load_sourcemapping(self):
"""Loads source mapping, if available.
Otherwise, try to read structure from first line of CSV.
"""
mapping_config_path = self.config_folder / f'sourcemapping_{self.data_source}.ini'
if not os.path.exists(mapping_config_path):
return
source_config = configparser.ConfigParser()
source_config.read(mapping_config_path)
return source_config

0 comments on commit 554c804

Please sign in to comment.