From 554c8044050b4b181f920d8958725a680bf1e266 Mon Sep 17 00:00:00 2001 From: AD Date: Fri, 21 Dec 2018 12:27:19 +0100 Subject: [PATCH] Initial Load Data Class - currently, not working --- 00_Config/sourcemapping_lbsn.ini | 23 ++++++++++++ tagmaps/__main__.py | 61 ++++++++++++++++---------------- tagmaps/classes/load_data.py | 36 ++++++++++++------- tagmaps/config/config.py | 25 +++++++++++-- 4 files changed, 100 insertions(+), 45 deletions(-) create mode 100644 00_Config/sourcemapping_lbsn.ini diff --git a/00_Config/sourcemapping_lbsn.ini b/00_Config/sourcemapping_lbsn.ini new file mode 100644 index 0000000..cb066ff --- /dev/null +++ b/00_Config/sourcemapping_lbsn.ini @@ -0,0 +1,23 @@ +[Main] +name: lbsn +file_extension: csv +delimiter: , +array_separator: ; +quoting: QUOTE_MINIMAL +date_time_format: yyyy-MM-dd HH:mm:ss +[Columns] +post_guid_col: 1 +latitude_col: 2 +longitude_col: 3 +user_guid_col: 4 +post_create_date_col: 5 +post_publish_date_col: 6 +post_views_count_col: 8 +post_like_count_col: 9 +post_url_col: 10 +tags_col: 11 +emoji_col: 12 +post_title_col: 13 +post_body_col: 14 +post_geoaccuracy_col: 15 +place_guid_col: 19 diff --git a/tagmaps/__main__.py b/tagmaps/__main__.py index 5e2b892..2c6f8a6 100644 --- a/tagmaps/__main__.py +++ b/tagmaps/__main__.py @@ -110,6 +110,7 @@ cleanedPhotoList = [] from tagmaps.classes.utils import Utils +from tagmaps.classes.load_data import LoadData def main(): """Main tag maps function for direct processing @@ -120,7 +121,7 @@ def main(): # initialize logger and config cfg, log = Utils.init_main() - + filelist = LoadData.read_local_files(cfg) # READ All JSON in Current Folder and join to list #partnum = 0 @@ -128,24 +129,24 @@ def main(): count_glob = 0 partcount = 0 #filenameprev = "" - if (cfg.d_source == "fromFlickr_CSV"): - filelist = glob('01_Input/*.txt') - GMTTimetransform = 0 - guid_columnNameID = 5 #guid - Sourcecode = 2 - quoting_opt = csv.QUOTE_NONE - elif (cfg.d_source == "fromInstagram_PGlbsnEmoji") or 
(cfg.d_source == "fromLBSN") or (cfg.d_source == "fromLBSN_old"): - filelist = glob('01_Input/*.csv') - guid_columnNameID = 1 #guid - quoting_opt = csv.QUOTE_MINIMAL - elif (cfg.d_source == "fromSensorData_InfWuerz"): - filelist = glob('01_Input/*.csv') - GMTTimetransform = 0 - guid_columnNameID = 1 #guid - Sourcecode = 11 - quoting_opt = csv.QUOTE_NONE - else: - sys.exit("Source not supported yet.") + #if (cfg.data_source == "fromFlickr_CSV"): + # filelist = glob('01_Input/*.txt') + # GMTTimetransform = 0 + # guid_columnNameID = 5 #guid + # Sourcecode = 2 + # quoting_opt = csv.QUOTE_NONE + #elif (cfg.data_source == "fromInstagram_PGlbsnEmoji") or (cfg.data_source == "fromLBSN") or (cfg.data_source == "fromLBSN_old"): + # filelist = glob('01_Input/*.csv') + # guid_columnNameID = 1 #guid + # quoting_opt = csv.QUOTE_MINIMAL + #elif (cfg.data_source == "fromSensorData_InfWuerz"): + # filelist = glob('01_Input/*.csv') + # GMTTimetransform = 0 + # guid_columnNameID = 1 #guid + # Sourcecode = 11 + # quoting_opt = csv.QUOTE_NONE + #else: + # sys.exit("Source not supported yet.") print('\n') log.info("########## STEP 1 of 6: Data Cleanup ##########") @@ -219,10 +220,10 @@ def is_number(s): # guid_list.clear() #duplicate detection only for last 500k items with open(file_name, newline='', encoding='utf8') as f: # On input, if newline is None, universal newlines mode is enabled. Lines in the input can end in '\n', '\r', or '\r\n', and these are translated into '\n' before being returned to the caller. 
partcount += 1 - if (cfg.d_source == "fromInstagram_LocMedia_CSV" or cfg.d_source == "fromLBSN" or cfg.d_source == "fromLBSN_old" or cfg.d_source == "fromInstagram_UserMedia_CSV" or cfg.d_source == "fromFlickr_CSV" or cfg.d_source == "fromInstagram_PGlbsnEmoji" or cfg.d_source == "fromSensorData_InfWuerz"): + if (cfg.data_source == "fromInstagram_LocMedia_CSV" or cfg.data_source == "fromLBSN" or cfg.data_source == "fromLBSN_old" or cfg.data_source == "fromInstagram_UserMedia_CSV" or cfg.data_source == "fromFlickr_CSV" or cfg.data_source == "fromInstagram_PGlbsnEmoji" or cfg.data_source == "fromSensorData_InfWuerz"): photolist = csv.reader(f, delimiter=',', quotechar='"', quoting=quoting_opt) #QUOTE_NONE is important because media saved from php/Flickr does not contain any " check; only ',' are replaced next(photolist, None) # skip headerline - elif (cfg.d_source == "fromInstagram_HashMedia_JSON"): + elif (cfg.data_source == "fromInstagram_HashMedia_JSON"): photolist = photolist + json.loads(f.read()) #PhotosPerDayLists = defaultdict(list) #keyCreatedHash = set() @@ -233,7 +234,7 @@ def is_number(s): continue else: photoIDHash.add(item[guid_columnNameID]) - if (cfg.d_source == "fromInstagram_LocMedia_CSV"): + if (cfg.data_source == "fromInstagram_LocMedia_CSV"): if len(item) < 15: #skip skippedCount += 1 @@ -290,7 +291,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = "" - elif cfg.d_source == "fromInstagram_UserMedia_CSV": + elif cfg.data_source == "fromInstagram_UserMedia_CSV": if len(item) < 15: #skip skippedCount += 1 @@ -342,7 +343,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = "" - elif cfg.d_source == "fromFlickr_CSV": + elif cfg.data_source == "fromFlickr_CSV": if len(item) < 12: #skip skippedCount += 1 @@ -387,7 +388,7 @@ def is_number(s): photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng photo_mTags = "" #not used currently but available photo_views = item[10] - 
elif (cfg.d_source == "fromInstagram_HashMedia_JSON"): + elif (cfg.data_source == "fromInstagram_HashMedia_JSON"): photo_source = Sourcecode #HashMediaCode if item.get('owner'): photo_userid = item["owner"]["id"] @@ -483,7 +484,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = "" - elif cfg.d_source == "fromInstagram_PGlbsnEmoji": + elif cfg.data_source == "fromInstagram_PGlbsnEmoji": if len(item) < 15: #skip skippedCount += 1 @@ -523,7 +524,7 @@ def is_number(s): photo_mTags = "" photo_dateTaken = "" photo_views = 0 - elif cfg.d_source == "fromLBSN": + elif cfg.data_source == "fromLBSN": if len(item) < 15: #skip skippedCount += 1 @@ -609,7 +610,7 @@ def is_number(s): photo_views = int(item[8]) except TypeError: pass - elif cfg.d_source == "fromLBSN_old": + elif cfg.data_source == "fromLBSN_old": if len(item) < 15: #skip skippedCount += 1 @@ -697,7 +698,7 @@ def is_number(s): # photo_views = int(item[8]) # except TypeError: # pass - elif cfg.d_source == "fromSensorData_InfWuerz": + elif cfg.data_source == "fromSensorData_InfWuerz": if len(item) < 5: #skip skippedCount += 1 @@ -1402,7 +1403,7 @@ def delete(listbox): l.pack(padx=10, pady=10) l = tk.Label(canvas, text="Select all tags you wish to exclude from analysis \n and click on remove to proceed.", background="gray7",fg="gray80") l.pack(padx=10, pady=10) - #if cfg.d_source == "fromInstagram_PGlbsnEmoji": + #if cfg.data_source == "fromInstagram_PGlbsnEmoji": # listbox_font = ("twitter Color Emoji", 12, "bold") # #listbox_font = ("Symbola", 12, "bold") #else: diff --git a/tagmaps/classes/load_data.py b/tagmaps/classes/load_data.py index a075515..2127525 100644 --- a/tagmaps/classes/load_data.py +++ b/tagmaps/classes/load_data.py @@ -1,20 +1,22 @@ # -*- coding: utf-8 -*- +import sys import os import ntpath import csv +from pathlib import Path from _csv import QUOTE_MINIMAL from glob import glob from .utils import Utils class LoadData(): - """Main Class for ingesting data and building 
summary statistics - for tag maps clustering. + """Main Class for ingesting data and building summary statistics. - - will filter data, cleaned output can be stored - will process CSV data into dict/set structures - - generate statistics + - will filter data, cleaned output can be stored + - will generate statistics """ + def loop_input_records(records, transferlimit, import_mapper, config): """Loops input json or csv records, converts to ProtoBuf structure and adds to records_dict @@ -47,7 +49,7 @@ def loop_input_records(records, transferlimit, import_mapper, config): return processed_records, finished @staticmethod - def fetch_csv_data_from_file(loc_filelist, start_file_id=0): + def fetch_csv_data_from_file(source_config): """Read csv entries from file (either *.txt or *.csv). The actual CSV formatting is not setable in config yet. There are many specifics, e.g. @@ -64,12 +66,22 @@ def fetch_csv_data_from_file(loc_filelist, start_file_id=0): return None return records + @staticmethod + def read_local_files(config): + """Read local input files according to config parameters and return a list of file paths.""" + input_path = config.input_folder + filelist = list(input_path.glob(f'*.{config.source["Main"]["file_extension"]}')) + input_count = len(filelist) + if input_count == 0: + sys.exit("No input files found.") + else: + return filelist + + @staticmethod + def skip_empty_or_other(single_record): - """Detect Rate Limiting Notice or empty records - so they can be skipped. 
- """ - skip = False - if not single_record or (isinstance(single_record,dict) and single_record.get('limit')): - skip = True - return skip + """Return True if the record is empty and should be skipped.""" + if not single_record: + return True + return False + + diff --git a/tagmaps/config/config.py b/tagmaps/config/config.py index f64acdd..96f16d0 100644 --- a/tagmaps/config/config.py +++ b/tagmaps/config/config.py @@ -3,7 +3,8 @@ import argparse import os import sys - +from pathlib import Path +import configparser from shapely.geometry import Polygon from shapely.geometry import shape from shapely.geometry import Point @@ -12,7 +13,7 @@ class BaseConfig(): def __init__(self): ## Set Default Config options here ## or define options as input args - self.d_source = "fromLBSN" + self.data_source = "fromLBSN" self.cluster_tags = True self.cluster_photos = True self.epsg = True @@ -43,10 +44,15 @@ def __init__(self): self.shp_geom = None # initialization + self.pathname = Path.cwd() + self.config_folder = Path.cwd() / '00_Config' + self.input_folder = Path.cwd() / '01_Input' + self.output_folder = Path.cwd() / '02_Output' self.parse_args() self.load_filterlists() if self.shapefile_intersect: self.load_shapefile() + self.source = self.load_sourcemapping() def parse_args(self): """Parse init args and set default values @@ -74,7 +80,7 @@ def parse_args(self): args = parser.parse_args() if args.source: - self.d_source = args.source + self.data_source = args.source if args.clusterTags: self.cluster_tags = args.clusterTags if args.clusterPhotos: self.cluster_photos = args.clusterPhotos @@ -203,3 +209,16 @@ def load_custom_crs(self, override_crs): self.crs_proj = pyproj.Proj(init='epsg:{0}'.format(override_crs)) print("Custom CRS set: " + str(self.crs_proj.srs)) self.epsg_code = override_crs + + def load_sourcemapping(self): + """Load the source mapping config, if available. + + Returns None when no mapping file exists; reading the structure from the first CSV line is not yet implemented. 
+ """ + mapping_config_path = self.config_folder / f'sourcemapping_{self.data_source}.ini' + if not os.path.exists(mapping_config_path): + return + source_config = configparser.ConfigParser() + source_config.read(mapping_config_path) + return source_config +