Skip to content

Commit

Permalink
Initial Load Data Class
Browse files Browse the repository at this point in the history
- currently not working
  • Loading branch information
Sieboldianus committed Dec 21, 2018
1 parent d2804aa commit 554c804
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 45 deletions.
23 changes: 23 additions & 0 deletions 00_Config/sourcemapping_lbsn.ini
@@ -0,0 +1,23 @@
[Main]
name: lbsn
file_extension: csv
delimiter: ,
array_separator: ;
quoting: QUOTE_MINIMAL
date_time_format: yyyy-MM-dd HH:mm:ss
[Columns]
post_guid_col: 1
latitude_col: 2
longitude_col: 3
user_guid_col: 4
post_create_date_col: 5
post_publish_date_col: 6
post_views_count_col: 8
post_like_count_col: 9
post_url_col: 10
tags_col: 11
emoji_col: 12
post_title_col: 13
post_body_col: 14
post_geoaccuracy_col: 15
place_guid_col: 19
61 changes: 31 additions & 30 deletions tagmaps/__main__.py
Expand Up @@ -110,6 +110,7 @@
cleanedPhotoList = []

from tagmaps.classes.utils import Utils
from tagmaps.classes.load_data import LoadData

def main():
"""Main tag maps function for direct processing
Expand All @@ -120,32 +121,32 @@ def main():

# initialize logger and config
cfg, log = Utils.init_main()

filelist = LoadData.read_local_files(cfg)

# READ All JSON in Current Folder and join to list
#partnum = 0
guid_list = set() #global list of guids
count_glob = 0
partcount = 0
#filenameprev = ""
if (cfg.d_source == "fromFlickr_CSV"):
filelist = glob('01_Input/*.txt')
GMTTimetransform = 0
guid_columnNameID = 5 #guid
Sourcecode = 2
quoting_opt = csv.QUOTE_NONE
elif (cfg.d_source == "fromInstagram_PGlbsnEmoji") or (cfg.d_source == "fromLBSN") or (cfg.d_source == "fromLBSN_old"):
filelist = glob('01_Input/*.csv')
guid_columnNameID = 1 #guid
quoting_opt = csv.QUOTE_MINIMAL
elif (cfg.d_source == "fromSensorData_InfWuerz"):
filelist = glob('01_Input/*.csv')
GMTTimetransform = 0
guid_columnNameID = 1 #guid
Sourcecode = 11
quoting_opt = csv.QUOTE_NONE
else:
sys.exit("Source not supported yet.")
#if (cfg.data_source == "fromFlickr_CSV"):
# filelist = glob('01_Input/*.txt')
# GMTTimetransform = 0
# guid_columnNameID = 5 #guid
# Sourcecode = 2
# quoting_opt = csv.QUOTE_NONE
#elif (cfg.data_source == "fromInstagram_PGlbsnEmoji") or (cfg.data_source == "fromLBSN") or (cfg.data_source == "fromLBSN_old"):
# filelist = glob('01_Input/*.csv')
# guid_columnNameID = 1 #guid
# quoting_opt = csv.QUOTE_MINIMAL
#elif (cfg.data_source == "fromSensorData_InfWuerz"):
# filelist = glob('01_Input/*.csv')
# GMTTimetransform = 0
# guid_columnNameID = 1 #guid
# Sourcecode = 11
# quoting_opt = csv.QUOTE_NONE
#else:
# sys.exit("Source not supported yet.")

print('\n')
log.info("########## STEP 1 of 6: Data Cleanup ##########")
Expand Down Expand Up @@ -219,10 +220,10 @@ def is_number(s):
# guid_list.clear() #duplicate detection only for last 500k items
with open(file_name, newline='', encoding='utf8') as f: # On input, if newline is None, universal newlines mode is enabled. Lines in the input can end in '\n', '\r', or '\r\n', and these are translated into '\n' before being returned to the caller.
partcount += 1
if (cfg.d_source == "fromInstagram_LocMedia_CSV" or cfg.d_source == "fromLBSN" or cfg.d_source == "fromLBSN_old" or cfg.d_source == "fromInstagram_UserMedia_CSV" or cfg.d_source == "fromFlickr_CSV" or cfg.d_source == "fromInstagram_PGlbsnEmoji" or cfg.d_source == "fromSensorData_InfWuerz"):
if (cfg.data_source == "fromInstagram_LocMedia_CSV" or cfg.data_source == "fromLBSN" or cfg.data_source == "fromLBSN_old" or cfg.data_source == "fromInstagram_UserMedia_CSV" or cfg.data_source == "fromFlickr_CSV" or cfg.data_source == "fromInstagram_PGlbsnEmoji" or cfg.data_source == "fromSensorData_InfWuerz"):
photolist = csv.reader(f, delimiter=',', quotechar='"', quoting=quoting_opt) #QUOTE_NONE is important because media saved from php/Flickr does not contain any " check; only ',' are replaced
next(photolist, None) # skip headerline
elif (cfg.d_source == "fromInstagram_HashMedia_JSON"):
elif (cfg.data_source == "fromInstagram_HashMedia_JSON"):
photolist = photolist + json.loads(f.read())
#PhotosPerDayLists = defaultdict(list)
#keyCreatedHash = set()
Expand All @@ -233,7 +234,7 @@ def is_number(s):
continue
else:
photoIDHash.add(item[guid_columnNameID])
if (cfg.d_source == "fromInstagram_LocMedia_CSV"):
if (cfg.data_source == "fromInstagram_LocMedia_CSV"):
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -290,7 +291,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = ""
elif cfg.d_source == "fromInstagram_UserMedia_CSV":
elif cfg.data_source == "fromInstagram_UserMedia_CSV":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -342,7 +343,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = ""
elif cfg.d_source == "fromFlickr_CSV":
elif cfg.data_source == "fromFlickr_CSV":
if len(item) < 12:
#skip
skippedCount += 1
Expand Down Expand Up @@ -387,7 +388,7 @@ def is_number(s):
photo_locID = str(photo_latitude) + ':' + str(photo_longitude) #create loc_id from lat/lng
photo_mTags = "" #not used currently but available
photo_views = item[10]
elif (cfg.d_source == "fromInstagram_HashMedia_JSON"):
elif (cfg.data_source == "fromInstagram_HashMedia_JSON"):
photo_source = Sourcecode #HashMediaCode
if item.get('owner'):
photo_userid = item["owner"]["id"]
Expand Down Expand Up @@ -483,7 +484,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = ""
elif cfg.d_source == "fromInstagram_PGlbsnEmoji":
elif cfg.data_source == "fromInstagram_PGlbsnEmoji":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -523,7 +524,7 @@ def is_number(s):
photo_mTags = ""
photo_dateTaken = ""
photo_views = 0
elif cfg.d_source == "fromLBSN":
elif cfg.data_source == "fromLBSN":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -609,7 +610,7 @@ def is_number(s):
photo_views = int(item[8])
except TypeError:
pass
elif cfg.d_source == "fromLBSN_old":
elif cfg.data_source == "fromLBSN_old":
if len(item) < 15:
#skip
skippedCount += 1
Expand Down Expand Up @@ -697,7 +698,7 @@ def is_number(s):
# photo_views = int(item[8])
# except TypeError:
# pass
elif cfg.d_source == "fromSensorData_InfWuerz":
elif cfg.data_source == "fromSensorData_InfWuerz":
if len(item) < 5:
#skip
skippedCount += 1
Expand Down Expand Up @@ -1402,7 +1403,7 @@ def delete(listbox):
l.pack(padx=10, pady=10)
l = tk.Label(canvas, text="Select all tags you wish to exclude from analysis \n and click on remove to proceed.", background="gray7",fg="gray80")
l.pack(padx=10, pady=10)
#if cfg.d_source == "fromInstagram_PGlbsnEmoji":
#if cfg.data_source == "fromInstagram_PGlbsnEmoji":
# listbox_font = ("twitter Color Emoji", 12, "bold")
# #listbox_font = ("Symbola", 12, "bold")
#else:
Expand Down
36 changes: 24 additions & 12 deletions tagmaps/classes/load_data.py
@@ -1,20 +1,22 @@
# -*- coding: utf-8 -*-

import sys
import os
import ntpath
import csv
from pathlib import Path
from _csv import QUOTE_MINIMAL
from glob import glob
from .utils import Utils

class LoadData():
"""Main Class for ingesting data and building summary statistics
for tag maps clustering.
"""Main Class for ingesting data and building summary statistics.
- will filter data, cleaned output can be stored
- will process CSV data into dict/set structures
- generate statistics
- will filter data, cleaned output can be stored
- will generate statistics
"""

def loop_input_records(records, transferlimit, import_mapper, config):
"""Loops input json or csv records, converts to ProtoBuf structure and adds to records_dict
Expand Down Expand Up @@ -47,7 +49,7 @@ def loop_input_records(records, transferlimit, import_mapper, config):
return processed_records, finished

@staticmethod
def fetch_csv_data_from_file(loc_filelist, start_file_id=0):
def fetch_csv_data_from_file(source_config):
"""Read csv entries from file (either *.txt or *.csv).
The actual CSV formatting is not setable in config yet. There are many specifics, e.g.
Expand All @@ -64,12 +66,22 @@ def fetch_csv_data_from_file(loc_filelist, start_file_id=0):
return None
return records

@staticmethod
def read_local_files(config):
"""Read Local Files according to config parameters and returns list of file-paths"""
input_path = config.input_folder
filelist = list(input_path.glob(f'*.{config.source["Main"]["file_extension"]}'))
input_count = len(filelist)
if input_count == 0:
sys.exit("No input files found.")
else:
return filelist

@staticmethod
def skip_empty_or_other(single_record):
"""Detect Rate Limiting Notice or empty records
so they can be skipped.
"""
skip = False
if not single_record or (isinstance(single_record,dict) and single_record.get('limit')):
skip = True
return skip
"""Detect empty records"""
if not single_record:
return False
return True


25 changes: 22 additions & 3 deletions tagmaps/config/config.py
Expand Up @@ -3,7 +3,8 @@
import argparse
import os
import sys

from pathlib import Path
import configparser
from shapely.geometry import Polygon
from shapely.geometry import shape
from shapely.geometry import Point
Expand All @@ -12,7 +13,7 @@ class BaseConfig():
def __init__(self):
## Set Default Config options here
## or define options as input args
self.d_source = "fromLBSN"
self.data_source = "fromLBSN"
self.cluster_tags = True
self.cluster_photos = True
self.epsg = True
Expand Down Expand Up @@ -43,10 +44,15 @@ def __init__(self):
self.shp_geom = None

# initialization
self.pathname = Path.cwd()
self.config_folder = Path.cwd() / '00_Config'
self.input_folder = Path.cwd() / '01_Input'
self.output_folder = Path.cwd() / '02_Output'
self.parse_args()
self.load_filterlists()
if self.shapefile_intersect:
self.load_shapefile()
self.source = self.load_sourcemapping()

def parse_args(self):
"""Parse init args and set default values
Expand Down Expand Up @@ -74,7 +80,7 @@ def parse_args(self):

args = parser.parse_args()
if args.source:
self.d_source = args.source
self.data_source = args.source
if args.clusterTags:
self.cluster_tags = args.clusterTags
if args.clusterPhotos:
Expand Down Expand Up @@ -203,3 +209,16 @@ def load_custom_crs(self, override_crs):
self.crs_proj = pyproj.Proj(init='epsg:{0}'.format(override_crs))
print("Custom CRS set: " + str(self.crs_proj.srs))
self.epsg_code = override_crs

def load_sourcemapping(self):
"""Loads source mapping, if available.
Otherwise, try to read structure from first line of CSV.
"""
mapping_config_path = self.config_folder / f'sourcemapping_{self.data_source}.ini'
if not os.path.exists(mapping_config_path):
return
source_config = configparser.ConfigParser()
source_config.read(mapping_config_path)
return source_config

0 comments on commit 554c804

Please sign in to comment.