# To do list:
- [X] limit the getting of data using a date ( = access last log file name)
- [X] clean functions that create `gis`, as this will be done only once at the beginning of the workflow
- [X] Create a controller function to the arcgis part
- [X] Add code to log execution of functions in the arcgis part.
- [ ] `assert` that there are contacts in the csv, before publishing to arcgis
- [ ] use `assert` in functions instead of `try-except`


Create a text file at the beginning, pass it to each function and append to it. Alternatively, keep a json object, add to it in the controller function and have the controller output it as a log. 



Log of failed contacts: create a list at the beginning of the controller so that the function that checks geocoding can add to it; at the end of the controller function, write the json file with the outputs. 
Initiating...


- After publishing a new service, get its url to create a feature layer and whitelist it (limit usage). See [here](https://support.esri.com/en/technical-article/000017029) and [here](https://developers.arcgis.com/rest/users-groups-and-items/create-proxies.htm)

This seems to be only possible for registered apps. Once a new layer is published manually it is possible to create a feature layer using the url and whitelist it, then use that layer for the app. 

In [1]:
import json
import requests
import re
from pprint import pprint
import pandas as pd
import logging
from datetime import datetime
import os

# Functions

In [2]:
def executeRequest(url, headers, payload):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        payload: Data sent as the request body (``data=``).

    Returns:
        The parsed JSON when the status code is 200, an empty dict for any
        other status code (or an undecodable body), and ``None`` when the
        request itself could not be performed.
    """
    try:
        response = requests.get(url, headers=headers, data=payload)
    except requests.RequestException:
        # Record the traceback in the log instead of only printing a hint.
        logging.exception("There was an problem in the request :(")
        return None

    # Logging the final url is a cheap sanity check of what was requested.
    logging.info(response.url)

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON, so do not let the
        # logging of a failure raise an error of its own.
        try:
            error_body = response.json()
        except ValueError:
            error_body = response.text
        logging.info(f"Failed to get data {response.status_code}, {error_body}")
        return {}

    try:
        return response.json()
    except ValueError:
        logging.exception(f"The response from {response.url} is not valid JSON")
        return {}

In [3]:
def create_strict_reg_exp(to_search):
    """Build a regular expression that matches ``to_search`` as a whole string.

    The text is escaped first, so names containing regex metacharacters
    (for instance ``(``, ``+`` or ``.``) are matched literally instead of
    being interpreted as a pattern.

    Args:
        to_search: Literal text (e.g. a contact list name) to match exactly.

    Returns:
        The anchored pattern as a string, ready for ``re.search``.
    """
    reg_exp = f"^{re.escape(f'{to_search}')}$"
    logging.info(f"regular expression is: {reg_exp}")
    return reg_exp

In [4]:
def searchContactListsID(r_json, reg_exp):
    """Select the contact lists whose name matches ``reg_exp``.

    Args:
        r_json: Decoded response holding the contact lists under ``"lists"``.
        reg_exp: Regular expression applied to each list name with
            ``re.search``.

    Returns:
        A dict mapping each matching list name to its ``list_id`` (empty
        string when the id is absent), or ``None`` when the response has no
        usable ``"lists"`` entry.
    """
    problem = "There was a problem with the structure of the json"
    logging.info("Searching Contact list by ID")

    try:
        l_json = r_json.get("lists", [])
    except AttributeError:
        logging.error(problem)
        return None

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(l_json, list) or not l_json:
        logging.error(problem)
        return None

    try:
        return {
            d['name']: d.get('list_id', '')
            for d in l_json
            if re.search(reg_exp, d['name']) is not None
        }
    except (KeyError, TypeError, AttributeError, re.error):
        logging.exception(problem)
        return None
    

In [5]:
def requestURLbyListID(id_contact_list, updated_after = None): #updated_after=2020-04-01
    """Build the contacts request url for one contact list.

    Args:
        id_contact_list: Identifier of the contact list to query.
        updated_after: Optional date (e.g. ``2020-04-01``); when given, it is
            added to the url as the ``updated_after`` query parameter.

    Returns:
        The request url as a string.
    """
    url = f"https://api.cc.email/v3/contacts?lists={id_contact_list}&include=street_addresses&limit=500&include_count=false"
    if updated_after:
        url = f"{url}&updated_after={updated_after}"
    return url

In [6]:
def getContactsLocation(r_json):
    """Extract the postal code and country of every contact that has both.

    Only the first entry of each contact's ``street_addresses`` is read.
    Contacts lacking a postal code or a country are counted and logged but
    not returned.

    Args:
        r_json: Decoded response holding the contacts under ``"contacts"``.

    Returns:
        A DataFrame with the columns ``contact_id``, ``postal_code`` and
        ``country`` (empty, but with those columns, when nothing is usable).
    """
    problem = "There was a problem with the structure of the json"
    columns = ['contact_id', 'postal_code', 'country']
    contacts_list = []
    no_location_list = []

    try:
        l_json = r_json.get("contacts", []) or []
    except AttributeError:
        # e.g. the request failed and the caller handed over None
        logging.error(problem)
        l_json = []

    for d in l_json:
        try:
            contact_id = d.get('contact_id', '')
            street_addresses = d.get('street_addresses', [{}])
            first_address = street_addresses[0] if street_addresses else {}
            postal_code = first_address.get("postal_code", None)
            country = first_address.get("country", None)
        except (AttributeError, TypeError, KeyError, IndexError):
            # One malformed record must not discard the remaining contacts.
            logging.exception(problem)
            continue

        if postal_code and country:
            contacts_list.append({
                'contact_id': contact_id,
                'postal_code': postal_code,
                'country': country,
            })
        else:
            no_location_list.append(contact_id)

    df = pd.DataFrame(contacts_list, columns=columns)
    logging.info(f"{len(df)} contacts with location")
    logging.info(f"{len(no_location_list)}contacts without location {no_location_list}")
    return df

In [7]:
def missLocation(r_json, df):
    """Return the ids of the contacts in ``r_json`` that are absent from ``df``.

    Args:
        r_json: Decoded response holding the contacts under ``"contacts"``.
        df: DataFrame of located contacts; its ``contact_id`` column is read
            when present.

    Returns:
        A set of contact ids (empty when the response cannot be read).
    """
    try:
        original_set = {d.get('contact_id', '') for d in r_json.get("contacts", [])}
    except (AttributeError, TypeError):
        logging.exception("There was a problem with the structure of the json")
        return set()

    # An empty DataFrame may come without any column at all.
    located_set = set(df['contact_id']) if 'contact_id' in df else set()
    return original_set - located_set

In [8]:
def writeLocationCsv(df, csvName):
    """Write ``df`` to ``./<csvName>.csv`` without the index column.

    On success the line ``"./<csvName>.csv written"`` is logged; ``searchLog``
    looks for that text in previous log files. A failure is logged with its
    traceback and not re-raised.

    Args:
        df: DataFrame to write.
        csvName: File name without extension.
    """
    csv_file = f'./{csvName}.csv'
    try:
        df.to_csv(csv_file, index=False)
    except Exception:
        # Best effort, as before, but the reason now lands in the log file.
        logging.exception("The csv hasn't been written")
        return
    logging.info(f"{csv_file} written")

In [9]:
def searchLog(csv_name):
    """Find the date of the most recent log file recording that ``csv_name`` was written.

    Files in the current directory whose name starts with ``logfile`` are
    read from the last to the first in name order, looking for the text
    ``"./<csv_name>.csv written"``. The date comes from the file name, which
    is expected to look like ``logfile_YYYY_MM_DD_...``.

    Args:
        csv_name: Name of the csv (without extension) to look for.

    Returns:
        The date as ``"YYYY-MM-DD"``, or ``None`` when no log mentions the csv.
    """
    # Plain substring search: the name is literal text, not a pattern, and
    # the leading "./" stops "List" from matching "./My List.csv written".
    marker = f"./{csv_name}.csv written"
    log_names = sorted(
        (name for name in os.listdir('.') if name.startswith("logfile")),
        reverse=True,
    )

    for log_name in log_names:
        if not os.path.isfile(log_name):
            continue
        try:
            with open(log_name) as f:
                found = any(marker in line for line in f)
        except (OSError, UnicodeDecodeError):
            logging.warning(f"The log file {log_name} could not be read")
            continue
        if not found:
            continue

        parts = log_name.split("_")
        if len(parts) < 4:
            # The file name does not carry a date, so it cannot be used.
            continue
        date_limit = f"{parts[1]}-{parts[2]}-{parts[3]}"
        logging.info(f"The last time the API was accessed for {csv_name} was on {date_limit}")
        return date_limit

    logging.info(f"First time accessing the API for {csv_name}")
    return None

# Getting the data from Constant Contact

In [10]:
def constantContactController(token, contact_lists_of_interest):
    """Download the located contacts of each list and decide what to do with them.

    For every list name, the list id is looked up, the contacts (only those
    updated after the date found by ``searchLog``, when there is one) are
    requested, and those with a location are written to ``./<list name>.csv``.

    Args:
        token: Bearer token for the Constant Contact API.
        contact_lists_of_interest: Iterable of contact list names.

    Returns:
        A dict mapping each list name that produced a csv to ``"publish"``
        (no previous log mentions the list) or ``"append"``; ``None`` when
        the contact lists could not be retrieved.
    """
    headers = {
      'Authorization': f'Bearer {token}'
    }
    payload = {}
    url = "https://api.cc.email/v3/contact_lists?include_count=false"
    r_contact_lists = executeRequest(url, headers, payload)
    if not r_contact_lists:
        logging.info("There was a problem accessing Constant Contact API")
        return None

    logging.info(f"Constant contact API accessed on {datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}")
    action_dict = {}
    for list_element in contact_lists_of_interest:
        cl_to_search = create_strict_reg_exp(list_element)
        id_dict = searchContactListsID(r_json = r_contact_lists, reg_exp = cl_to_search)
        if not id_dict or list_element not in id_dict:
            # Skip this list instead of aborting the whole run on a KeyError.
            logging.error(f"The contact list {list_element} was not found in Constant Contact")
            continue

        date_limit = searchLog(list_element)
        url_contacts = requestURLbyListID(id_dict[list_element], updated_after = date_limit)
        r_contacts = executeRequest(url_contacts, headers, payload)
        if not r_contacts:
            # None or {}: the request failed, which differs from "no entries".
            logging.error(f"The contacts of {list_element} could not be retrieved")
            continue
        if not r_contacts.get('contacts'):
            logging.info(f"No new entries for {list_element} since {date_limit}")
            continue

        contacts_location_df = getContactsLocation(r_contacts)
        if len(contacts_location_df) == 0:
            logging.info(f"No new entries with location for {list_element} since {date_limit}")
            continue

        writeLocationCsv(contacts_location_df, list_element)
        # No earlier log entry means the service does not exist yet.
        action_dict[list_element] = "publish" if date_limit is None else "append"
    return action_dict

In [11]:
#action_dict = constantContactController(token = token, contact_lists_of_interest = contact_lists_of_interest)

In [12]:
#action_dict

# Once the csv is ready it can be published in arcgis online via the arcgis api
Here there can be different cases:
- publish a new service, if the list is a new one
- fully overwrite a service
- append data to a service

In [13]:
import arcgis
from arcgis.gis import GIS
from arcgis.features import FeatureLayerCollection
from copy import deepcopy
from arcgis.geocoding import geocode
from arcgis import geometry
import re
from pprint import pprint
import pandas as pd

Documentation on setting the content_status [here](https://developers.arcgis.com/python/api-reference/arcgis.gis.toc.html#arcgis.gis.Item.content_status)

In [14]:
def publishCSVasFS(csvName, gis, aol_folder_name): #, sharing = None
    """Publish ``./<csvName>.csv`` as a new feature service.

    The csv is added as an item, published, moved (item and service) to
    ``aol_folder_name`` and protected against deletion. Nothing is published
    when the service name is already taken.

    Args:
        csvName: Name of the csv file (without extension) and of the service.
        gis: Connected ``GIS`` object.
        aol_folder_name: Folder the csv item and the service are moved to.

    Returns:
        The id of the published service, or ``None`` when the name is not
        available or publishing failed.
    """
    try:
        if not gis.content.is_service_name_available(csvName, "featureService"):
            logging.info("The service name is not available, try overwritting, appending the data or a different service name")
            return None

        logging.info(f"Service name {csvName} is available")
        csv_file = f'./{csvName}.csv'
        csv_item = gis.content.add({}, csv_file)
        # Presumably tells the service which csv columns hold the address
        # parts used to locate each row (TODO confirm against the API docs).
        csv_lyr = csv_item.publish(None,  { 'CountryCode' : 'country',
                                        'Postal' : 'postal_code'} )

        flayer_collection = FeatureLayerCollection.fromitem(csv_lyr)
        searched_flayer = flayer_collection.layers[0]
        nber_features = searched_flayer.query(return_count_only=True)
        logging.info(f"The service {csvName} has been published. The service has {nber_features} entries")

        logging.info(f"Moving service {csvName} to {aol_folder_name} in ArcGIS Online...")
        csv_item.move(aol_folder_name)
        csv_lyr.move(aol_folder_name)
        logging.info(f"Service {csvName} has been moved to {aol_folder_name} in ArcGIS Online")

        # Sharing is only reported here, it is not changed.
        sharing_prop = csv_lyr.shared_with
        if sharing_prop['everyone']:
            logging.info(f"shared with everyone")
        else:
            logging.info(f"not a public layer, for this layer to be used it has to be public or the urls have to be whitelisted")

        # not allowing deleting
        csv_lyr.protect()
        logging.info(f"{csvName}'s protection against deletion : {csv_lyr.protected}")
        return csv_lyr.id
    except Exception:
        # Keep the traceback in the log file; a bare print loses the cause.
        logging.exception("The csv hasn't been published")
        return None

In [15]:
#publishCSVasFS(csvName = testing_val)

In [16]:
def findItemGetID(csvName, gis):
    """Return the id of the feature layer item titled exactly ``csvName``.

    Args:
        csvName: Title to look for.
        gis: Connected ``GIS`` object.

    Returns:
        The id of the first search result whose title matches, or ``None``
        when there is none or the search failed.
    """
    try:
        # The pattern does not depend on the item: build it once.
        reg_exp = create_strict_reg_exp(csvName)
        for item in gis.content.search(csvName, item_type = "Feature Layer"):
            if re.search(reg_exp, item.title) is not None:
                logging.info(f"{csvName} has the id: {item.id}")
                return item.id
    except Exception:
        logging.exception("There was a problem finding the item")
        return None

    logging.info(f"No feature layer titled {csvName} was found")
    return None

To overwrite follow [this notebook from ESRI](https://developers.arcgis.com/python/sample-notebooks/overwriting-feature-layers/). 

In [17]:
def overwriteFSwithCSV(item_id, csvName, gis):
    """Overwrite the feature service ``item_id`` with ``./<csvName>.csv``.

    Args:
        item_id: Id of the feature service item to overwrite.
        csvName: Name of the csv file (without extension).
        gis: Connected ``GIS`` object.

    Returns:
        None. The outcome is written to the log.
    """
    csv_file = f'./{csvName}.csv'
    try:
        searched_item = gis.content.get(item_id)
        flayer_collection = FeatureLayerCollection.fromitem(searched_item)
        overwrite_message = flayer_collection.manager.overwrite(csv_file)
        if overwrite_message['success']:
            searched_flayer = flayer_collection.layers[0]
            nber_features = searched_flayer.query(return_count_only=True)
            logging.info(f"The service {csvName} has been overwritten. The service has {nber_features} entries")
        else:
            # An unsuccessful overwrite used to pass without any trace.
            logging.error(f"The service {csvName} has not been overwritten: {overwrite_message}")
    except Exception:
        logging.exception("There was a problem overwriting the service")

In [18]:
#testing_id = findItemGetID(csvName = testing_val)
#overwriteFSwithCSV(csvName = testing_val, item_id = testing_id)

To append follow [this notebook from ESRI](https://developers.arcgis.com/python/sample-notebooks/updating-features-in-a-feature-layer/).

In [19]:
def appendCSVtoFS(csvName, item_id, gis): 
    """Load the csv and the service rows that share a ``contact_id``.

    NOTE(review): this looks like an unfinished draft. It reads
    ``./<csvName>.csv``, queries the first layer of the item and merges both
    on ``contact_id``, but nothing is edited and nothing is returned.
    """
    csv_rows = pd.read_csv(f'./{csvName}.csv')
    service_features = gis.content.get(item_id).layers[0].query()
    # rows present both in the service and in the csv
    shared_rows = pd.merge(left = service_features.sdf,
                           right = csv_rows,
                           how='inner',
                           on = 'contact_id')
    pending_updates = [] # would hold the corrected features
    existing_features = service_features.features

In [20]:
def getFeatureSet(item_id, gis):
    """Query every feature of the first layer of the item ``item_id``.

    Args:
        item_id: Id of the item to read.
        gis: Connected ``GIS`` object.

    Returns:
        The result of the layer query, or ``None`` when it could not be
        obtained (the reason is logged with its traceback).
    """
    try:
        return gis.content.get(item_id).layers[0].query()
    except Exception:
        logging.exception("A feature set couldn't be created from this item")
        return None

In [21]:
def checkOverlap(csvName, fset):
    """Return the rows whose ``contact_id`` is both in the service and in the csv.

    Args:
        csvName: Name of the csv file (without extension) to read.
        fset: Feature set of the service; its ``sdf`` DataFrame is merged
            with the csv on ``contact_id``.

    Returns:
        The merged DataFrame, or ``None`` when no row overlaps or the check
        failed.
    """
    csv_file = f'./{csvName}.csv'
    try:
        df = pd.read_csv(csv_file)
        overlap_rows = pd.merge(left = fset.sdf, right = df, how='inner', on = 'contact_id')
    except Exception:
        logging.exception("There has been a problem checking row overlap")
        return None

    # A DataFrame has no truth value (`if overlap_rows:` raises ValueError),
    # so emptiness has to be tested explicitly.
    if overlap_rows.empty:
        return None
    logging.info(f"There are {len(overlap_rows)} overlapping")
    return overlap_rows

In [22]:
def updateFeaturesInService(overlap_rows, fset, gis, flayer=None):
    """Re-geocode the overlapping contacts and update their features.

    Args:
        overlap_rows: DataFrame with one row per contact to update. It needs
            the columns ``contact_id``, ``country`` and ``postal_code``; when
            it comes from a ``pandas.merge`` that suffixed clashing columns,
            the ``_y`` (right-hand side) values are used.
        fset: Feature set of the service (``features`` and
            ``spatial_reference`` are read).
        gis: Connected ``GIS`` object, used to project the geometries.
        flayer: Feature layer that receives the edits. It is required: the
            layer cannot be derived from the other arguments, so without it
            an error is logged and nothing is edited.

    Returns:
        None. The outcome is written to the log.
    """
    def _csv_value(row, column):
        # pandas.merge renames clashing columns; "_y" is the right-hand side.
        suffixed = f"{column}_y"
        return row[suffixed] if suffixed in row.index else row[column]

    if flayer is None:
        logging.error("There was a problem updating the features: no feature layer was provided")
        return

    try:
        # First feature per contact id, as the previous list lookup did.
        features_by_id = {}
        for feature in fset.features:
            features_by_id.setdefault(feature.attributes['contact_id'], feature)
        out_sr = fset.spatial_reference['latestWkid']

        missing_locations = []
        features_for_update = [] #list containing corrected features
        for _, row in overlap_rows.iterrows():
            contact_id = row['contact_id']
            original_feature = features_by_id.get(contact_id)
            if original_feature is None:
                continue

            country = _csv_value(row, 'country')
            postal_code = _csv_value(row, 'postal_code')
            # Numeric postal codes may arrive as floats (e.g. 2138.0); codes
            # that are not numeric are passed on untouched.
            if isinstance(postal_code, float) and postal_code.is_integer():
                postal_code = int(postal_code)

            add_loc = geocode({"CountryCode": country, "Postal": postal_code})
            if not add_loc:
                missing_locations.append(contact_id)
                continue

            output_geometry = geometry.project(geometries = [add_loc[0]['location']],
                                               in_sr = 4326,
                                               out_sr = out_sr,
                                               gis = gis)
            feature_to_be_updated = deepcopy(original_feature)
            feature_to_be_updated.geometry = output_geometry[0]
            feature_to_be_updated.attributes['contact_id'] = contact_id
            feature_to_be_updated.attributes['postal_code'] = postal_code
            feature_to_be_updated.attributes['country'] = country
            features_for_update.append(feature_to_be_updated)

        if features_for_update:
            message = flayer.edit_features(updates= features_for_update)
            logging.info(message)
            logging.info(f"Geocoding not available for {len(missing_locations)} contacts: {missing_locations}")
        else:
            logging.info("no features were updated")
    except Exception:
        logging.exception("There was a problem updating the features")

In [23]:
def checkNewRows(csvName, overlap_rows):
    """Return the csv rows whose ``contact_id`` is not among the overlapping ones.

    Args:
        csvName: Name of the csv file (without extension) to read.
        overlap_rows: DataFrame with a ``contact_id`` column listing the
            contacts that already exist in the service.

    Returns:
        A DataFrame with the new rows, ``False`` when there are none, or
        ``None`` when the check failed.
    """
    csv_file = f'./{csvName}.csv'
    try:
        df = pd.read_csv(csv_file)
        new_rows = df[~df['contact_id'].isin(overlap_rows['contact_id'])]
    except Exception:
        logging.exception("There has been a problem checking for new rows")
        return None

    # A DataFrame has no truth value (`if new_rows:` raises ValueError), so
    # emptiness has to be tested explicitly.
    if new_rows.empty:
        return False
    return new_rows

In [24]:
def addNewFeatures(new_rows, fset, gis=None, flayer=None):
    """Geocode the new contacts and add them to the service as features.

    Each new feature is a copy of the first feature of ``fset`` with its
    geometry, ``contact_id``, ``postal_code`` and ``country`` replaced.

    Args:
        new_rows: DataFrame with the columns ``contact_id``, ``country`` and
            ``postal_code``.
        fset: Feature set of the service (``features`` and
            ``spatial_reference`` are read).
        gis: Optional connected ``GIS`` object handed to the projection.
        flayer: Feature layer that receives the new features. It is
            required: the layer cannot be derived from the other arguments,
            so without it an error is logged and nothing is added.

    Returns:
        None. The outcome is written to the log.
    """
    if flayer is None:
        logging.error("There has been a problem adding new features: no feature layer was provided")
        return

    try:
        template_feature = deepcopy(fset.features[0])
        out_sr = fset.spatial_reference['latestWkid']

        features_to_be_added = []
        missing_locations = []
        # iterrows() yields (index, row) pairs; only the row is needed.
        for _, row in new_rows.iterrows():
            address = {"CountryCode": row['country'], "Postal": row['postal_code']}
            add_loc = geocode(address, out_fields="City,Country")
            if not add_loc:
                missing_locations.append(row['contact_id'])
                continue

            #get geometries in the destination coordinate system
            output_geometry = geometry.project(geometries = [add_loc[0]['location']],
                                               in_sr = 4326,
                                               out_sr = out_sr,
                                               gis = gis)
            new_feature = deepcopy(template_feature)
            # assign the updated values (the csv only carries these columns)
            new_feature.geometry = output_geometry[0]
            new_feature.attributes['contact_id'] = row['contact_id']
            new_feature.attributes['postal_code'] = row['postal_code']
            new_feature.attributes['country'] = row['country']
            features_to_be_added.append(new_feature)

        if features_to_be_added:
            flayer.edit_features(adds = features_to_be_added)
            logging.info(f"Geocoding not available for {len(missing_locations)} contacts: {missing_locations}")
        else:
            logging.info("no features were added")
    except Exception:
        logging.exception("There has been a problem adding new features")

In [25]:
def locationNotMapped(csvName, item_id, gis):
    """Log the contacts that are not both in ``./<csvName>.csv`` and in the service.

    The csv and the features of the item are outer-merged on ``contact_id``;
    every contact found on one side only is reported.

    Args:
        csvName: Name of the csv file (without extension).
        item_id: Id of the service item to compare against.
        gis: Connected ``GIS`` object.

    Returns:
        None. The result is written to the log.
    """
    fset = getFeatureSet(item_id, gis)
    if fset is None:
        # e.g. the item id is None because publishing failed earlier
        logging.error(f"The geocoding of {csvName} could not be checked: the service could not be read")
        return

    csv_file = f'./{csvName}.csv'
    df = pd.read_csv(csv_file)
    merged = pd.merge(left = fset.sdf, right = df, how='outer', on = 'contact_id', indicator=True)
    left_out_rows = merged.query('_merge != "both"')
    missing_locations = left_out_rows['contact_id'].to_list()
    if missing_locations:
        logging.info(f"There are {len(missing_locations)} locations that couldn't be geocoded: {missing_locations}")
    else:
        logging.info(f"All the locations were geocoded.")

In [26]:
def csvToArcgis(csvName, action, gis, aol_folder_name):
    """Send ``./<csvName>.csv`` to ArcGIS Online according to ``action``.

    Args:
        csvName: Name of the csv file (without extension) and of the service.
        action: ``"publish"`` (new service), ``"overwrite"`` (replace the
            content of the service) or ``"append"`` (update the existing
            contacts and add the new ones).
        gis: Connected ``GIS`` object.
        aol_folder_name: Folder a newly published service is moved to.

    Returns:
        None.
    """
    if action == "publish":
        published_id = publishCSVasFS(csvName, gis, aol_folder_name)
        if published_id is None:
            logging.error(f"{csvName} was not published, the geocoding check is skipped")
            return
        locationNotMapped(csvName, published_id, gis)

    elif action == "overwrite":
        item_id = findItemGetID(csvName, gis)
        if item_id is None:
            logging.error(f"No service found for {csvName}, nothing was overwritten")
            return
        overwriteFSwithCSV(item_id, csvName, gis)
        locationNotMapped(csvName, item_id, gis)

    elif action == "append":
        item_id = findItemGetID(csvName, gis)
        if item_id is None:
            logging.error(f"No service found for {csvName}, nothing was appended")
            return
        fset = getFeatureSet(item_id, gis)
        if fset is None:
            logging.error(f"The service {csvName} could not be read, nothing was appended")
            return

        overlapRows = checkOverlap(csvName, fset)
        # `is not None`: comparing a DataFrame with `!=` is element-wise and
        # cannot be used as a condition.
        # NOTE(review): as before, new rows are only looked for when some
        # rows overlap; a csv made only of new contacts is not added.
        if overlapRows is not None:
            updateFeaturesInService(overlapRows, fset, gis)
            newRows = checkNewRows(csvName, overlapRows)
            if isinstance(newRows, pd.DataFrame) and not newRows.empty:
                addNewFeatures(newRows, fset)
        #locationNotMapped(csvName, item_id)

    else:
        logging.warning(f"Unknown action {action} for {csvName}, nothing was done")

In [27]:
def connectingToGIS(aol_password, aol_username, url="https://eowilson.maps.arcgis.com"):
    """Open a connection to ArcGIS Online.

    Note the argument order: the password comes before the username.

    Args:
        aol_password: Password of the account.
        aol_username: Username of the account.
        url: Address of the portal to connect to; defaults to the
            organisation used so far.

    Returns:
        The connected ``GIS`` object.
    """
    return GIS(url, aol_username, aol_password)

In [28]:
def arcgisController(action_dict, aol_password, aol_username, aol_folder_name):
    """Run the requested ArcGIS action for every csv in ``action_dict``.

    The connection is opened once. A failure while processing one csv is
    logged with its traceback and the remaining ones are still processed.

    Args:
        action_dict: Dict mapping each csv name to its action; nothing is
            done when it is empty or ``None``.
        aol_password: Password of the ArcGIS Online account.
        aol_username: Username of the ArcGIS Online account.
        aol_folder_name: Folder newly published services are moved to.

    Returns:
        None.
    """
    problem = "Something went wrong with the arcgis controller"
    if not action_dict:
        logging.info(f"There were no feature services to update")
        return

    try:
        gis = connectingToGIS(aol_password, aol_username)
    except Exception:
        # Without a connection none of the actions can run.
        logging.exception(problem)
        return

    for csvName, action in action_dict.items():
        logging.info(f"starting {action} for {csvName}")
        try:
            csvToArcgis(csvName, action, gis, aol_folder_name)
        except Exception:
            logging.exception(problem)
            continue
        logging.info(f"{action} for {csvName} done")

In [29]:
def fullController(token, contact_lists_of_interest, aol_password, aol_username, aol_folder_name):
    """Run the whole workflow: Constant Contact download, then ArcGIS update.

    A log file named ``./logfile_<timestamp>.log`` is configured first; the
    actions returned by the Constant Contact step are handed to the ArcGIS
    step.
    """
    stamp_format = '%Y_%m_%d_%H_%M_%S'
    started_at = datetime.now().strftime(stamp_format)
    logging.basicConfig(filename=f"./logfile_{started_at}.log", level=logging.DEBUG)

    logging.info("Starting Constant Contact Controller")
    actions = constantContactController(
        token = token,
        contact_lists_of_interest = contact_lists_of_interest,
    )

    logging.info("Starting ArcGIS Controller")
    arcgisController(actions, aol_password, aol_username, aol_folder_name)

    finished_at = datetime.now().strftime(stamp_format)
    logging.info(f"running of nbk finished at {finished_at}")

# Environmental variables - Authentication

In [30]:
# Read the KEY=value pairs of the `.env` file into `env`.
env_path = ".env"
env = {}
with open(env_path) as f:
    for line in f:
        line = line.rstrip("\n")
        # Blank lines and lines without "=" carry no setting.
        if not line.strip() or "=" not in line:
            continue
        # Split on the first "=" only: keys and tokens may contain "=".
        env_key, _sep, env_value = line.partition("=")
        env[env_key] = env_value
api_key = env['cc_api_key']

To get the token put this in the web browser: https://api.cc.email/v3/idfed?client_id={api_key}&redirect_uri=https://localhost&response_type=token&scope=contact_data

In [31]:
#f"https://api.cc.email/v3/idfed?client_id={api_key}&redirect_uri=https://localhost&response_type=token&scope=contact_data"

In [32]:
# Read the KEY=value pairs of the `.env` file into `env`.
env_path = ".env"
env = {}
with open(env_path) as f:
    for line in f:
        line = line.rstrip("\n")
        # Blank lines and lines without "=" carry no setting.
        if not line.strip() or "=" not in line:
            continue
        # Split on the first "=" only: keys and tokens may contain "=".
        env_key, _sep, env_value = line.partition("=")
        env[env_key] = env_value
token = env['cc_token']

In [34]:
# ArcGIS Online credentials, taken from the `env` dict read from `.env`.
aol_password = env['aol_key']
aol_username = env['aol_username']

Then update the `.env` file with the new token. Is there a way of getting, from code, the url that this GET call leads to?
# variable = contact_lists_of_interest

In [33]:
# Names of the Constant Contact lists to process; passed to fullController below.
contact_lists_of_interest = ["Nat Geo Meeting 2018", "Biodiversity Days 2016 Attendees", "2019 EOY List"] #"Educator Ambassadors",  

In [35]:
# Folder name handed to fullController as `aol_folder_name`.
aol_folder_name = "constant_contact"

In [36]:
# Run the whole workflow with the settings defined in the cells above.
fullController(token = token, 
               contact_lists_of_interest = contact_lists_of_interest, 
               aol_password = aol_password, 
               aol_username = aol_username,
               aol_folder_name = aol_folder_name)