## Data Cleaning
This notebook aims to:  
- Keep the data with value
- Compare 3 databases in Chabelley folder
- Attribute Completeness Check
- Create GeoPackage

#### import necessary packages

In [1]:
import arcpy
import os
import csv

### Keep the data with value

#### create folders to separate data
00_raw : all the raw data  
01_raw_no_duplicates : raw data with duplicate GDB removed  
02_raw_no_duplicates_or_empty_feature_layers : feature layers containing data (4 GDBs and 4 GeoPackages)

In [2]:
# Base folder
basefolder = r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA"

# Stage folders that hold the data at each cleaning step
folder_names = ["00_raw", "01_raw_no_duplicates", "02_raw_no_duplicates_or_empty_feature_layers"]

# Create every stage folder (and the base folder as an intermediate directory).
# exist_ok=True keeps the cell re-runnable and also fills in stage folders that
# are missing when the base folder already exists.
for folder in folder_names:
    folder_path = os.path.join(basefolder, folder)
    os.makedirs(folder_path, exist_ok=True)

#### separate empty feature classes 
Since many of the feature classes are empty (based on the results of Cross_Base_Evaluation), remove the empty feature classes first so that the work can focus on the data that has values.  

In [15]:
# Source geodatabases to scan for non-empty feature classes.
# Earlier batches are kept commented out for reference; only the Jul2024 update is active.
# Every commented entry ends with a comma so it can be re-enabled without
# accidentally merging with the next string literal.
list_of_db_paths = [
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\00_raw\Chabelley_SDSFIE_MASTER_March2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\00_raw\Chebelly_40_MASTER_CIP.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\Chebelley_40_MASTER.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\CIP_311_AB201.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\Niamey_SDSFIE_311_Master_9Jan2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\SDSFIE_05Apr2023.gdb",
    
    # updated Jan2024
    #r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\04_update\AB101_Jan24\Niamey_SDSFIE_311_Master_18MAY2023.gdb",
    #r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\04_update\AB201_JAN2024\AB201_JAN2024.gdb",
    
    # updated Jul2024
    r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\04_update\update_Jul2024\Chebelley_40_MASTER_May2024_small.gdb",
    r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\04_update\update_Jul2024\MandaBay_SDSFIE_40_MASTER_Small.gdb"
    
]

In [16]:
# Output folder for the non-empty copies: the stage-02 folder under basefolder.
target_path = os.path.join(basefolder,"02_raw_no_duplicates_or_empty_feature_layers")
  
def _owning_gdb_name(dataset_path):
    """Return the name of the closest parent folder of dataset_path that ends in '.gdb'."""
    current = os.path.dirname(dataset_path)
    while current:
        if current.lower().endswith(".gdb"):
            return os.path.basename(current)
        parent = os.path.dirname(current)
        if parent == current:
            # reached the filesystem root without finding a geodatabase
            break
        current = parent
    raise ValueError(f"{dataset_path} is not located inside a file geodatabase (.gdb)")


def seperate_empty_fc(source_fc, target_paths):
    """Copy a feature class into the matching output geodatabase when it holds data.

    Parameters:
        source_fc: full path of the feature class, e.g. <gdb>/<dataset>/<fc> or <gdb>/<fc>.
        target_paths: folder that contains the output geodatabases; the output
            geodatabase has the same name as the source geodatabase.

    Returns:
        True when the feature class had at least one row and was copied,
        False when it was empty and therefore skipped.

    Raises:
        ValueError: if source_fc does not sit below a '.gdb' folder.
    """
    # count the data in each feature class
    count_result = arcpy.management.GetCount(source_fc)
    feature_count = int(count_result.getOutput(0))

    if feature_count == 0:
        return False

    # The output location is derived from the arguments only, so the function no
    # longer depends on the notebook globals `gdb` and `target_path`.
    gdb_name = _owning_gdb_name(source_fc)

    # not empty data copied to "02_raw_no_duplicates_or_empty_feature_layers"
    target_fc = os.path.join(target_paths, gdb_name, os.path.basename(source_fc))
    arcpy.management.CopyFeatures(source_fc, target_fc)
    print(f"Feature class {os.path.basename(source_fc)} copied to {os.path.dirname(target_fc)}")
    return True

In [17]:
for gdb in list_of_db_paths:
    arcpy.env.workspace = gdb

    # create the gdb named by the source gdb; skipped when it is already there
    # so that the cell can be run again without failing
    target_gdb = os.path.join(target_path, os.path.basename(gdb))
    if not arcpy.Exists(target_gdb):
        arcpy.management.CreateFileGDB(target_path, os.path.basename(gdb))

    # list all feature datasets in the gdb; the leading "" stands for the gdb
    # root, so stand-alone feature classes are processed even when the gdb
    # also contains feature datasets
    feature_fds = [""] + (arcpy.ListDatasets("*", "Feature") or [])

    for fd in feature_fds:
        # list all the feature classes of this dataset (or of the root for "")
        feature_fcs = arcpy.ListFeatureClasses("*", "ALL", fd) or []

        for fc in feature_fcs:
            # os.path.join drops the empty dataset name for root-level classes
            source_fc = os.path.join(gdb, fd, fc)
            seperate_empty_fc(source_fc, target_path)

Feature class Installation_P copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER_May2024_small.gdb
Feature class Installation_A copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER_May2024_small.gdb
Feature class Site_P copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER_May2024_small.gdb
Feature class Site_A copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER_May2024_small.gdb
Feature class Zones_A_12_2022 copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER_May2024_small.gdb
Feature class EmergencyMedicalPoint_P copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_l

Feature class Airfield_A copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\MandaBay_SDSFIE_40_MASTER_Small.gdb
Feature class RoadCenterline_L copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\MandaBay_SDSFIE_40_MASTER_Small.gdb
Feature class CStructure_P copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\MandaBay_SDSFIE_40_MASTER_Small.gdb
Feature class CSegmentedCable_L copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\MandaBay_SDSFIE_40_MASTER_Small.gdb
Feature class CAntenna_P copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\MandaBay_SDSFIE_40_MASTER_Small.gdb
Feature class CFiberCable_L copied to C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Mand

#### delete null or empty attribute fields
Keep only the attributes that contain at least one value in each feature class.

In [6]:
# Cleaned geodatabases (stage 02) whose all-empty attribute fields will be deleted.
# Earlier batches are kept commented out for reference; only the Jul2024 update is active.
# Every commented entry ends with a comma so it can be re-enabled without
# accidentally merging with the next string literal.
clean_gdbs = [
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\CIP_311_AB201.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Niamey_SDSFIE_311_Master_9Jan2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\SDSFIE_05Apr2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chabelley_SDSFIE_MASTER_March2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelly_40_MASTER_CIP.gdb",
    
    # updated Jan2024
    #r'C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\AB201_JAN2024.gdb',
    #r'C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Niamey_SDSFIE_311_Master_18MAY2023.gdb',
    
    # updated Jul2024
    r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER_May2024_small.gdb",
    r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\MandaBay_SDSFIE_40_MASTER_Small.gdb"
]

In [7]:
def _is_blank(value):
    """Return True for NULL or for a string that holds nothing but whitespace."""
    return value is None or (isinstance(value, str) and value.strip() == "")


for gdb in clean_gdbs:
    arcpy.env.workspace = gdb

    # list all feature classes in gdbs
    feature_fcs = arcpy.ListFeatureClasses() or []
    print(f"---- {os.path.basename(gdb)} ----")

    for fc in feature_fcs:
        print(f"--{fc}-- empty field:")

        # Required fields cannot be deleted, so only the others are candidates
        non_required_fields = [field.name for field in arcpy.ListFields(fc) if not field.required]
        if not non_required_fields:
            continue

        # One pass over the table for all candidate fields: a field leaves
        # `still_empty` as soon as a real value is seen in it.
        still_empty = set(non_required_fields)
        row_count = 0
        with arcpy.da.SearchCursor(fc, non_required_fields) as cursor:
            for row in cursor:
                row_count += 1
                for field_name, value in zip(non_required_fields, row):
                    if field_name in still_empty and not _is_blank(value):
                        still_empty.discard(field_name)
                if not still_empty:
                    # every field holds data somewhere; no need to read further
                    break

        # A feature class without rows says nothing about its fields, so its
        # schema is left alone instead of being stripped.
        if row_count == 0:
            continue

        # Delete in the original field order, with a single geoprocessing call
        empty_fields = [name for name in non_required_fields if name in still_empty]
        if empty_fields:
            arcpy.management.DeleteField(fc, empty_fields)
            for field_name in empty_fields:
                print(f"{field_name} deleted")

---- Chebelley_40_MASTER_May2024_small.gdb ----
--Installation_P-- empty field:
alternateFeatureName deleted
areaSize deleted
areaSizeUom deleted
baseType deleted
categoryCode deleted
createDate deleted
creator deleted
dataCollection deleted
dataSteward deleted
dateRecorded deleted
elevation deleted
elevationUom deleted
facilityNumber deleted
featureDescription deleted
featureName deleted
flexFieldDt deleted
flexFieldInt deleted
flexFieldRealNum deleted
flexFieldStr deleted
flexFieldYn deleted
geoloc deleted
horizontalDatumType deleted
hostOrganization deleted
installationId deleted
installationIdpk deleted
installationName deleted
installationOpStatus deleted
installationSuffix deleted
isJointBase deleted
latitude deleted
longitude deleted
mediaId deleted
metadataId deleted
metaNotes deleted
mgrs deleted
perimeterSize deleted
perimeterSizeUom deleted
primaryMission deleted
releasePolicy deleted
rpsuid deleted
rpuid deleted
sdsId deleted
siteId deleted
subCommand deleted
verticalDatumT

metadataId deleted
metaNotes deleted
mgrs deleted
owner deleted
releasePolicy deleted
rpsuid deleted
sdsId deleted
siteId deleted
verticalDatumType deleted
--FutureProjectSite_A-- empty field:
categoryCode deleted
dataSource deleted
dataSteward deleted
elevationUom deleted
flexFieldDt deleted
flexFieldInt deleted
flexFieldRealNum deleted
flexFieldStr deleted
flexFieldYn deleted
futureProjectPlannedUse deleted
futureProjectSiteIdpk deleted
mediaId deleted
metadataId deleted
metaNotes deleted
mgrs deleted
planningTimeHorizon deleted
projectCost deleted
projectFundingSource deleted
releasePolicy deleted
rpaType deleted
rpnid deleted
rpsuid deleted
rpuid deleted
sdsId deleted
verticalDatumType deleted
--LandManagementZone_A-- empty field:
categoryCode deleted
country deleted
createDate deleted
creator deleted
dataCollection deleted
dataSource deleted
dataSteward deleted
dateEdited deleted
editor deleted
featureDescription deleted
featureName deleted
flexFieldDt deleted
flexFieldInt deleted

mediaId deleted
metadataId deleted
mph deleted
msagCommunityNameLeft deleted
msagCommunityNameRight deleted
narrative deleted
numLanes deleted
oneWayDirection deleted
owner deleted
parkingLot deleted
postalCommunityNameLeft deleted
postalCommunityNameRight deleted
postalZipCodeLeft deleted
postalZipCodeRight deleted
releasePolicy deleted
roadCategory deleted
roadClassification deleted
roadClosed deleted
roadPostDirectional deleted
roadPrefixDirectional deleted
roadSuffix deleted
roadWidth deleted
roadWidthUom deleted
rpnid deleted
rpsuid deleted
rpuid deleted
sdsId deleted
siteId deleted
tFDriveTime deleted
toAddressLeft deleted
toAddressRight deleted
verticalDatumType deleted
--TrafficControlPostSign_P-- empty field:
backgroundRetroVal1 deleted
backgroundRetroVal2 deleted
createDate deleted
creator deleted
dataCollection deleted
dataSource deleted
dataSteward deleted
dateEdited deleted
editor deleted
elevation deleted
elevationUom deleted
featureLocation deleted
featureTypeDescription

maintainedBy deleted
mdi deleted
measuredLength deleted
measuredLengthUom deleted
mediaId deleted
metadataId deleted
neutralMaterial deleted
neutralSize deleted
neutralType deleted
nominalConductorTemp deleted
nominalConductorTempUom deleted
nominalVoltage deleted
notes deleted
numberOfFailures deleted
operatingClass deleted
operatingVoltage deleted
operationalStatus deleted
owner deleted
peakLoad deleted
phaseDesignation deleted
pmType deleted
projectId deleted
releasePolicy deleted
rpaType deleted
rpInterest deleted
rpnid deleted
rpsuid deleted
rpuid deleted
sdsId deleted
segmentId deleted
siteId deleted
substationId deleted
totalBaseLoad deleted
uci deleted
uciAssessmentDate deleted
ucsCode deleted
udi deleted
uniformatIIComponent deleted
uniformatIISection deleted
uniformatIISystem deleted
updateNote deleted
verticalDatumType deleted
voltageClass deleted
workOrderNum deleted
--EOHSecondary_L-- empty field:
ancillaryRole deleted
apparentSag deleted
averageLoading deleted
baseService

horizontalDatumType deleted
industryStandards deleted
infiltration deleted
infiltrationSource deleted
installationId deleted
installationName deleted
inventoryControlNumber deleted
isWinterized deleted
lastPmDate deleted
lastRepairDate deleted
latitude deleted
leachFieldCondition deleted
linearSegmentationGrouping deleted
locationDescription deleted
longitude deleted
maintainedBy deleted
manufacturerName deleted
mdi deleted
mediaId deleted
metadataId deleted
metaNotes deleted
mgrs deleted
model deleted
notes deleted
numberOfFailures deleted
obstructionLights deleted
operationalArea deleted
otherEquipment deleted
permitCompliance deleted
pipeCount deleted
pmType deleted
projectId deleted
releasePolicy deleted
rimElevation deleted
rpnid deleted
rpsuid deleted
sdsId deleted
securityCondition deleted
segmentId deleted
septicTankCapacity deleted
septicTankCapacityUom deleted
septicTankType deleted
siteId deleted
specificMaterialType deleted
spmLocation deleted
storageTankProductType deleted

yLocation deleted
zLocation deleted
zLocationUom deleted
--ElevationContour_L-- empty field:
createDate deleted
creator deleted
dataSteward deleted
elevationFrom deleted
elevationTo deleted
elevationUom deleted
elevationVerticalDatum deleted
featureDescription deleted
featureName deleted
flexFieldDt deleted
flexFieldInt deleted
flexFieldRealNum deleted
flexFieldStr deleted
flexFieldYn deleted
geoloc deleted
horizontalDatumType deleted
installationId deleted
installationName deleted
latitudeFrom deleted
latitudeTo deleted
lengthSizeUom deleted
longitudeFrom deleted
longitudeTo deleted
majorCommand deleted
mediaId deleted
metadataId deleted
metaNotes deleted
owner deleted
releasePolicy deleted
rpsuid deleted
sdsId deleted
siteId deleted
verticalEpoch deleted
wacInnrCode deleted
--Berm_A-- empty field:
areaSize deleted
areaSizeUom deleted
dataSource deleted
dataSteward deleted
flexFieldDt deleted
flexFieldInt deleted
flexFieldRealNum deleted
flexFieldStr deleted
flexFieldYn deleted
instal

latitudeTo deleted
lengthSizeUom deleted
longitudeFrom deleted
longitudeTo deleted
mediaId deleted
metadataId deleted
metaNotes deleted
mgrs deleted
perimeterSize deleted
perimeterSizeUom deleted
releasePolicy deleted
rpaType deleted
rpnid deleted
rpsuid deleted
rpuid deleted
sdsId deleted
siteId deleted
--Roadway_A-- empty field:
areaSizeUom deleted
categoryCode deleted
dataSource deleted
fac deleted
flexFieldDt deleted
flexFieldInt deleted
flexFieldRealNum deleted
flexFieldStr deleted
flexFieldYn deleted
installationId deleted
installationName deleted
latitude deleted
linearStructureId deleted
longitude deleted
mediaId deleted
metadataId deleted
metaNotes deleted
mgrs deleted
operationalStatus deleted
pavementBranchIdfk deleted
pavementBranchType deleted
paverSectionIdfk deleted
perimeterSizeUom deleted
releasePolicy deleted
roadPrefix deleted
roadSuffix deleted
rpnid deleted
rpsuid deleted
sdsId deleted
siteId deleted
--VehicleParking_A-- empty field:
areaSizeUom deleted
categoryCod

photocellType deleted
pmType deleted
powerRatingUom deleted
projectId deleted
releasePolicy deleted
rpaType deleted
rpInterest deleted
rpnid deleted
rpsuid deleted
rpuid deleted
sdsId deleted
segmentId deleted
siteId deleted
structureId deleted
substationId deleted
subtypeCd deleted
symbolRotation deleted
timeControlIndicator deleted
transformerId deleted
uci deleted
uciAssessmentDate deleted
ucsCode deleted
udi deleted
uniformatIIComponent deleted
uniformatIISection deleted
uniformatIISystem deleted
updateNote deleted
verticalDatumType deleted
wattage deleted
workOrderNum deleted
--EGenerator_P-- empty field:
ancillaryRole deleted
baseServiceLifeAdjustFactor deleted
cadAngle deleted
cadBlock deleted
cadLayer deleted
calculatedServiceLife deleted
categoryCode deleted
circuitCategory deleted
condRatingAssessDate deleted
condRatingMethod deleted
condRatingValue deleted
connectionConfiguration deleted
constructionStatus deleted
dataCollection deleted
dataSource deleted
dataSteward deleted

rpInterest deleted
rpnid deleted
rpsuid deleted
rpuid deleted
securityCondition deleted
segmentId deleted
serialNumber deleted
siteId deleted
specificMaterialType deleted
spillProtectDescript deleted
stiCategory deleted
storageTankProductType deleted
strategicReserve deleted
structuralCondition deleted
tankBottomHeight deleted
tankBottomHeightUom deleted
tankCapacity deleted
tankCapacityUom deleted
tankColor deleted
tankCorrosion deleted
tankDepth deleted
tankDepthUom deleted
tankDiameter deleted
tankDiameterUom deleted
tankDimensionUom deleted
tankFooterHeight deleted
tankFooterHeightUom deleted
tankHeight deleted
tankHeightUom deleted
tankLength deleted
tankLengthUom deleted
tankManufacturer deleted
tankMaterial deleted
tankOverflowHeight deleted
tankOverflowHeightUom deleted
tankPoc deleted
tankRoofType deleted
tankStatus deleted
tankTopHeight deleted
tankTopHeightUom deleted
tankWidth deleted
tankWidthUom deleted
uci deleted
uciAssessmentDate deleted
udi deleted
uniformatIIComponen

isRealProperty deleted
lastPmDate deleted
lastRepairDate deleted
latitude deleted
linearSegmentationGrouping deleted
locationDescription deleted
longitude deleted
maintainedBy deleted
mdi deleted
mediaId deleted
metadataId deleted
mgrs deleted
notes deleted
numberOfFailures deleted
operationalArea deleted
owner deleted
perimeterSize deleted
perimeterSizeUom deleted
permitCompliance deleted
permittedFlowRate deleted
permittedFlowRateUom deleted
pmType deleted
projectId deleted
releasePolicy deleted
rpaType deleted
rpInterest deleted
rpnid deleted
rpsuid deleted
rpuid deleted
scadaControlId deleted
scadaMonitorId deleted
segmentId deleted
siteId deleted
source deleted
subtypeCd deleted
symbolRotation deleted
uci deleted
uciAssessmentDate deleted
ucsCode deleted
udi deleted
uniformatIIComponent deleted
uniformatIISection deleted
uniformatIISystem deleted
updateNote deleted
utilityAreaId deleted
verticalDatumType deleted
waterTreatmentType deleted
generalMaterialType deleted
--WStorageTank

## The steps below are not necessary

### Compare 3 databases in Chabelley folder 
We found that there are 3 databases in the Chabelley folder. This step checks whether they hold the same data, so that we can use just one of them in the ongoing work (this step can be done at the beginning).

In [11]:
# The three Chabelley geodatabases to compare.
# Order matters: the comparison cells use cha_gdbs[0] as the base and
# cha_gdbs[1] / cha_gdbs[2] as the databases tested against it.
cha_gdbs = [
    r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER.gdb",
    r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chabelley_SDSFIE_MASTER_March2023.gdb",
    r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelly_40_MASTER_CIP.gdb"
]

compare Chebelley_40_MASTER.gdb and Chabelley_SDSFIE_MASTER_March2023.gdb

In [13]:
# Set local variables
cha_gdb1 = cha_gdbs[0] 
cha_gdb2 = cha_gdbs[1]

arcpy.env.workspace = cha_gdb1
feature1_fcs = arcpy.ListFeatureClasses() or []

arcpy.env.workspace = cha_gdb2
feature2_fcs = arcpy.ListFeatureClasses() or []

for fc1 in feature1_fcs:
    # Check if fc1 exists in feature2_fcs
    if fc1 not in feature2_fcs:
        print(f"{fc1} does not exist in {cha_gdb2}")
        continue

    base_features = os.path.join(cha_gdb1, fc1)
    test_features = os.path.join(cha_gdb2, fc1)

    # The first field of the base feature class is used as the sort key
    sort_field = [field.name for field in arcpy.ListFields(base_features)]

    # Process: FeatureCompare
    compare_result = arcpy.FeatureCompare_management(
        base_features, test_features, sort_field[0], "ALL", None,
        None, 0, 0, None, None, "CONTINUE_COMPARE", None)

    # Output 0 of the tool is the optional compare file; the compare status is
    # output 1 and reads 'true' when NO differences were found. Anything other
    # than an explicit 'true' is reported as a difference.
    if str(compare_result[1]).lower() != 'true':
        print(f"Differences found in {fc1}:")
        # the tool messages list the individual differences
        print(arcpy.GetMessages())
    else:
        print(f"No differences found in {fc1}")

# Also report feature classes that exist only in the second database
for fc2 in feature2_fcs:
    if fc2 not in feature1_fcs:
        print(f"{fc2} does not exist in {cha_gdb1}")


No differences found in Installation_P
No differences found in Installation_A
No differences found in Site_P
No differences found in Site_A
No differences found in Zones_A_12_2022
No differences found in EmergencyMedicalPoint_P
No differences found in Wall_L
No differences found in Wall_A
No differences found in ControlMonument_P
No differences found in EsqdArc_A
No differences found in ImpactArea_A
No differences found in MilitaryLandingZone_A
No differences found in MilitaryObservationPosition_P
No differences found in SpentMunitionsStorage_P
No differences found in SpentMunitionsStorage_A
No differences found in PavementSlab_A
No differences found in AirAccidentPotentialZone_A
No differences found in AirfieldImaginarySurface_A
No differences found in DesignatedTobaccoUse_P
No differences found in FutureProjectSite_A
No differences found in LandManagementZone_A
No differences found in LandUse_A
No differences found in StandoffDistanceArc_A
No differences found in Building_A
No differ

compare Chebelley_40_MASTER.gdb and Chebelly_40_MASTER_CIP.gdb

In [11]:
# Set local variables
cha_gdb1 = cha_gdbs[0] 
cha_gdb2 = cha_gdbs[2]

arcpy.env.workspace = cha_gdb1
feature1_fcs = arcpy.ListFeatureClasses() or []

arcpy.env.workspace = cha_gdb2
feature2_fcs = arcpy.ListFeatureClasses() or []

for fc1 in feature1_fcs:
    # Check if fc1 exists in feature2_fcs
    if fc1 not in feature2_fcs:
        print(f"{fc1} does not exist in {cha_gdb2}")
        continue

    base_features = os.path.join(cha_gdb1, fc1)
    test_features = os.path.join(cha_gdb2, fc1)

    # The first field of the base feature class is used as the sort key
    sort_field = [field.name for field in arcpy.ListFields(base_features)]

    # Process: FeatureCompare
    compare_result = arcpy.FeatureCompare_management(
        base_features, test_features, sort_field[0], "ALL", None,
        None, 0, 0, None, None, "CONTINUE_COMPARE", None)

    # Output 0 of the tool is the optional compare file; the compare status is
    # output 1 and reads 'true' when NO differences were found. Anything other
    # than an explicit 'true' is reported as a difference.
    if str(compare_result[1]).lower() != 'true':
        print(f"Differences found in {fc1}:")
        # the tool messages list the individual differences
        print(arcpy.GetMessages())
    else:
        print(f"No differences found in {fc1}")

# Also report feature classes that exist only in the second database
for fc2 in feature2_fcs:
    if fc2 not in feature1_fcs:
        print(f"{fc2} does not exist in {cha_gdb1}")


Installation_P does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\removed_empty_fc\Chebelly_40_MASTER_CIP.gdb
No differences found in Installation_A
Site_P does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\removed_empty_fc\Chebelly_40_MASTER_CIP.gdb
Site_A does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\removed_empty_fc\Chebelly_40_MASTER_CIP.gdb
Zones_A_12_2022 does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\removed_empty_fc\Chebelly_40_MASTER_CIP.gdb
EmergencyMedicalPoint_P does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\removed_empty_fc\Chebelly_40_MASTER_CIP.gdb
No differences found in Wall_L
No differences found in Wall_A
ControlMonument_P does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\01_raw_no_duplicates\removed_empty_fc\Chebelly_40_MASTER_CIP.gdb
EsqdArc_A does not exist in D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA

After checking the data in three databases in Chabelley, they are all the same. Therefore, we will only keep one, Chebelley_40_MASTER.gdb, to continue the project.

#### delete duplicate GDBs

In [14]:
# Remove the two Chabelley geodatabases that are not kept for the rest of the project.
for duplicate_gdb in (
    r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chabelley_SDSFIE_MASTER_March2023.gdb",
    r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelly_40_MASTER_CIP.gdb",
):
    arcpy.Delete_management(duplicate_gdb)

### Attribute Completeness Check
This step aims to assess the completeness of each attribute. It can enable us to discuss whether we should retain attributes with low completeness or not.

In [109]:
# Geodatabases included in the attribute completeness check.
# Note: this rebinds clean_gdbs; only Chebelley_40_MASTER.gdb is active here.
clean_gdbs = [
    r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\CIP_311_AB201.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Niamey_SDSFIE_311_Master_9Jan2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\SDSFIE_05Apr2023.gdb" 
]

In [121]:
def completeness_calculate(fc, csv_writer, header_written):
    """Write the per-field completeness of one feature class to a CSV writer.

    Two rows are written: a header row (an empty first cell followed by the
    field names) and a data row (the feature class name followed by the share
    of filled cells of each field, formatted like "87.50%").

    A cell counts as filled when it is not NULL and not an empty or
    whitespace-only string. For a feature class without rows the share is
    undefined and "N/A" is written for every field.

    Parameters:
        fc: feature class name or path, resolved by arcpy against the current workspace.
        csv_writer: object with a writerow() method, e.g. a csv.writer.
        header_written: unused; kept so existing callers keep working.
    """
    # all fields name
    fields = [field.name for field in arcpy.ListFields(fc)]

    # Single pass over the table: one filled-cell counter per field
    filled_counts = [0] * len(fields)
    total_count = 0
    with arcpy.da.SearchCursor(fc, fields) as cursor:
        for row in cursor:
            total_count += 1
            for position, value in enumerate(row):
                if value is None:
                    continue
                if isinstance(value, str) and not value.strip():
                    continue
                filled_counts[position] += 1

    # calculate completeness in %
    row_data = [fc]
    for filled in filled_counts:
        if total_count:
            row_data.append(f"{(filled / total_count) * 100:.2f}%")
        else:
            # no rows: avoid dividing by zero
            row_data.append("N/A")

    # header row: the field names differ per feature class, so it is written every time
    csv_writer.writerow([''] + fields)

    # data row
    csv_writer.writerow(row_data)

In [122]:
def fc_attribute(gdb_path, output_csv_path):
    """Write the attribute completeness of every root-level feature class of a gdb to a CSV file.

    Parameters:
        gdb_path: geodatabase to analyse; it becomes the current arcpy workspace.
        output_csv_path: CSV file to create (an existing file is overwritten).
    """
    arcpy.env.workspace = gdb_path

    # ListFeatureClasses may return None instead of an empty list (for example
    # when the workspace cannot be opened); treat that as "nothing to report".
    feature_classes = arcpy.ListFeatureClasses() or []

    # Create a CSV file for output
    with open(output_csv_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)

        for fc in feature_classes:
            # third argument is ignored by completeness_calculate; False kept for compatibility
            completeness_calculate(fc, csv_writer, False)

    print(f"CSV ：{output_csv_path}")

In [123]:
# One completeness CSV per geodatabase, named after the geodatabase.
for gdb_path in clean_gdbs:
    # Drop the folder part and the ".gdb" extension to get the bare name
    gdb_name, _gdb_ext = os.path.splitext(os.path.basename(gdb_path))

    # The CSV file name embeds the geodatabase name
    output_csv_path = rf"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\completeness_{gdb_name}.csv"

    fc_attribute(gdb_path, output_csv_path)

CSV ：D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\completeness_Chebelley_40_MASTER.csv


### Create GeoPackage
1. It may encounter errors if all databases are run simultaneously. I recommend running the databases one by one. 
2. Annotations won't be copied to the GeoPackage.

In [1]:
# Geodatabases to convert to GeoPackage (this rebinds clean_gdbs).
# Earlier entries are kept commented out for reference; every commented entry
# ends with a comma so it can be re-enabled without merging with the next string.
clean_gdbs = [
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Chebelley_40_MASTER.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\CIP_311_AB201.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Niamey_SDSFIE_311_Master_9Jan2023.gdb",
    #r"D:\spring2024\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\SDSFIE_05Apr2023.gdb",
    # update
    r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\AB201_JAN2024.gdb",
    r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Niamey_SDSFIE_311_Master_18MAY2023.gdb"
]

In [3]:
# Folder that receives one GeoPackage per geodatabase
target_path = r"C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers"
arcpy.env.workspace = target_path

for gdb in clean_gdbs:

    # <name>.gdb -> <name>.gpkg
    gpkg_name = os.path.basename(gdb).replace(".gdb", "") + ".gpkg"
    gpkg_path = os.path.join(target_path, gpkg_name)

    # Only create the GeoPackage when it is missing, so the cell can be re-run
    # after a partial or failed export without failing on the existing file.
    if not arcpy.Exists(gpkg_path):
        arcpy.management.CreateSQLiteDatabase(gpkg_path, "GeoPackage")

In [4]:
# copy feature classes from gdbs to gpkg
for gdb in clean_gdbs:

    arcpy.env.workspace = gdb

    # Derive the GeoPackage that belongs to THIS gdb. Without this line
    # gpkg_name would still hold the value left by the creation loop (the last
    # gdb), and every gdb would be exported into that single GeoPackage.
    gpkg_name = os.path.basename(gdb).replace(".gdb", "") + ".gpkg"

    feature_fcs = arcpy.ListFeatureClasses() or []

    print(f"---- {os.path.basename(gdb)} ----")

    for fc in feature_fcs:

        gpkg_full_path = os.path.join(target_path, gpkg_name, fc)

        try:
            arcpy.conversion.ExportFeatures(fc, gpkg_full_path)

        except Exception as e:
            # Best effort: e.g. annotation feature classes are not supported by
            # GeoPackage; report the failure and carry on with the next one.
            print(f"Failed to export {fc}. Skipping. Error: {str(e)}")
            continue
        

---- AB201_JAN2024.gdb ----
---- Niamey_SDSFIE_311_Master_18MAY2023.gdb ----
Failed to export GateAnno. Skipping. Error: ERROR 000210: Cannot create output C:\Users\ma000551\Desktop\AFRICOM\AFRICOM_CLEAN_DATA\02_raw_no_duplicates_or_empty_feature_layers\Niamey_SDSFIE_311_Master_18MAY2023.gpkg\GateAnno
ERROR 000956: Output workspace does not support annotation
Failed to execute (ExportFeatures).

