# KML to CSV converter

## Imports

In [1]:
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import geopandas as gpd
import pandas as pd
import fiona
import pprint
from xml.dom.minidom import *
import pandas as pd
import csv
import xml.etree.ElementTree as ET
import re
import numpy as np
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import shapely
import math
from zipfile import ZipFile

## Functions

In [2]:
# Function to import file and turn into dataframe and returns the dataframe and the filepath-
def importKML():
    Tk().withdraw()
    file = askopenfilename()
    print("File Uploaded:", str(file))
    fiona.drvsupport.supported_drivers['kml'] = 'rw'
    fiona.drvsupport.supported_drivers['KML'] = 'rw'
    fiona.drvsupport.supported_drivers['KMZ'] = 'rw'
    fiona.drvsupport.supported_drivers['kmz'] = 'rw'
    fiona.drvsupport.supported_drivers['LIBKML'] = 'rw'
    if file[-3:] == 'kml':
        df = gpd.read_file(str(file), driver='KML')
    if file[-3:] == 'kmz':
        kmz = ZipFile(file, 'r')
        kmz.extract('doc.kml', os.path.split(file)[0])
        df = gpd.read_file(os.path.split(file)[0]+'/doc.kml', driver='KML')
    return df, file

In [3]:
# This function takes two arrays as inputs and assumes the arrays are [x,y] format. 
# [x,y,...,n] is supported however x and y have to be index 0 and 1 respectively.
# The following returns distance in feet
def distance(origin, destination):
    #grabs the cordinates
    lat1 = origin[0]
    lon1 = origin[1]
    lat2 = destination[0]
    lon2 = destination[1]
    
    # Radius of earth
    radius = 3959 # miles

    # This is the distance formula for latitude and longitude
    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c
    
    # Convert to feet
    d = d *5280
    return d

In [4]:
# This function takes an array of arrays, assuming arrays are [x,y] coordinate pairs:
def perim(inputArr):
    # The running total for perimeter
    total = 0
    # For each coordinate pair in the array-
    for i in range(len(inputArr)):
        
        # If it is the final coordinate pair within the input array then it takes coordinate i and measures the distance between i and index 0
        if i == len(inputArr)-1:
            total += distance(inputArr[i], inputArr[0])
        
        # This adds the distance between coordinate pair i and coordinate pair i+1
        else:
            total += distance(inputArr[i], inputArr[i+1])
    
    # The total of the distance betwen all points in the input array is the perimeter 
    return total

In [5]:
# Function- empty array that will append with either a perimiter or a blank
def perimeterColumnGenerator(series):
    
    # This turns the series into a list
    listedSeries = series.tolist()
    
    perimiterArr = []
    for i in range(len(listedSeries)):

        # This looks at each index and if the string dose not contain the word POINT
        if re.search("POINT",listedSeries[i]) == None:

            # This removes anything that isn't a number, comma, decimal, or negative
            listedSeries[i] = re.sub('[^0-9\-\,\.\ ]', '', listedSeries[i])
            
            # This splits on a comma seperating the cordinate pairs into an array
            tempPointArr = listedSeries[i].split(',')

            # This splits on spaces serperating each x, y, z into its own array while removing blanks
            # For each cordinate pair within the arr-
            for i in range(len(tempPointArr)):
                # Split on space removing blank
                tempPointArr[i] = list(filter(None, tempPointArr[i].split(' ')))
                # For each point within the arr
                for k in range(len(tempPointArr[i])):
                    # Cast as a float
                    tempPointArr[i][k] = float(tempPointArr[i][k])

            # Run the perimeter function and add perimiter to the array
            perimiterArr.append(perim(tempPointArr))

        # If the indexed entry in the dataframe is a point, then just add a 0.0 to the perimiterArr
        else:
            perimiterArr.append(0.0)

    return perimiterArr

In [6]:
# This function cleans the "Shape", "Number of Points", "Description", and "0" into "Perimete/Length (feet)" columns:
def NameCleanUp(df):
    # Clean up the "Description" column:
    df["Description"] = df['Description'].replace("Dimension: <br>description:", "")
    # Split column "Points" into two columns: column "Shape" and column "Number of Points":
    df[['Shape', 'Number of Points']] = df['Points'].str.split('(', 1, expand=True)
    # Renaming the "Shape" column using the "my_recode" function:
    df['Shape Type'] = df['Shape'].apply(my_recode)
    # Rename "0" to "Perimeter/Length (feet)":
    df = df.rename(columns={0: "Perimeter/Length (feet)"})
    return df

In [7]:
# This function ia for dropping unused columns
def DropUnusedColumns(df):
    # Drop the "geometry" and "Shape" columns
    df.drop("geometry", axis=1, inplace=True)
    # Drop "Shape"
    df.drop("Shape", axis=1, inplace=True)
    # Drop "Points"
    # df.drop("Points", axis=1, inplace=True)
    return df

In [8]:
# This function manipulates the "Folder" column:
def PathFolderColumn(df):
    # Create a "Folder" column that will hold the folder and file from the filepath:
    df['Folder'] = ''
    # PathArray from file (Global Variable):
    pathArray = file.split("/")
    # Adding the file path parent folder and file name into the string "ParentChildFolderPath" variable:
    ParentChildFolderPath = "/".join(pathArray[-2:])
    # Modifying existing DF by assigning the "Folder" column to be: "ParentChildFolderPath":
    df = df.assign(Folder = ParentChildFolderPath)
    return df

In [9]:
# This function is for cleaning, changing type, and math for the "Acres" column:
def Acres(df):
    # Rename the "Description" column to "Area (Acres)":
    df.rename(columns = {'Description':'Area (Acres)'}, inplace = True)
    # Clean up the "Area" column:
    for i in range(len(df['Area (Acres)'])):
        df['Area (Acres)'][i] = re.sub('^\\D+', '', df['Area (Acres)'][i])
    # Use regex to drop the "mi" since we will be using acres:
    df['Area (Acres)'] = df['Area (Acres)'].str.replace(' mi','')  
    # Adding NULL values to the empty rows:
    df['Area (Acres)'] = df['Area (Acres)'].replace('', np.nan, regex=True)
    # Change the data type to float:
    df['Area (Acres)'] = df['Area (Acres)'].astype("float64")
    # Changing the units from miles to acres:
    df['Area (Acres)'] = df['Area (Acres)'].apply(lambda x : x * 640)
    return df

In [10]:
# This function is for recoding the "Shape Type" column so that it is named relevantly:
def my_recode(Shape):
    if Shape == "POLYGON Z ":
        return "Polygon"
    if Shape == "LINESTRING Z ":
        return "Linestring"
    if Shape == "POINT Z ":
        return "Point"

# df['Shape Type'] = df['Shape'].apply(my_recode)

In [11]:
# This function is for exporting:
def SaveAsCSV(df):
    df.to_csv(file[:-3]+"csv")

## Main

In [12]:
# Run the function to import file and turn into dataframe and returns the dataframe and the filepath-
df, file = importKML()

# Creates a new column named "Points" that is the "geometry" column as a string type
df['Points'] = df['geometry'].astype(str)

# Run the following functions:

# This function cleans the "Shape", "Number of Points", "Description", and "0" into "Perimete/Length (feet)" columns:
df = NameCleanUp(df)
# This function ia for dropping unused columns
df = DropUnusedColumns(df)
# This function manipulates the "Folder" column:
df = PathFolderColumn(df)
# This function is for cleaning, changing type, and math for the "Acres" column:
df = Acres(df)

File Uploaded: C:/Users/JXL0ROY/OneDrive - NEE/Documents/Python_Scripts/TimsProblem/KML_CSV/GIS Interview Exercise1.kmz


In [13]:
# Make a new column that will contain only the "Number of Points".
for i in range(len(df["Number of Points"])):
    df["Number of Points"][i] = len(df["Number of Points"][i].split(' '))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Number of Points"][i] = len(df["Number of Points"][i].split(' '))


In [14]:
df1 = pd.DataFrame(perimeterColumnGenerator(df["Points"]))

In [15]:
# Combine the dfs together
df = pd.concat([df1, df], axis=1)

In [16]:
# Rename the "0" column to "Perimeter/Length (feet)"
df = df.rename(columns={0: "Perimeter/Length (feet)"})

In [17]:
# Reorder the columns:
df = df.loc[:,['Folder', 'Name', 'Shape Type', 'Number of Points', 'Area (Acres)', 'Perimeter/Length (feet)']]

In [18]:
# Save the CSV to the parent folder
SaveAsCSV(df)

In [19]:
# View the df head to double
df.head()

Unnamed: 0,Folder,Name,Shape Type,Number of Points,Area (Acres),Perimeter/Length (feet)
0,KML_CSV/GIS Interview Exercise1.kmz,Untitled Polygon,Polygon,15,,2961165.0
