In [13]:
import pandas as pd
import os

# Tell the kernel which Java runtime and Spark distribution to use; both
# variables are set for this process before Spark is initialised below.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.2-bin-hadoop2.7",
})

In [9]:
from datetime import datetime
import math

import findspark

# findspark has to run BEFORE the first `pyspark` import: it locates the Spark
# installation (SPARK_HOME) and adds its Python packages to sys.path, so that
# the imports below resolve on a fresh kernel.
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

In [6]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Have DataFrames render eagerly when they are the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", "true")

In [7]:
def get_timestamp(date_text):
    """Convert a ``'%Y/%m/%d %H:%M:%S+00'`` date string to a POSIX timestamp.

    Args:
        date_text: Date string such as ``"2015/06/30 00:00:00+00"``.

    Returns:
        float: ``datetime.timestamp()`` of the parsed value. Input that cannot
        be parsed -- malformed text or a non-string value such as ``None`` --
        yields the timestamp of the sentinel date 1900/01/01 01:01:01 instead
        of raising.

    Note:
        The trailing ``+00`` is matched as literal text, so the parsed
        datetime is naive and ``timestamp()`` interprets it in the machine's
        local time zone.
    """
    date_format = '%Y/%m/%d %H:%M:%S+00'
    try:
        parsed = datetime.strptime(date_text, date_format)
    except (ValueError, TypeError):
        # strptime raises TypeError (not ValueError) for None / non-string
        # values; treat those like any other unparseable date.
        parsed = datetime.strptime("1900/01/01 01:01:01+00", date_format)

    return parsed.timestamp()

def snap_time_to_resolution(timestamp, resolution=1):
    """Floor a POSIX timestamp to a whole multiple of ``resolution`` minutes.

    Args:
        timestamp: Seconds since the epoch.
        resolution: Bucket width in minutes; zero or negative values are
            replaced by 1.

    Returns:
        datetime: ``datetime.fromtimestamp`` of the start of the bucket that
        contains ``timestamp``.
    """
    minutes = 1 if resolution <= 0 else resolution
    bucket_seconds = minutes * 60
    bucket_index = math.floor(timestamp / bucket_seconds)
    return datetime.fromtimestamp(bucket_index * bucket_seconds)

def snap_row(date, resolution=5):
    """Parse a date string and snap it down to the given resolution.

    Args:
        date: Date text, passed to ``get_timestamp``.
        resolution: Bucket width in minutes, passed to
            ``snap_time_to_resolution`` (default 5).

    Returns:
        The datetime returned by ``snap_time_to_resolution``.
    """
    return snap_time_to_resolution(get_timestamp(date), resolution)

# Spark UDF wrapper around snap_row; it is called with the date column only,
# so snap_row's default resolution applies. Result column type: timestamp.
snap_udf = udf(f=snap_row, returnType='timestamp')

In [8]:
# Load the raw CSV with a header row and let Spark infer the column types.
data_df = spark.read.csv('data/underdrain.csv',
                         header=True,
                         inferSchema=True)  # type: ignore

data_df.printSchema()

# Rows without an install date have nothing to snap, so drop them first.
filtered_df = data_df.filter(data_df.DCH_INSTALL_DATE.isNotNull())

# Add the snapped install time (snap_udf applies snap_row's default resolution).
snapped_df = filtered_df.withColumn(
    'SNAPPED_TIME', snap_udf(filtered_df.DCH_INSTALL_DATE))

# mode("overwrite") replaces the output of a previous run; with the default
# save mode the write fails as soon as data/csvs already exists, which breaks
# re-running this cell.
snapped_df.write.mode("overwrite").csv("data/csvs", header=True)

root
 |-- OBJECTID: integer (nullable = true)
 |-- DCH_FEA_KEY: integer (nullable = true)
 |-- DCH_GRPH_KEY: integer (nullable = true)
 |-- DCH_FEATYPE_TEXT: string (nullable = true)
 |-- DCH_OWNER_NAME: string (nullable = true)
 |-- DCH_PRBL_FLOW_TYPE: string (nullable = true)
 |-- DCH_MATERIAL_TYPE: string (nullable = true)
 |-- DCH_PIPE_SHP_TEXT: string (nullable = true)
 |-- DCH_LIFECYCLE_STAT: string (nullable = true)
 |-- DCH_DSTNTN_TYPE: string (nullable = true)
 |-- DCH_INSPECT_FLAG: string (nullable = true)
 |-- DCH_CASING_FLAG: string (nullable = true)
 |-- DCH_PERF_PIPE_FLAG: string (nullable = true)
 |-- DCH_LENGTH_FT_NBR: double (nullable = true)
 |-- DCH_WIDTH_IN_NBR: integer (nullable = true)
 |-- DCH_HEIGHT_IN_NBR: integer (nullable = true)
 |-- DCH_UPS_ELEV_FT_NBR: double (nullable = true)
 |-- DCH_DNS_ELEV_FT_NBR: double (nullable = true)
 |-- DCH_INSTALL_DATE: string (nullable = true)
 |-- DCH_LST_UPDT_DATE: string (nullable = true)
 |-- DCH_STREAM_NAME: string (nullable = true)

                                                                                