In [None]:
# Spark app setup
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.functions import col

# Location used as Spark's SQL warehouse directory (Hive managed tablespace on S3).
db_path="s3a://se-indonesia-cdp/data/warehouse/tablespace/managed/hive"

# Build (or reuse) the session for this notebook.
# NOTE: the S3Guard region key must use the "fs.s3a." prefix -- the S3A connector
# only reads options under that prefix, so the previous "fs.s2a." spelling was
# silently ignored.
spark = (SparkSession
    .builder
    .appName("homecredit-spark")
    .config("spark.sql.warehouse.dir", db_path)
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")
    .config("spark.yarn.access.hadoopFileSystems","s3a://se-indonesia-cdp/")
    .master("local[5]") # should be possible to change this to SPARK on Yarn or SPARK on Kubernetes
    .getOrCreate())


#

In [None]:
# SAMPLE CODE 

from pyspark.sql.functions import *
from pyspark.sql.window import *

# Method 1: pair every row with its position through the RDD API.
# zipWithIndex() produces (row, index) tuples; toDF() exposes them as _1 / _2.
df = spark.createDataFrame(
    [("Alice", "10"), ("Susan", "12")],
    ["Name", "Age"],
)

df1 = df.rdd.zipWithIndex().toDF()
# Flatten the original row struct (_1.*) and keep the index under a readable name.
df2 = df1.select("_1.*", col("_2").alias("increasing_id"))
df2.show()

# Method 2: add an id column generated by monotonically_increasing_id().
df_with_increasing_id = df.withColumn(
    "monotonically_increasing_id",
    monotonically_increasing_id(),
)
df_with_increasing_id.show()

# Method 3: number the rows with row_number() over a window ordered by the
# Method 2 id column.
window = Window.orderBy("monotonically_increasing_id")
df_with_consecutive_increasing_id = df_with_increasing_id.withColumn(
    "increasing_id",
    row_number().over(window),
)
df_with_consecutive_increasing_id.show()

# Method 3a: keep only the row_number-based id by dropping the helper column.
df3 = df_with_consecutive_increasing_id.drop("monotonically_increasing_id")
df3.show()


In [11]:
from pyspark.sql import SparkSession

# Obtain a SparkSession via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Three one-column rows; note that the ids are strings, not integers.
data = [["1"], ["2"], ["3"]]
df = spark.createDataFrame(data, schema=["id"])

from pyspark.sql.functions import *
# Show current_date() and current_timestamp() next to a timestamp that is moved
# back by (2 minutes * id) for each row.
(
    df
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
    .withColumn(
        "decreased_timestamp",
        current_timestamp() - expr("INTERVAL 2 minutes") * col("id"),
    )
    .show(truncate=False)
)

+---+------------+----------------------+----------------------+
|id |current_date|current_timestamp     |decreased_timestamp   |
+---+------------+----------------------+----------------------+
|1  |2022-12-01  |2022-12-01 07:18:50.14|2022-12-01 07:16:50.14|
|2  |2022-12-01  |2022-12-01 07:18:50.14|2022-12-01 07:14:50.14|
|3  |2022-12-01  |2022-12-01 07:18:50.14|2022-12-01 07:12:50.14|
+---+------------+----------------------+----------------------+



In [None]:
# Preview the columns of interest, then expose df3 to Spark SQL as "homecredit".
# NOTE(review): the df3 built in the sample-code cell of this notebook only has
# Name / Age / increasing_id, so this cell presumably relies on a df3 created
# from the home-credit dataset elsewhere -- confirm before running top to bottom.
df3.select(
    "index",
    "sk_id_curr",
    "increasing_id",
    "target",
    "code_gender",
    "amt_income_total",
).show()
df3.createOrReplaceTempView("homecredit")

# Derive event_timestamp by subtracting (1 minute * index) from the current
# timestamp. An earlier variant of this query multiplied by increasing_id
# instead of index.
query_string = (
    "select index, sk_id_curr, target, code_gender, amt_income_total, "
    "cast(current_timestamp as TIMESTAMP) - (INTERVAL 1 minutes) * index as event_timestamp "
    "from homecredit"
)

spark.sql(query_string).show()