In [2]:
# Spark app setup
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.functions import col

# S3 location handed to Spark as `spark.sql.warehouse.dir`.
db_path="s3a://se-indonesia-cdp/data/warehouse/tablespace/managed/hive"

# Build (or reuse) the notebook's SparkSession.
spark = (SparkSession
    .builder
    .appName("homecredit-spark")
    .config("spark.sql.warehouse.dir", db_path)
    # Key fixed from "fs.s2a..." to "fs.s3a...": the S3A connector only reads
    # properties under the fs.s3a prefix, so the misspelled key was silently ignored.
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")
    .config("spark.yarn.access.hadoopFileSystems","s3a://se-indonesia-cdp/")
    .master("local[5]") # should be possible to change this to SPARK on Yarn or SPARK on Kubernetes
    .getOrCreate())


#

In [3]:
# SAMPLE CODE: three ways to attach a row id to a DataFrame.

# Explicit imports instead of `import *`: the wildcard import of
# pyspark.sql.functions rebinds Python builtins such as sum, min, max, round
# and abs to Spark column functions for every later cell.
from pyspark.sql.functions import col, monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Two-row demo frame; both columns hold string values.
df = spark.createDataFrame(
    [
        ('Alice','10'),('Susan','12')
    ],
    ['Name','Age']
)

# Method 1: RDD zipWithIndex.
# toDF() yields `_1` (the original row as a struct) and `_2` (the index), so
# `_1.*` expands the row back into its columns. The resulting id starts at 0.
df1=df.rdd.zipWithIndex().toDF()
df2=df1.select(col("_1.*"),col("_2").alias('increasing_id'))
df2.show()

# Method 2: monotonically_increasing_id().
# The ids are increasing and unique but not consecutive (see the output below).
df_with_increasing_id = df.withColumn("monotonically_increasing_id", monotonically_increasing_id())
df_with_increasing_id.show()

# Method 3: row_number() over the Method 2 id gives consecutive ids starting at 1.
# NOTE: the window has no partitionBy, so Spark evaluates it in a single
# partition -- fine for a sample, expensive on a large DataFrame.
window = Window.orderBy(col('monotonically_increasing_id'))
df_with_consecutive_increasing_id = df_with_increasing_id.withColumn('increasing_id', row_number().over(window))
df_with_consecutive_increasing_id.show()

# Method 3a: keep only the consecutive id by dropping the helper column.
df3 = df_with_consecutive_increasing_id.drop("monotonically_increasing_id")
df3.show()


                                                                                

+-----+---+-------------+
| Name|Age|increasing_id|
+-----+---+-------------+
|Alice| 10|            0|
|Susan| 12|            1|
+-----+---+-------------+

+-----+---+---------------------------+
| Name|Age|monotonically_increasing_id|
+-----+---+---------------------------+
|Alice| 10|                17179869184|
|Susan| 12|                34359738368|
+-----+---+---------------------------+

+-----+---+---------------------------+-------------+
| Name|Age|monotonically_increasing_id|increasing_id|
+-----+---+---------------------------+-------------+
|Alice| 10|                17179869184|            1|
|Susan| 12|                34359738368|            2|
+-----+---+---------------------------+-------------+

+-----+---+-------------+
| Name|Age|increasing_id|
+-----+---+-------------+
|Alice| 10|            1|
|Susan| 12|            2|
+-----+---+-------------+



In [5]:
# Demo of current_date() / current_timestamp() and a per-row timestamp offset.
# Imports are explicit and sit at the top of the cell: a wildcard import of
# pyspark.sql.functions rebinds builtins such as sum, min, max and round, and
# the SparkSession import must be live for this cell to run on its own.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, expr

# getOrCreate() returns the already-running session when one exists, so no
# appName is passed here: it would not rename the running application and
# would only make the cell look like it starts a separate Spark app.
spark = SparkSession.builder.getOrCreate()

# NOTE: `data` and `df` are rebound here; `df` no longer refers to the
# Name/Age frame from the row-id sample cell after this cell runs.
data=[["1"], ["2"], ["3"]]
df=spark.createDataFrame(data,["id"])

# decreased_timestamp = current timestamp - id * 10 seconds.
# `id` holds strings, so it is cast explicitly instead of relying on Spark's
# implicit string-to-number coercion in the interval multiplication.
(df.withColumn("current_date", current_date())
   .withColumn("current_timestamp", current_timestamp())
   .withColumn(
       "decreased_timestamp",
       current_timestamp() - expr("INTERVAL 10 seconds") * col("id").cast("int"),
   )
   .show(truncate=False))

+---+------------+-----------------------+-----------------------+
|id |current_date|current_timestamp      |decreased_timestamp    |
+---+------------+-----------------------+-----------------------+
|1  |2022-12-06  |2022-12-06 09:38:06.941|2022-12-06 09:37:56.941|
|2  |2022-12-06  |2022-12-06 09:38:06.941|2022-12-06 09:37:46.941|
|3  |2022-12-06  |2022-12-06 09:38:06.941|2022-12-06 09:37:36.941|
+---+------------+-----------------------+-----------------------+



In [None]:
# Preview the homecredit columns, register the frame as a temp view, and derive
# an event timestamp per row with Spark SQL.
# NOTE(review): these columns are not part of the `df3` built in the row-id
# sample cell; presumably `df3` is rebound to the homecredit data in a cell not
# shown here -- confirm before running on a fresh kernel.
preview_columns = [
    "index",
    "sk_id_curr",
    "increasing_id",
    "target",
    "code_gender",
    "amt_income_total",
]
df3.select(*[col(name) for name in preview_columns]).show()

df3.createOrReplaceTempView("homecredit")

# event_timestamp = current timestamp minus `index` minutes.
# (An earlier draft of this query used `increasing_id` as the multiplier.)
query_string = (
    "select index, sk_id_curr, target, code_gender, amt_income_total, "
    "cast(current_timestamp as TIMESTAMP) - (INTERVAL 1 minutes) * index as event_timestamp "
    "from homecredit"
)

spark.sql(query_string).show()

In [11]:
%%bash
pip install -r /home/cdsw/requirements.txt

Collecting tensorflow
  Downloading tensorflow-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 588.3/588.3 MB 1.5 MB/s eta 0:00:00
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/6.0 MB 42.3 MB/s eta 0:00:00
Collecting h5py>=2.9.0
  Downloading h5py-3.7.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.1/4.1 MB 35.5 MB/s eta 0:00:00
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.5/57.5 kB 2.0 MB/s eta 0:00:00
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65.5/65.5 kB 957.7 kB/s eta 0:00:00
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Colle