In [8]:
# Spark app setup
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.functions import col

# Warehouse directory handed to Spark SQL; it lives in the same S3 bucket that
# is listed under spark.yarn.access.hadoopFileSystems below.
db_path = "s3a://se-indonesia-cdp/data/warehouse/tablespace/managed/hive"

# Build the session, or reuse the one already running in this kernel.
spark = (
    SparkSession.builder
    .appName("homecredit-spark")
    .config("spark.sql.warehouse.dir", db_path)
    # Hadoop options for the s3a:// connector are prefixed "fs.s3a."; the key
    # was previously misspelled "fs.s2a." and therefore never reached it.
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-1")
    .config("spark.yarn.access.hadoopFileSystems", "s3a://se-indonesia-cdp/")
    # Local mode with 5 worker threads; should be possible to change this to
    # Spark on YARN or Spark on Kubernetes.
    .master("local[5]")
    .getOrCreate()
)


#

In [3]:
# SAMPLE CODE 

from pyspark.sql.functions import *
from pyspark.sql.window import *

# Tiny two-row frame used by every method below.
df = spark.createDataFrame(
    [("Alice", "10"), ("Susan", "12")],
    ["Name", "Age"],
)

# Method 1: zipWithIndex on the underlying RDD.
# Each element becomes (row, index); toDF() names the two fields _1 and _2,
# so the original columns are expanded from _1 and the index (starting at 0)
# is renamed to increasing_id.
df1 = df.rdd.zipWithIndex().toDF()
df2 = df1.select("_1.*", col("_2").alias("increasing_id"))
df2.show()

# Method 2: monotonically_increasing_id().
# The ids grow from row to row but are not consecutive (see the output).
df_with_increasing_id = df.withColumn(
    "monotonically_increasing_id",
    monotonically_increasing_id(),
)
df_with_increasing_id.show()

# Method 3: number the rows with row_number() over a window ordered by the
# id from Method 2 (no partitionBy, so all rows share one window). The
# resulting increasing_id is consecutive and starts at 1.
window = Window.orderBy("monotonically_increasing_id")
df_with_consecutive_increasing_id = df_with_increasing_id.withColumn(
    "increasing_id",
    row_number().over(window),
)
df_with_consecutive_increasing_id.show()

# Method 3a: keep only the consecutive row id by dropping the helper column.
df3 = df_with_consecutive_increasing_id.drop("monotonically_increasing_id")
df3.show()


                                                                                

+-----+---+-------------+
| Name|Age|increasing_id|
+-----+---+-------------+
|Alice| 10|            0|
|Susan| 12|            1|
+-----+---+-------------+

+-----+---+---------------------------+
| Name|Age|monotonically_increasing_id|
+-----+---+---------------------------+
|Alice| 10|                17179869184|
|Susan| 12|                34359738368|
+-----+---+---------------------------+

+-----+---+---------------------------+-------------+
| Name|Age|monotonically_increasing_id|increasing_id|
+-----+---+---------------------------+-------------+
|Alice| 10|                17179869184|            1|
|Susan| 12|                34359738368|            2|
+-----+---+---------------------------+-------------+

+-----+---+-------------+
| Name|Age|increasing_id|
+-----+---+-------------+
|Alice| 10|            1|
|Susan| 12|            2|
+-----+---+-------------+



In [14]:
# from pyspark.sql import SparkSession
# Spark app setup
from pyspark.sql import SparkSession
# Create the SparkSession, or reuse the one already active in this kernel.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Five single-column rows; "id" is a string column.
data = [["1"], ["2"], ["3"], ["4"], ["5"]]
df = spark.createDataFrame(data, ["id"])

# Wildcard import kept as-is: later cells may rely on the names it provides.
from pyspark.sql.functions import *

# current_date() & current_timestamp()
# First half: event_timestamp lies in the past, i.e. now - 2 months - (10 s * id).
df = (
    df
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
    .withColumn(
        "event_timestamp",
        current_timestamp() - expr("INTERVAL 2 Month") - expr("INTERVAL 10 seconds") * col("id"),
    )
)

# Round-trip through pandas to get an independent copy with the same schema.
# The three derived columns are overwritten below, so only "id" carries over.
schema = df.schema
df_pd = df.toPandas()
df1 = spark.createDataFrame(df_pd, schema=schema)
del df_pd

# Second half: event_timestamp lies in the future, i.e. now + 2 years + (10 s * id).
# Parenthesised chaining replaces the backslash continuations; the last line
# used to end in a dangling "\" that only parsed because a blank line followed.
df1 = (
    df1
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
    .withColumn(
        "event_timestamp",
        current_timestamp() + expr("INTERVAL 2 Years") + expr("INTERVAL 10 seconds") * col("id"),
    )
)

df.union(df1).show(truncate=False)

+---+------------+-----------------------+-----------------------+
|id |current_date|current_timestamp      |event_timestamp        |
+---+------------+-----------------------+-----------------------+
|1  |2022-12-07  |2022-12-07 05:57:56.482|2022-10-07 05:57:46.482|
|2  |2022-12-07  |2022-12-07 05:57:56.482|2022-10-07 05:57:36.482|
|3  |2022-12-07  |2022-12-07 05:57:56.482|2022-10-07 05:57:26.482|
|4  |2022-12-07  |2022-12-07 05:57:56.482|2022-10-07 05:57:16.482|
|5  |2022-12-07  |2022-12-07 05:57:56.482|2022-10-07 05:57:06.482|
|1  |2022-12-07  |2022-12-07 05:57:56.482|2024-12-07 05:58:06.482|
|2  |2022-12-07  |2022-12-07 05:57:56.482|2024-12-07 05:58:16.482|
|3  |2022-12-07  |2022-12-07 05:57:56.482|2024-12-07 05:58:26.482|
|4  |2022-12-07  |2022-12-07 05:57:56.482|2024-12-07 05:58:36.482|
|5  |2022-12-07  |2022-12-07 05:57:56.482|2024-12-07 05:58:46.482|
+---+------------+-----------------------+-----------------------+



In [None]:
# NOTE(review): this cell expects df3 to contain the home-credit columns
# selected below; the df3 built in the sample-code cell only has Name, Age and
# increasing_id - confirm which frame is meant before running.
df3.select(
    "index",
    "sk_id_curr",
    "increasing_id",
    "target",
    "code_gender",
    "amt_income_total",
).show()

# Expose the frame to Spark SQL under the name "homecredit".
df3.createOrReplaceTempView("homecredit")

# event_timestamp = current timestamp minus one minute per unit of `index`.
query_string = (
    "select index, sk_id_curr, target, code_gender, amt_income_total, "
    "cast(current_timestamp as TIMESTAMP) - (INTERVAL 1 minutes) * index as event_timestamp "
    "from homecredit"
)

spark.sql(query_string).show()

In [None]:
%%bash
pip install -r /home/cdsw/requirements.txt

In [9]:
# Registry of the processed CSV files: for each split ("train" / "test") the
# file path and the store name associated with it.
my_dict = dict(
    train=dict(
        path="/home/cdsw/02_RAW_TRAINING_DATA/processed_data.csv",
        store_name="homecredit_processed_data_train",
    ),
    test=dict(
        path="/home/cdsw/02_PREDICTION_DATA/processed_data.csv",
        store_name="homecredit_processed_data_test",
    ),
)

In [11]:
# Display the path registered for the "train" entry of my_dict.
my_dict["train"]["path"]

'/home/cdsw/02_RAW_TRAINING_DATA/processed_data.csv'

In [1]:
# Dynamic dictionary: same layout as the static one below, but each path can
# be overridden through an environment variable.

import os

# Static defaults: path of the processed CSV and the kind of data it holds.
datasource_dict = {
    "train": {
        "path": "/home/cdsw/04_RAW_TRAINING_DATA/processed_data.csv",
        "sourcedatatype": "train",
    },
    "test": {
        "path": "/home/cdsw/02_PREDICTION_DATA/processed_data.csv",
        "sourcedatatype": "test",
    },
}

# Environment-driven variant. DATASOURCE_PATH_TRAIN / DATASOURCE_PATH_TEST take
# precedence when they are set; otherwise the static default path is used, so
# the cell also runs on a fresh kernel instead of failing with a KeyError.
datasource_dict1 = {
    "train": {
        "path": os.environ.get("DATASOURCE_PATH_TRAIN", datasource_dict["train"]["path"]),
        "sourcedatatype": "train",
    },
    "test": {
        "path": os.environ.get("DATASOURCE_PATH_TEST", datasource_dict["test"]["path"]),
        "sourcedatatype": "test",
    },
}



{'train': {'path': '/home/cdsw/04_RAW_TRAINING_DATA/processed_data.csv', 'sourcedatatype': 'train'}, 'test': {'path': '/home/cdsw/02_PREDICTION_DATA/processed_data.csv', 'sourcedatatype': 'test'}}
