In [None]:
!pip install numpy
!pip install pandas
!wget https://github.com/databrickslabs/dbldatagen/archive/refs/tags/v.0.2.0-rc1-master.zip
!pip install v.0.2.0-rc1-master.zip

In [None]:
import os

# Per-user settings -- please adjust before running.
# The S3 credentials are read from the environment when available, so real keys do
# not have to be typed into (and saved with) the notebook. The string literals are
# only fallbacks used when the environment variables are not set.
access_key = os.environ.get("AWS_ACCESS_KEY_ID", "access_key")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "secret_key")
own_folder = "tim"        # folder inside the bucket used for this user's own test table
shared_folder = "shared"  # folder inside the bucket used by the concurrent tests

In [None]:
import time
import os

# Set before the Spark session is created so that the JVM launch picks them up.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
# Extra JVM packages: Delta Lake core and the Hadoop S3A filesystem connector.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "io.delta:delta-core_2.12:1.1.0,org.apache.hadoop:hadoop-aws:3.3.1" pyspark-shell'

import pyspark
from delta import configure_spark_with_delta_pip

namespace = os.environ["NAMESPACE"] # usually "firstname-lastname"
notebook_name = os.environ["NOTEBOOK_NAME"] # might be helpful

builder = (
    pyspark.sql.SparkSession.builder.appName(f"{namespace}-spark-app")
    # Register Delta Lake's SQL extension and catalog, as the Delta Lake setup
    # instructions require; catalog operations such as saveAsTable rely on them.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # S3A connection settings: plain-HTTP endpoint addressed path-style,
    # authenticated with the keys from the configuration cell.
    .config("spark.hadoop.fs.s3a.endpoint", "miniotimrelease.miniotim:9000")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", access_key)
    .config("spark.hadoop.fs.s3a.secret.key", secret_key)
    # Executor sizing.
    .config("spark.executor.instances", "1") # number of Executors
    .config("spark.executor.memory", "8g") # Executor memory
    .config("spark.executor.cores", "1") # Executor cores
)

# Let the delta-spark helper finish configuring the builder, then start
# (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Smoke test (write): store a five-row Delta table under this user's own folder.
smoke_table_path = f"s3a://deltabucket/{own_folder}/delta-table-bench"
smoke_writer = spark.range(5).write.format("delta").mode("overwrite")
smoke_writer.save(smoke_table_path)


In [None]:
# Smoke test (read): load the Delta table from this user's own folder and print it.
smoke_df = spark.read.format("delta").load(f"s3a://deltabucket/{own_folder}/delta-table-bench")
smoke_df.show()


In [None]:
##concurrent test #1 =================================================================================

In [None]:
%%time

import math
import pyspark.sql.functions as F

number_of_rows = 100000000
num_digits = int(math.log10(number_of_rows)) + 1

df = spark.range(1, number_of_rows+1)
df = df.withColumn("value", F.lit(own_folder))
df = df.withColumn("value", F.concat_ws("", F.col("value"), F.format_string(f"%0{num_digits}d", F.col("id"))))
df.write.format("delta").mode("overwrite").save(f"s3a://deltabucket/{shared_folder}/thousand")
df.show()
df.count()

In [None]:
# Sanity check: load the Delta table from the shared folder and preview its rows.
shared_table_path = f"s3a://deltabucket/{shared_folder}/thousand"
read = spark.read.format("delta").load(shared_table_path)
read.show()


In [None]:
# Sanity check: number of rows in the DataFrame loaded from the shared folder.
shared_row_count = read.count()
shared_row_count

In [None]:
import dbldatagen as dg

# Column layout of the synthetic table: four string columns.
schema = dg.SchemaParser.parseCreateTable(spark, """
    create table Test1 (
    source string ,
    language string ,
    topic string ,
    license string )
""")

data_rows = 4 * 10**9  # four billion rows

# Candidate values per column, in the order the column specs are applied.
column_values = {
    "source": ["hackernews", "cc", "wikipedia", "academic", "books", "pubmed", "opensubtitiles", "youtubesubtitles"],
    "language": ["en", "de", "fr", "es", "ru"],
    "topic": ["software", "medical", "cultural", "academic", "hardware", "ai", "ml", "random"],
    "license": ["MIT", "GPL-v2", "GPL-v3", "private", "apache", "cc"],
}

# Generator spec: the schema above plus the id column, split over 20 partitions.
x3 = dg.DataGenerator(sparkSession=spark, name="test_table_query", rows=data_rows, partitions=20)
x3 = x3.withSchema(schema).withIdOutput()
for column_name, allowed_values in column_values.items():
    x3 = x3.withColumnSpec(column_name, values=allowed_values, random=True)

# Build the DataFrame described by the spec.
x3_output_full = x3.build()

In [None]:
##concurrent test #2 =================================================================================

In [None]:
%%time

start = time.monotonic_ns()
#x3_output_full.write.format("delta").mode("overwrite").saveAsTable("test_data")
x3_output_full.write.format("delta").mode("overwrite").saveAsTable("test_data", path='s3a://deltabucket/shared/delta-table-bench')
print("Time elapsed : ", (time.monotonic_ns() - start)/10**9, "s")


In [None]:
# Sanity check: look the table up by name and count its rows.
data_table = spark.table("test_data")
table_row_count = data_table.count()
table_row_count