In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
import datetime
# Month number taken from the local clock at the moment this cell runs.
# The S3 paths below are built from it through the "month-{}" placeholder,
# so the value is not zero-padded and carries no year.
month = datetime.datetime.now().month

In [0]:
@dlt.table
def df_orders():
    """Stream this month's orders CSV files and cast the numeric columns.

    Files are read with the ``cloudFiles`` streaming source from
    ``s3://yadi-pipeline/month-<month>/orders`` (comma separated, with a
    header row).  The id/counter columns are cast to integers and
    ``days_since_prior_order`` to double; any other columns pass through
    unchanged.

    NOTE(review): the Auto Loader documentation points to
    ``cloudFiles.inferColumnTypes`` rather than ``inferSchema`` for type
    inference; the explicit casts below do not depend on either.
    """
    source_path = "s3://yadi-pipeline/month-{}/orders".format(month)
    reader_options = {
        "cloudFiles.format": "csv",
        "inferSchema": "true",
        "header": "true",
        "sep": ",",
    }
    raw = (
        spark.readStream.format("cloudFiles")
        .options(**reader_options)
        .load(source_path)
    )

    integer_columns = (
        "order_id",
        "user_id",
        "order_number",
        "order_dow",
        "order_hour_of_day",
    )
    typed = raw
    for name in integer_columns:
        typed = typed.withColumn(name, col(name).cast(IntegerType()))

    return typed.withColumn(
        "days_since_prior_order", col("days_since_prior_order").cast(DoubleType())
    )


# Declare the `silver_order` table that the apply_changes() call below targets.
# NOTE(review): current Databricks docs list create_target_table as deprecated
# in favour of create_streaming_table — consider migrating.
dlt.create_target_table("silver_order")

# Feed rows from the `df_orders` table into `silver_order`, matching on `order_id`.
# NOTE(review): sequence_by is the key column itself, so two versions of the same
# order_id carry the same sequence value — confirm the source never re-delivers
# a changed order.
dlt.apply_changes(
    target="silver_order",
    source="df_orders",
    keys=["order_id"],
    sequence_by=col("order_id"),
)


@dlt.table
def df_products():
    """Stream this month's products CSV files and cast the id columns to integers.

    Files are read with the ``cloudFiles`` streaming source from
    ``s3://yadi-pipeline/month-<month>/products`` (comma separated, with a
    header row).  ``product_id``, ``aisle_id`` and ``department_id`` are cast
    to integers; any other columns pass through unchanged.

    NOTE(review): the Auto Loader documentation points to
    ``cloudFiles.inferColumnTypes`` rather than ``inferSchema`` for type
    inference; the explicit casts below do not depend on either.
    """
    source_path = "s3://yadi-pipeline/month-{}/products".format(month)
    reader_options = {
        "cloudFiles.format": "csv",
        "inferSchema": "true",
        "header": "true",
        "sep": ",",
    }
    typed = (
        spark.readStream.format("cloudFiles")
        .options(**reader_options)
        .load(source_path)
    )

    for name in ("product_id", "aisle_id", "department_id"):
        typed = typed.withColumn(name, col(name).cast(IntegerType()))
    return typed


# Declare the `silver_products` table that the apply_changes() call below targets.
# NOTE(review): current Databricks docs list create_target_table as deprecated
# in favour of create_streaming_table — consider migrating.
dlt.create_target_table("silver_products")

# Feed rows from the `df_products` table into `silver_products`, matching on
# `product_id`.
# NOTE(review): sequence_by is the key column itself, so two versions of the same
# product_id carry the same sequence value — confirm that is acceptable.
dlt.apply_changes(
    target="silver_products",
    source="df_products",
    keys=["product_id"],
    sequence_by=col("product_id"),
)


@dlt.table
def df_departments():
    """Stream this month's departments CSV files and cast ``department_id`` to integer.

    Files are read with the ``cloudFiles`` streaming source from
    ``s3://yadi-pipeline/month-<month>/departments`` (comma separated, with a
    header row).  Any other columns pass through unchanged.

    NOTE(review): the Auto Loader documentation points to
    ``cloudFiles.inferColumnTypes`` rather than ``inferSchema`` for type
    inference; the explicit cast below does not depend on either.
    """
    source_path = "s3://yadi-pipeline/month-{}/departments".format(month)
    reader_options = {
        "cloudFiles.format": "csv",
        "inferSchema": "true",
        "header": "true",
        "sep": ",",
    }
    raw = (
        spark.readStream.format("cloudFiles")
        .options(**reader_options)
        .load(source_path)
    )
    department_id = col("department_id").cast(IntegerType())
    return raw.withColumn("department_id", department_id)


# Declare the `silver_departments` table that the apply_changes() call below targets.
# NOTE(review): current Databricks docs list create_target_table as deprecated
# in favour of create_streaming_table — consider migrating.
dlt.create_target_table("silver_departments")

# Feed rows from the `df_departments` table into `silver_departments`, matching
# on `department_id`.
# NOTE(review): sequence_by is the key column itself, so two versions of the same
# department_id carry the same sequence value — confirm that is acceptable.
dlt.apply_changes(
    target="silver_departments",
    source="df_departments",
    keys=["department_id"],
    sequence_by=col("department_id"),
)


@dlt.table
def df_aisles():
    """Stream this month's aisles CSV files and cast ``aisle_id`` to integer.

    Files are read with the ``cloudFiles`` streaming source from
    ``s3://yadi-pipeline/month-<month>/aisles`` (comma separated, with a
    header row).  Any other columns pass through unchanged.

    NOTE(review): the Auto Loader documentation points to
    ``cloudFiles.inferColumnTypes`` rather than ``inferSchema`` for type
    inference; the explicit cast below does not depend on either.
    """
    source_path = "s3://yadi-pipeline/month-{}/aisles".format(month)
    reader_options = {
        "cloudFiles.format": "csv",
        "inferSchema": "true",
        "header": "true",
        "sep": ",",
    }
    raw = (
        spark.readStream.format("cloudFiles")
        .options(**reader_options)
        .load(source_path)
    )
    aisle_id = col("aisle_id").cast(IntegerType())
    return raw.withColumn("aisle_id", aisle_id)


# Declare the `silver_aisles` table that the apply_changes() call below targets.
# NOTE(review): current Databricks docs list create_target_table as deprecated
# in favour of create_streaming_table — consider migrating.
dlt.create_target_table("silver_aisles")

# Feed rows from the `df_aisles` table into `silver_aisles`, matching on `aisle_id`.
# NOTE(review): sequence_by is the key column itself, so two versions of the same
# aisle_id carry the same sequence value — confirm that is acceptable.
dlt.apply_changes(
    target="silver_aisles",
    source="df_aisles",
    keys=["aisle_id"],
    sequence_by=col("aisle_id"),
)


@dlt.table
def df_prior():
    """Stream this month's ``order_products__prior`` CSV files with integer columns.

    Files are read with the ``cloudFiles`` streaming source from
    ``s3://yadi-pipeline/month-<month>/order_products__prior`` (comma
    separated, with a header row).  ``order_id``, ``product_id``,
    ``add_to_cart_order`` and ``reordered`` are cast to integers; any other
    columns pass through unchanged.

    NOTE(review): the Auto Loader documentation points to
    ``cloudFiles.inferColumnTypes`` rather than ``inferSchema`` for type
    inference; the explicit casts below do not depend on either.
    """
    source_path = "s3://yadi-pipeline/month-{}/order_products__prior".format(month)
    reader_options = {
        "cloudFiles.format": "csv",
        "inferSchema": "true",
        "header": "true",
        "sep": ",",
    }
    typed = (
        spark.readStream.format("cloudFiles")
        .options(**reader_options)
        .load(source_path)
    )

    for name in ("order_id", "product_id", "add_to_cart_order", "reordered"):
        typed = typed.withColumn(name, col(name).cast(IntegerType()))
    return typed


# Declare the `silver_prior` table that the apply_changes() call below targets.
# NOTE(review): current Databricks docs list create_target_table as deprecated
# in favour of create_streaming_table — consider migrating.
dlt.create_target_table("silver_prior")

# Feed rows from the `df_prior` table into `silver_prior`, matching on the
# (order_id, product_id) pair.
# NOTE(review): sequence_by (`order_id`) is part of the key, so two versions of
# the same key carry the same sequence value — confirm that is acceptable.
dlt.apply_changes(
    target="silver_prior",
    source="df_prior",
    keys=["order_id", "product_id"],
    sequence_by=col("order_id"),
)


@dlt.table
def df_train():
    """Stream this month's ``order_products__train`` CSV files with integer columns.

    Files are read with the ``cloudFiles`` streaming source from
    ``s3://yadi-pipeline/month-<month>/order_products__train`` (comma
    separated, with a header row).  ``order_id``, ``product_id``,
    ``add_to_cart_order`` and ``reordered`` are cast to integers; any other
    columns pass through unchanged.

    NOTE(review): the Auto Loader documentation points to
    ``cloudFiles.inferColumnTypes`` rather than ``inferSchema`` for type
    inference; the explicit casts below do not depend on either.
    """
    source_path = "s3://yadi-pipeline/month-{}/order_products__train".format(month)
    reader_options = {
        "cloudFiles.format": "csv",
        "inferSchema": "true",
        "header": "true",
        "sep": ",",
    }
    typed = (
        spark.readStream.format("cloudFiles")
        .options(**reader_options)
        .load(source_path)
    )

    for name in ("order_id", "product_id", "add_to_cart_order", "reordered"):
        typed = typed.withColumn(name, col(name).cast(IntegerType()))
    return typed


# Declare the `silver_train` table that the apply_changes() call below targets.
# NOTE(review): current Databricks docs list create_target_table as deprecated
# in favour of create_streaming_table — consider migrating.
dlt.create_target_table("silver_train")

# Feed rows from the `df_train` table into `silver_train`, matching on the
# (order_id, product_id) pair.
# NOTE(review): sequence_by (`order_id`) is part of the key, so two versions of
# the same key carry the same sequence value — confirm that is acceptable.
dlt.apply_changes(
    target="silver_train",
    source="df_train",
    keys=["order_id", "product_id"],
    sequence_by=col("order_id"),
)


@dlt.table
def df_pad():
    """Join products with their aisle and department.

    Reads ``LIVE.silver_products``, ``LIVE.silver_aisles`` and
    ``LIVE.silver_departments``, matches them on ``aisle_id`` and
    ``department_id``, and renames the ``aisle`` / ``department`` columns to
    ``aisle_name`` / ``department_name``.
    """
    query = (
        "select p.product_id, p.product_name, p.aisle_id, a.aisle, p.department_id, d.department from LIVE.silver_products p, LIVE.silver_aisles a, LIVE.silver_departments d "
        "where p.aisle_id == a.aisle_id and p.department_id == d.department_id "
        "order by p.product_id"
    )
    joined = spark.sql(query)

    # Applied in the same order as before: aisle first, then department.
    renames = (("aisle", "aisle_name"), ("department", "department_name"))
    for old_name, new_name in renames:
        joined = joined.withColumnRenamed(old_name, new_name)
    return joined


# @dlt.table
# def df_pads():
#     return dlt.read("df_pad")

# dlt.create_streaming_live_table("silver_pad")

# dlt.apply_changes(
#   target = "silver_pad",
#   source = "df_pads",
#   key = ["product_id"],
#   sequence_by = col("product_id")
# )


@dlt.table
def df_op():
    """Join orders with their prior-order products.

    Inner-joins ``LIVE.silver_order`` to ``LIVE.silver_prior`` on
    ``order_id`` and keeps only orders whose ``eval_set`` is ``'prior'``.
    """
    query = (
        "SELECT a.order_id, a.user_id, a.eval_set, a.order_number, a.order_dow, a.order_hour_of_day, a.days_since_prior_order, b.product_id, b.add_to_cart_order, b.reordered "
        "FROM LIVE.silver_order a JOIN LIVE.silver_prior b ON a.order_id = b.order_id "
        "WHERE a.eval_set = 'prior'"
    )
    return spark.sql(query)


# @dlt.table
# def df_ops():
#     return dlt.read("df_op")

# dlt.create_streaming_live_table("silver_op")

# dlt.apply_changes(
#     target = "silver_op",
#     source = "df_ops",
#     key = ["order_id", "product_id"],
#     sequence_by = col("order_id")
# )


@dlt.table
def df_opp():
    """Attach user and product details to every prior order line.

    Joins ``LIVE.silver_prior`` to ``LIVE.silver_order`` on ``order_id`` and
    to ``LIVE.df_pad`` on ``product_id``; the query orders its result by
    ``order_id``.
    """
    query = (
        "select p.order_id, o.user_id, p.product_id, a.product_name, a.aisle_id, a.aisle_name, a.department_id, a.department_name, p.add_to_cart_order, p.reordered from LIVE.silver_prior p, LIVE.silver_order o, LIVE.df_pad a "
        "where p.order_id == o.order_id and p.product_id == a.product_id "
        "order by p.order_id"
    )
    return spark.sql(query)


# @dlt.table
# def df_opps():
#     return dlt.read("df_opp")

# dlt.create_streaming_live_table("silver_opp")

# dlt.apply_changes(
#     target = "silver_opp",
#     source = "df_opps",
#     key = ["order_id", "product_id"],
#     sequence_by = col("order_id")
# )


@dlt.table
def df_otp():
    """Attach user and product details to every train order line.

    Joins ``LIVE.silver_train`` to ``LIVE.silver_order`` on ``order_id`` and
    to ``LIVE.df_pad`` on ``product_id``; the query orders its result by
    ``order_id``.
    """
    query = (
        "select t.order_id, o.user_id, t.product_id, a.product_name, a.aisle_id, a.aisle_name, a.department_id, a.department_name, t.add_to_cart_order, t.reordered from LIVE.silver_train t, LIVE.silver_order o, LIVE.df_pad a "
        "where t.order_id == o.order_id and t.product_id == a.product_id "
        "order by t.order_id"
    )
    return spark.sql(query)


# @dlt.table
# def df_otps():
#     return dlt.read("df_otp")

# dlt.create_streaming_live_table("silver_otp")

# dlt.apply_changes(
#     target = "silver_otp",
#     source = "df_otp",
#     key = ["order_id", "product_id"],
#     sequence_by = col("order_id")
# )


@dlt.table
def user_features_1():
    """Aggregate per-user order statistics from ``LIVE.silver_order``.

    Output columns, one row per ``user_id``:
      * ``user_orders`` - maximum ``order_number``
      * ``user_period`` - sum of ``days_since_prior_order``
      * ``user_mean_days_since_prior`` - average of ``days_since_prior_order``
    """
    query = (
        "SELECT user_id, Max(order_number) AS user_orders, Sum(days_since_prior_order) AS user_period, Avg(days_since_prior_order) AS user_mean_days_since_prior "
        "FROM LIVE.silver_order GROUP BY user_id"
    )
    return spark.sql(query)


@dlt.table
def user_features_2():
    """Aggregate per-user product statistics from ``LIVE.df_op``.

    Output columns, one row per ``user_id``:
      * ``user_total_products`` - number of rows for the user
      * ``user_distinct_products`` - number of distinct ``product_id`` values
      * ``user_reorder_ratio`` - rows with ``reordered = 1`` divided by rows
        with ``order_number > 1``; NULL when the user has no row with
        ``order_number > 1``
    """
    # NULLIF turns a zero denominator into NULL so the division yields NULL
    # instead of failing with a divide-by-zero error when ANSI mode is on.
    # With ANSI mode off the result is the same NULL the plain division gave.
    query = """SELECT user_id,
 Count(*) AS user_total_products,
 Count(DISTINCT product_id) AS user_distinct_products ,
 Sum(CASE WHEN reordered = 1 THEN 1 ELSE 0 END) / NULLIF(Cast(Sum(CASE WHEN
order_number > 1 THEN 1 ELSE 0 END) AS DOUBLE), 0) AS user_reorder_ratio
FROM LIVE.df_op
GROUP BY user_id """
    return spark.sql(query)


@dlt.table
def up_features():
    """Aggregate per-(user, product) statistics from ``LIVE.df_op``.

    Output columns, one row per ``user_id`` / ``product_id`` pair:
      * ``up_orders`` - number of rows for the pair
      * ``up_first_order`` / ``up_last_order`` - min / max ``order_number``
      * ``up_average_cart_position`` - average ``add_to_cart_order``
    """
    query = (
        "SELECT user_id, product_id, Count(*) AS up_orders, Min(order_number) AS up_first_order, Max(order_number) AS up_last_order, Avg(add_to_cart_order) AS up_average_cart_position "
        "FROM LIVE.df_op GROUP BY user_id, product_id"
    )
    return spark.sql(query)


@dlt.table
def prd_features():
    """Aggregate per-product statistics from ``LIVE.df_op``.

    The inner query ranks each user's rows for a product by ``order_number``
    (``product_seq_time``).  Output columns, one row per ``product_id``:
      * ``prod_orders`` - number of rows
      * ``prod_reorders`` - sum of ``reordered``
      * ``prod_first_orders`` - rows where ``product_seq_time = 1``
      * ``prod_second_orders`` - rows where ``product_seq_time = 2``
    """
    query = """SELECT product_id,
 Count(*) AS prod_orders,
 Sum(reordered) AS prod_reorders,
 Sum(CASE WHEN product_seq_time = 1 THEN 1 ELSE 0 END) AS prod_first_orders,
 Sum(CASE WHEN product_seq_time = 2 THEN 1 ELSE 0 END) AS prod_second_orders
FROM (SELECT *,
 Rank()
 OVER (
 partition BY user_id, product_id
 ORDER BY order_number) AS product_seq_time
 FROM LIVE.df_op)
GROUP BY product_id """
    return spark.sql(query)

@dlt.table
def user_features():
    """Combine the two per-user feature tables into one.

    Joins ``LIVE.user_features_1`` and ``LIVE.user_features_2`` on
    ``user_id``; the query orders its result by ``user_id``.
    """
    query = """SELECT f.user_id, 
                                f.user_orders, 
                                f.user_period, 
                                f.user_mean_days_since_prior, 
                                e.user_total_products, 
                                e.user_distinct_products, 
                                e.user_reorder_ratio
                         FROM LIVE.user_features_1 f, 
                              LIVE.user_features_2 e 
                         WHERE f.user_id == e.user_id
                         ORDER BY f.user_id"""
    return spark.sql(query)
    
@dlt.table(path="s3://yadi-pipeline/month-{}/output".format(month))
def ml_input():
    """Assemble the final feature table, stored under the month's ``output`` path.

    Joins ``LIVE.up_features`` to ``LIVE.user_features`` on ``user_id`` and
    to ``LIVE.prd_features`` on ``product_id``; the query orders its result
    by ``product_id``.
    """
    query = """SELECT u.product_id, 
                                u.up_orders, 
                                f.user_mean_days_since_prior, 
                                f.user_period, 
                                f.user_distinct_products, 
                                p.prod_second_orders, 
                                p.prod_reorders, 
                                f.user_reorder_ratio, 
                                f.user_total_products, 
                                u.up_average_cart_position, 
                                u.up_first_order, 
                                f.user_orders, 
                                u.up_last_order, 
                                p.prod_orders, 
                                p.prod_first_orders, 
                                u.user_id 
    FROM LIVE.user_features f,  
         LIVE.prd_features p, 
         LIVE.up_features u 
    WHERE u.user_id == f.user_id
          and u.product_id == p.product_id 
    ORDER BY u.product_id"""
    return spark.sql(query)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<command-4030763951663131> in <cell line: 1>()
----> 1 @dlt.table
      2 def df_orders():
      3     df = (
      4         spark.readStream.format("cloudFiles")
      5         .option("cloudFiles.format", "csv")

NameError: name 'dlt' is not defined