In [48]:

# creating spark session
from pyspark.sql import SparkSession
import getpass

# The current OS user name determines the per-user Hive warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session running on YARN.
# "spark.ui.port" = "0" asks Spark to pick any free port for the UI.
spark = (
    SparkSession.builder
    .master('yarn')
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config("spark.ui.port", "0")
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [49]:
from pyspark.sql.functions import date_format # import date format function

In [50]:
# DDL-style schema for the orders file: one "name type" entry per column.
order_schema = ', '.join([
    'order_id long',
    'order_date string',
    'customer_id long',
    'order_status string',
])

In [51]:
# Load the orders CSV using the explicit schema instead of schema inference.
order_data = spark.read.load(
    '/rishabh/ivt008688/orders_sample.csv',
    format='csv',
    schema=order_schema,
)

In [52]:
# Preview the first 20 rows; long cell values are truncated for display.
order_data.show(n=20, truncate=True)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [53]:
order_data.printSchema()  # print the schema (column names, types, nullability) of the loaded DataFrame

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [54]:
# Preview the first 20 rows again, this time without truncating cell values,
# so the full order_date strings are visible.
order_data.show(n=20, truncate=False)

+--------+---------------------+-----------+---------------+
|order_id|order_date           |customer_id|order_status   |
+--------+---------------------+-----------+---------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED         |
|2       |2013-07-25 00:00:00.0|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827       |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318      |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130       |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530       |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911       |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657       |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648       |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918        |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837       |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149       |PENDING_PAYMENT|
|14      |2013-07-25 00:

### Date Transformation

In [55]:
# Replace order_date with a string rendered in the 'MM-dd-yyyy' pattern
# (e.g. '2013-07-25 00:00:00.0' is shown as '07-25-2013').
order_data_transformed = order_data.withColumn(
    colName='order_date',
    col=date_format(date='order_date', format='MM-dd-yyyy'),
)

In [56]:
# Preview the first 20 transformed rows to check the new order_date format.
order_data_transformed.show(n=20, truncate=True)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|07-25-2013|      11599|         CLOSED|
|       2|07-25-2013|        256|PENDING_PAYMENT|
|       3|07-25-2013|      12111|       COMPLETE|
|       4|07-25-2013|       8827|         CLOSED|
|       5|07-25-2013|      11318|       COMPLETE|
|       6|07-25-2013|       7130|       COMPLETE|
|       7|07-25-2013|       4530|       COMPLETE|
|       8|07-25-2013|       2911|     PROCESSING|
|       9|07-25-2013|       5657|PENDING_PAYMENT|
|      10|07-25-2013|       5648|PENDING_PAYMENT|
|      11|07-25-2013|        918| PAYMENT_REVIEW|
|      12|07-25-2013|       1837|         CLOSED|
|      13|07-25-2013|       9149|PENDING_PAYMENT|
|      14|07-25-2013|       9842|     PROCESSING|
|      15|07-25-2013|       2568|       COMPLETE|
|      16|07-25-2013|       7276|PENDING_PAYMENT|
|      17|07-25-2013|       2667|       COMPLETE|


# Save the data into a SQL Server or PostgreSQL database

In [None]:

# Name of the target table in the database.
table_name = "order_date_transformed"

# Placeholder JDBC URL for illustration only; replace host, port and database
# name with real values before running this cell.
jdbc_url = "jdbc:sqlserver://your_sql_server_host:your_sql_server_port;databaseName=your_database"

# Placeholder connection properties, since this Spark session is not connected
# to an actual server.
# NOTE: do not commit real credentials here; read them from environment
# variables or getpass.getpass() when pointing at a real database.
properties = {
    "user": "your_username",
    "password": "your_password",
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

# Write the transformed DataFrame to the database over JDBC, replacing the
# table if it already exists. Parentheses are used for line continuation; the
# earlier `write. \` + `.jdbc(...)` form produced `write. .jdbc`, a SyntaxError.
order_data_transformed.write.jdbc(
    url=jdbc_url,
    table=table_name,
    mode="overwrite",
    properties=properties,
)

## Save the output as a Parquet file.

In [57]:
# Persist the transformed DataFrame as Parquet, replacing any previous output
# at the same location. The chain is wrapped in parentheses rather than using
# backslash continuations: whitespace after a trailing backslash is a
# SyntaxError, which is easy to introduce by accident.
(
    order_data_transformed.write
    .format("parquet")
    .mode('overwrite')
    .save("practiceOutput2/order_data_transformed")  # output location
)