In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("column")
    .getOrCreate()
)

In [3]:
# Load the orders CSV into a Spark DataFrame.
# The file location can be overridden with the ORDERS_CSV_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used unchanged.
import os

orders_csv_path = os.environ.get(
    "ORDERS_CSV_PATH",
    r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\orders1.csv",
)

orders_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")  # ask Spark to derive column types from the data
    .option("header", "true")       # the first row holds the column names
    .load(orders_csv_path)
)

In [4]:
# Preview the first three rows to check the columns that were loaded.
orders_df.show(n=3)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 3 rows



In [5]:
# Column reference style: plain string name passed to select().
(
    orders_df
    .select("order_id")
    .show(n=2)
)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [7]:
# Column reference style: attribute access on the DataFrame.
order_id_attr = orders_df.order_id
orders_df.select(order_id_attr).show(n=2)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [8]:
# Column reference style: bracket (item) lookup on the DataFrame.
order_id_item = orders_df['order_id']
orders_df.select(order_id_item).show(n=2)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [11]:
from pyspark.sql.functions import *

In [14]:
# Column reference style: expr() with a SQL expression string; here the
# expression is just the column name.
order_id_expr = expr("order_id")
orders_df.select(order_id_expr).show(n=2)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [15]:
# selectExpr() takes the expression string directly, without wrapping it in expr().
(
    orders_df
    .selectExpr("order_id")
    .show(n=2)
)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [16]:
# Column reference style: the col() function from pyspark.sql.functions.
order_id_via_col = col("order_id")
orders_df.select(order_id_via_col).show(n=2)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [17]:
# Column reference style: the column() function; the output matches the col() cell.
order_id_via_column = column("order_id")
orders_df.select(order_id_via_column).show(n=2)

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+
only showing top 2 rows



In [19]:
# Different column reference styles can be combined in one select() call.
mixed_refs = [
    'order_id',                  # plain string name
    orders_df['order_date'],     # bracket lookup on the DataFrame
    expr("order_customer_id"),   # SQL expression string
    col("order_status"),         # col() function
]
orders_df.select(*mixed_refs).show(n=3)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 3 rows



In [22]:
# Build a derived column with arithmetic on a col() reference and name it via alias().
incremented_id_via_col = (col('order_id') + 1).alias('new_id')
orders_df.select('order_id', incremented_id_via_col).show(n=2)

+--------+------+
|order_id|new_id|
+--------+------+
|       1|     2|
|       2|     3|
+--------+------+
only showing top 2 rows



In [25]:
# Same derived column as the col() version, built from a column() reference instead.
incremented_id_via_column = (column('order_id') + 1).alias('new_id')
orders_df.select('order_id', incremented_id_via_column).show(n=2)

+--------+------+
|order_id|new_id|
+--------+------+
|       1|     2|
|       2|     3|
+--------+------+
only showing top 2 rows



In [24]:
# Same derived column again, with the arithmetic and the alias written as one
# SQL expression string.
incremented_id_via_expr = expr("order_id + 1  as new_id")
orders_df.select("order_id", incremented_id_via_expr).show(n=2)

+--------+------+
|order_id|new_id|
+--------+------+
|       1|     2|
|       2|     3|
+--------+------+
only showing top 2 rows



In [26]:
# Convert the Spark DataFrame into a pandas DataFrame for the pandas-style
# column selection examples that follow.
pands_df = orders_df.toPandas()

  series = series.astype(t, copy=False)


In [28]:
# Preview the first three rows of the pandas copy.
pands_df.head(n=3)

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25,11599,CLOSED
1,2,2013-07-25,256,PENDING_PAYMENT
2,3,2013-07-25,12111,COMPLETE


In [29]:
# pandas column access by attribute; the result is a Series.
pands_df.order_id.head(n=2)

0    1
1    2
Name: order_id, dtype: int32

In [30]:
# pandas column access by bracket lookup; same Series as the attribute form.
pands_df['order_id'].head(n=2)

0    1
1    2
Name: order_id, dtype: int32

In [31]:
# Passing a list of names selects several columns and returns a DataFrame.
selected_columns = ['order_id', "order_status"]
pands_df[selected_columns].head(n=2)

Unnamed: 0,order_id,order_status
0,1,CLOSED
1,2,PENDING_PAYMENT


In [32]:
# Register the DataFrame as the temp view "orders" so it can be queried with spark.sql().
orders_df.createOrReplaceTempView(name="orders")

In [34]:
# Query the temp view with SQL: all columns, limited to two rows.
first_two_orders = spark.sql("select * from orders limit 2")
first_two_orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+



In [35]:
# Query the temp view with SQL: only order_id, limited to two rows.
first_two_order_ids = spark.sql("select order_id from orders limit 2")
first_two_order_ids.show()

+--------+
|order_id|
+--------+
|       1|
|       2|
+--------+



In [39]:
# Select columns using several reference styles (order_date and
# order_customer_id are each selected twice, so they appear twice in the
# output), then keep rows whose status starts with PENDING using a SQL
# predicate string.
pending_by_sql_string = orders_df.select(
    "order_id",
    orders_df.order_date,
    orders_df['order_date'],
    column('order_customer_id'),
    col('order_customer_id'),
    expr("order_status"),
).where("order_status like 'PENDING%'")
pending_by_sql_string.show()

+--------+-------------------+-------------------+-----------------+-----------------+---------------+
|order_id|         order_date|         order_date|order_customer_id|order_customer_id|   order_status|
+--------+-------------------+-------------------+-----------------+-----------------+---------------+
|       2|2013-07-25 00:00:00|2013-07-25 00:00:00|              256|              256|PENDING_PAYMENT|
|       9|2013-07-25 00:00:00|2013-07-25 00:00:00|             5657|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|2013-07-25 00:00:00|             5648|             5648|PENDING_PAYMENT|
|      13|2013-07-25 00:00:00|2013-07-25 00:00:00|             9149|             9149|PENDING_PAYMENT|
|      16|2013-07-25 00:00:00|2013-07-25 00:00:00|             7276|             7276|PENDING_PAYMENT|
|      19|2013-07-25 00:00:00|2013-07-25 00:00:00|             9488|             9488|PENDING_PAYMENT|
|      21|2013-07-25 00:00:00|2013-07-25 00:00:00|             2711|     

In [38]:
# Same selection as the SQL-string filter, but the PENDING-prefix predicate is
# built as a Column expression with like().
is_pending_status = col('order_status').like('PENDING%')
pending_by_column_expr = orders_df.select(
    "order_id",
    orders_df.order_date,
    orders_df['order_date'],
    column('order_customer_id'),
    col('order_customer_id'),
    expr("order_status"),
).where(is_pending_status)
pending_by_column_expr.show()

+--------+-------------------+-------------------+-----------------+-----------------+---------------+
|order_id|         order_date|         order_date|order_customer_id|order_customer_id|   order_status|
+--------+-------------------+-------------------+-----------------+-----------------+---------------+
|       2|2013-07-25 00:00:00|2013-07-25 00:00:00|              256|              256|PENDING_PAYMENT|
|       9|2013-07-25 00:00:00|2013-07-25 00:00:00|             5657|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|2013-07-25 00:00:00|             5648|             5648|PENDING_PAYMENT|
|      13|2013-07-25 00:00:00|2013-07-25 00:00:00|             9149|             9149|PENDING_PAYMENT|
|      16|2013-07-25 00:00:00|2013-07-25 00:00:00|             7276|             7276|PENDING_PAYMENT|
|      19|2013-07-25 00:00:00|2013-07-25 00:00:00|             9488|             9488|PENDING_PAYMENT|
|      21|2013-07-25 00:00:00|2013-07-25 00:00:00|             2711|     

In [40]:
# Select three columns by name and keep only rows whose status starts with PENDING.
status_starts_with_pending = col('order_status').like('PENDING%')
pending_orders = orders_df.select("order_id", 'order_customer_id', "order_status")
pending_orders.where(status_starts_with_pending).show()

+--------+-----------------+---------------+
|order_id|order_customer_id|   order_status|
+--------+-----------------+---------------+
|       2|              256|PENDING_PAYMENT|
|       9|             5657|PENDING_PAYMENT|
|      10|             5648|PENDING_PAYMENT|
|      13|             9149|PENDING_PAYMENT|
|      16|             7276|PENDING_PAYMENT|
|      19|             9488|PENDING_PAYMENT|
|      21|             2711|        PENDING|
|      23|             4367|PENDING_PAYMENT|
|      27|             3241|PENDING_PAYMENT|
|      30|            10039|PENDING_PAYMENT|
|      33|             5793|PENDING_PAYMENT|
|      36|             5649|        PENDING|
|      39|             8214|        PENDING|
|      40|            12092|PENDING_PAYMENT|
|      41|             8136|PENDING_PAYMENT|
|      42|             9776|        PENDING|
|      43|             7776|PENDING_PAYMENT|
|      44|            10500|        PENDING|
|      47|             8487|PENDING_PAYMENT|
|      49|

In [41]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()