In [7]:
pip install findspark
import findspark
findspark.init()

In [8]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [9]:
import pandas as pd
from datetime import date, datetime
from pyspark.sql import Row

# Three sample records expressed as Row objects; no schema is passed, so Spark
# infers the column types from the Python values.
sample_rows = [
    Row(a=1, b=2.0, c="string1", d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3.0, c="string2", d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5.0, c="string3", d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(sample_rows)

# Show the inferred schema first, then the rows themselves.
df.printSchema()
df.show()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [10]:
from pyspark.sql.functions import current_timestamp, datediff

# Same kind of sample data as before, but built from plain tuples with an
# explicit DDL schema string instead of inferred types.
sample_schema = "a long, b double, c string, d date, e timestamp"
data = spark.createDataFrame(
    [
        (1, 2.0, "string1", date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3.0, "string2", date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 4.0, "string3", date(2000, 3, 1), datetime(2023, 2, 27, 12, 0)),
    ],
    schema=sample_schema,
)

# Keep rows whose `e` lies fewer than 7 days before the current timestamp.
# NOTE: the comparison uses the time at which the cell is executed, so the
# rows displayed depend on the run date.
days_since_e = datediff(current_timestamp(), data["e"])
data.where(days_since_e < 7).show()




+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  3|4.0|string3|2000-03-01|2023-02-27 12:00:00|
+---+---+-------+----------+-------------------+



In [11]:
# Six sample records laid out as (id, type, amt, code), one tuple per line.
sample_records = [
    (100, "DEBIT", 1000.0, "IND"),
    (101, "CREDIT", 2000.0, "IND"),
    (102, "DEBIT", 3000.0, "AUS"),
    (103, "CREDIT", 4000.0, "JPN"),
    (104, "DEBIT", 5000.0, "IND"),
    (105, "CREDIT", 6000.0, "AUS"),
]

# Rebind `df` to a DataFrame with an explicit schema given as a DDL string.
df = spark.createDataFrame(
    sample_records,
    schema="id int, type string, amt float, code string",
)

df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amt: float (nullable = true)
 |-- code: string (nullable = true)

+---+------+------+----+
| id|  type|   amt|code|
+---+------+------+----+
|100| DEBIT|1000.0| IND|
|101|CREDIT|2000.0| IND|
|102| DEBIT|3000.0| AUS|
|103|CREDIT|4000.0| JPN|
|104| DEBIT|5000.0| IND|
|105|CREDIT|6000.0| AUS|
+---+------+------+----+



In [13]:
from pyspark.sql.functions import lower, upper

# Project `amt` alongside a lower-cased copy of `type`.
lowered_type = lower(df["type"])
df.select(df["amt"], lowered_type).show()

+------+-----------+
|   amt|lower(type)|
+------+-----------+
|1000.0|      debit|
|2000.0|     credit|
|3000.0|      debit|
|4000.0|     credit|
|5000.0|      debit|
|6000.0|     credit|
+------+-----------+



In [14]:
# Same projection with the column order swapped; `amt` is referenced by its
# name as a plain string here rather than as a Column object.
projection = [lower(df["type"]), "amt"]
df.select(*projection).show()

+-----------+------+
|lower(type)|   amt|
+-----------+------+
|      debit|1000.0|
|     credit|2000.0|
|      debit|3000.0|
|     credit|4000.0|
|      debit|5000.0|
|     credit|6000.0|
+-----------+------+



In [None]:
from pyspark.sql.functions import current_timestamp

To explicitly add a single column to an existing DataFrame, use `withColumn`, which returns a new DataFrame containing the extra column:
- dataframe.withColumn(column_name, column_expression)

In [None]:
# Sample records laid out as (id, type, amt, code), one tuple per line.
data = [
    (100, "DEBIT", 1000.0, "IND"),
    (101, "CREDIT", 2000.0, "IND"),
    (102, "DEBIT", 3000.0, "AUS"),
    (103, "CREDIT", 4000.0, "JPN"),
    (104, "DEBIT", 5000.0, "IND"),
    (105, "CREDIT", 6000.0, "AUS"),
]

# Schema as a DDL string: column names together with their types.
columns = "id int, type string, amt float, code string"

df = spark.createDataFrame(data, schema=columns)

# withColumn returns a new DataFrame; `df` itself keeps its four columns.
df_with_ts = df.withColumn("curr_timestamp", current_timestamp())

# truncate=False prints the full timestamp instead of a shortened cell.
df_with_ts.show(truncate=False)

In [None]:
# Group once by `type` and reuse the grouping for each aggregate of `amt`.
# When no column name is passed to an aggregate, it is applied to every
# numeric column of the DataFrame.
by_type = df.groupBy("type")

by_type.sum("amt").show()
by_type.min("amt").show()
by_type.max("amt").show()
by_type.avg("amt").show()

In [None]:
# Collect every row of `df` through its underlying RDD and display the
# resulting Python list as the cell output.
all_rows = df.rdd.collect()
all_rows

In [None]:
# Keep rows whose `type` matches the SQL LIKE pattern %CREDIT% and whose
# `amt` is greater than 1000.
type_is_credit = df["type"].like("%CREDIT%")
amt_over_1000 = df["amt"] > 1000.0
df.filter(type_is_credit & amt_over_1000).show()

In [None]:
# `lit` is imported here so that this cell also runs on a fresh kernel,
# without relying on a later cell having been executed first.
from pyspark.sql.functions import lit

# Keep the rows whose curr_timestamp is earlier than the given cut-off.
# In PySpark a column is selected with df["name"] (a DataFrame is not
# callable, unlike in the Scala API) and compared with the Python `<`
# operator (Column has no .lt() method in the Python API).
cutoff = lit("2023-02-28 16:55:22.725605").cast("timestamp")
df_with_ts.filter(df_with_ts["curr_timestamp"] < cutoff).show(truncate=False)

In [None]:
# Print the column names and data types of df_with_ts.
df_with_ts.printSchema()

In [None]:
from pyspark.sql.functions import concat, lit

# Build `desc` by joining `type`, a single space and `amt`; the space has to
# be wrapped in lit() because concat() treats plain strings as column names.
single_space = lit(" ")
desc_column = concat(df["type"], single_space, df["amt"])
df.withColumn("desc", desc_column).show()

In [None]:
from pyspark.sql.functions import concat_ws

# Same `desc` column built with concat_ws, whose first argument is the
# separator as a plain string, so no lit() wrapper is needed.
desc_column = concat_ws(" ", df["type"], df["amt"])
df.withColumn("desc", desc_column).show()