In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F

# Initialize Spark Session
spark = SparkSession.builder.appName("ExpensesDataFrame").getOrCreate()

# Define Schema
schema = StructType([
    StructField("Empld", IntegerType(), True),
    StructField("Date_str", StringType(), True),  # Keeping Date as String for now
    StructField("Expenses", IntegerType(), True)
])

# Define Data
data = [
    (1, "21-01-2022", 50),
    (1, "22-02-2022", 100),
    (1, "04-03-2023", 120),
    (1, "04-05-2023", 300),
    (2, "03-03-2022", 100),
    (2, "04-04-2022", 300)
]

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)
df = df.withColumn('Date_exp', F.to_date('Date_str', 'dd-MM-yyyy'))
df.createOrReplaceTempView('df')
# Show DataFrame
df.show()


+-----+----------+--------+----------+
|Empld|  Date_str|Expenses|  Date_exp|
+-----+----------+--------+----------+
|    1|21-01-2022|      50|2022-01-21|
|    1|22-02-2022|     100|2022-02-22|
|    1|04-03-2023|     120|2023-03-04|
|    1|04-05-2023|     300|2023-05-04|
|    2|03-03-2022|     100|2022-03-03|
|    2|04-04-2022|     300|2022-04-04|
+-----+----------+--------+----------+



In [0]:
%sql

select *,
sum(Expenses) over(partition by Empld order by Date_exp) as running_total
from df


Empld,Date_str,Expenses,Date_exp,running_total
1,21-01-2022,50,2022-01-21,50
1,22-02-2022,100,2022-02-22,150
1,04-03-2023,120,2023-03-04,270
1,04-05-2023,300,2023-05-04,570
2,03-03-2022,100,2022-03-03,100
2,04-04-2022,300,2022-04-04,400


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from datetime import datetime

# Initialize Spark Session
spark = SparkSession.builder.appName("UserActivity").getOrCreate()

# Define Schema
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("log_id", IntegerType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("action", StringType(), True)
])

# Define Sample Data
data = [
    (1, 101, datetime(2024, 2, 3, 10, 0, 0), "login"),
    (2, 102, datetime(2024, 2, 3, 10, 1, 30), "click"),
    (3, 103, datetime(2024, 2, 3, 10, 3, 0), "purchase"),
    (1, 104, datetime(2024, 2, 3, 10, 5, 30), "logout"),
    (2, 105, datetime(2024, 2, 3, 10, 6, 0), "login"),
    (3, 106, datetime(2024, 2, 3, 10, 7, 0), "click")
]

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)
df.createOrReplaceTempView('df')

# Show DataFrame
df.show(truncate=False)


+-------+------+-------------------+--------+
|user_id|log_id|timestamp          |action  |
+-------+------+-------------------+--------+
|1      |101   |2024-02-03 10:00:00|login   |
|2      |102   |2024-02-03 10:01:30|click   |
|3      |103   |2024-02-03 10:03:00|purchase|
|1      |104   |2024-02-03 10:05:30|logout  |
|2      |105   |2024-02-03 10:06:00|login   |
|3      |106   |2024-02-03 10:07:00|click   |
+-------+------+-------------------+--------+



In [0]:
%sql
WITH LastActivity AS (
    SELECT user_id, MAX(timestamp) AS last_activity
    FROM df
    GROUP BY user_id
)
SELECT user_id
FROM LastActivity
WHERE last_activity < (SELECT MAX(timestamp) FROM df) - INTERVAL '3' MINUTE;

user_id


In [0]:
json_str = [{
  "id": 1,
  "name": "John Doe",
  "address": {
    "street": "123 Main St",
    "city": "New York",
    "state": {
      "code": "NY",
      "full": "New York"
    }
  },
  "contacts": [
    {
      "type": "email",
      "value": "john@example.com"
    },
    {
      "type": "phone",
      "value": "1234567890"
    }
  ]
}]

from pyspark.sql.types import *
from pyspark.sql.functions import *

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('address', StructType([
        StructField('street', StringType(), True),
        StructField('city', StringType(), True),
        StructField('state', StructType([
            StructField('code', StringType(), True),
            StructField('full', StringType(), True)
        ]), True)
    ]), True),
    StructField('contacts', ArrayType(StructType([
        StructField('type', StringType(), True),
        StructField('value', StringType(), True)
    ])), True)
])


df = spark.createDataFrame(json_str, schema=schema)
display(df)


id,name,address,contacts
1,John Doe,"List(123 Main St, New York, List(NY, New York))","List(List(email, john@example.com), List(phone, 1234567890))"


In [0]:
df = spark.read.json('dbfs:/FileStore/nested_json.json', schema = schema)

In [0]:
display(df)

id,name,address,contacts
,,,
,,,
,,,
,,,
,,,
,,,
,,,
,,,
,,,
,,,


In [0]:
usr_data = [('Jaya', '20', ['SQL', 'Data Science', '']), 
 ('Milan', '21', ['ML', 'AI']), 
 ('Rohit', '19', None), 
 ('Maria', '20', ['DBMS', 'Networking']), 
 ('Jay', '22', None)] 

usr_columns = ['Name', 'Age', 'Courses_enrolled'] 


df = spark.createDataFrame(usr_data, usr_columns)
df.show()

+-----+---+--------------------+
| Name|Age|    Courses_enrolled|
+-----+---+--------------------+
| Jaya| 20|[SQL, Data Scienc...|
|Milan| 21|            [ML, AI]|
|Rohit| 19|                null|
|Maria| 20|  [DBMS, Networking]|
|  Jay| 22|                null|
+-----+---+--------------------+



In [0]:
df = df.withColumn('exploded_data', explode('Courses_enrolled'))
df.show()

+-----+---+--------------------+-------------+
| Name|Age|    Courses_enrolled|exploded_data|
+-----+---+--------------------+-------------+
| Jaya| 20|[SQL, Data Scienc...|          SQL|
| Jaya| 20|[SQL, Data Scienc...| Data Science|
| Jaya| 20|[SQL, Data Scienc...|             |
|Milan| 21|            [ML, AI]|           ML|
|Milan| 21|            [ML, AI]|           AI|
|Maria| 20|  [DBMS, Networking]|         DBMS|
|Maria| 20|  [DBMS, Networking]|   Networking|
+-----+---+--------------------+-------------+



In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:


json_str = [{
  "id": 1,
  "name": "John Doe",
  "address": {
    "street": "123 Main St",
    "city": "New York",
    "state": {
      "code": "NY",
      "full": "New York"
    }
  },
  "contacts": [
    {
      "type": "email",
      "value": "john@example.com"
    },
    {
      "type": "phone",
      "value": "1234567890"
    }
  ]
}]


schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StructType([
            StructField("code", StringType(), True),
            StructField("full", StringType(), True)
        ]))
    ])),
    StructField("contacts", ArrayType(StructType([
        StructField("type", StringType(), True),
        StructField("value", StringType(), True)
    ]), True))
])

df = spark.createDataFrame(json_str, schema=schema)

In [0]:
display(df)

df_final = df.withColumn('street', col('address').street).withColumn('contact', explode(col('contacts')))

df_final.show()


id,name,address,contacts
1,John Doe,"List(123 Main St, New York, List(NY, New York))","List(List(email, john@example.com), List(phone, 1234567890))"


+---+--------+--------------------+--------------------+-----------+--------------------+
| id|    name|             address|            contacts|     street|             contact|
+---+--------+--------------------+--------------------+-----------+--------------------+
|  1|John Doe|{123 Main St, New...|[{email, john@exa...|123 Main St|{email, john@exam...|
|  1|John Doe|{123 Main St, New...|[{email, john@exa...|123 Main St| {phone, 1234567890}|
+---+--------+--------------------+--------------------+-----------+--------------------+



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
from datetime import date

dim_schema = StructType([
    StructField('customer_id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('city', StringType(), True),
    StructField('start_date', DateType(), True),
    StructField('end_date', DateType(), True),
    StructField('is_current', StringType(), True)
])

stg_schema = StructType([
    StructField('customer_id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('city', StringType(), True)
])

# Sample existing dimension table (customer_dim)
customer_dim = spark.createDataFrame([
    (1, "Alice", "New York", date(2023, 1, 1), None, "Y"),
    (2, "Bob", "Los Angeles", date(2023, 2, 1), None, "Y"),
], schema=dim_schema)

# Sample incoming data (staging table)
staging_data = spark.createDataFrame([
    (1, "Alice", "San Francisco"),  # Change in city (New York → San Francisco)
    (2, "Bob", "Los Angeles"),      # No change
    (3, "Charlie", "Chicago")       # New customer
], schema=stg_schema)

In [0]:
df_combained = customer_dim.alias("dim").join(staging_data.alias("stg"), 'customer_id', 'outer')\
    .select(col('stg.customer_id'),
            col('stg.name'),
            col('stg.city'),
            col('dim.city').alias('existing_city'),
            col('dim.start_date'),
            col('dim.end_date'),
            col('dim.is_current')
            )

display(df_combained)

customer_id,name,city,existing_city,start_date,end_date,is_current
1,Alice,San Francisco,New York,2023-01-01,,Y
2,Bob,Los Angeles,Los Angeles,2023-02-01,,Y
3,Charlie,Chicago,,,,


In [0]:
scd_update_df = df_combained.withColumn('change_flag', when(col('existing_city').isNull(), 'New').when(col('city') != col('existing_city'), 'update').otherwise('no_change'))

display(scd_update_df)

customer_id,name,city,existing_city,start_date,end_date,is_current,change_flag
1,Alice,San Francisco,New York,2023-01-01,,Y,update
2,Bob,Los Angeles,Los Angeles,2023-02-01,,Y,no_change
3,Charlie,Chicago,,,,,New


In [0]:
new_records_df = scd_update_df.where(col('change_flag') == 'New').select(
    col('customer_id'),
    col('name'),
    col('city'),
    current_date().alias('start_date'),
    lit(None).alias('end_date'),
    lit('Y').alias('is_current')
)

display(new_records_df)

customer_id,name,city,start_date,end_date,is_current
3,Charlie,Chicago,2025-05-07,,Y


In [0]:
update_inactive_df = scd_update_df.where(col('change_flag') == 'update').select(
    col('customer_id'),
    col('name'),
    col('city'),
    col('start_date'),
    current_date().alias('end_date'),
    lit('N').alias('is_current')
)

update_active_df = scd_update_df.where(col('change_flag') == 'update').select(
    col('customer_id'),
    col('name'),
    col('city'),
    current_date().alias('start_date'),
    lit(None).alias('end_date'),
    lit('Y').alias('is_current')
)

In [0]:
df_final = scd_update_df.where(col('change_flag') == 'no_change').select(
    col('customer_id'),
    col('name'),
    col('city'),
    col('start_date'),
    col('end_date'),
    col('is_current')
).union(new_records_df).union(update_inactive_df).union(update_active_df)

display(df_final)

customer_id,name,city,start_date,end_date,is_current
2,Bob,Los Angeles,2023-02-01,,Y
3,Charlie,Chicago,2025-05-07,,Y
1,Alice,San Francisco,2023-01-01,2025-05-07,N
1,Alice,San Francisco,2025-05-07,,Y


In [0]:
def decorator(func):
    def wrapper():
        print("before")
        func()
        print("after")
    return wrapper


@decorator
def greet():
    print("hello")

greet()

before
hello
after


In [0]:
greet()

before
hello
after


In [0]:

@decorator
def greet():
    print('hello')

greet()

before
hello
after


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
customer_dim_data = [

(1,'manish','arwal','india','N','2022-09-15','2022-09-25'),
(2,'vikash','patna','india','Y','2023-08-12',None),
(3,'nikita','delhi','india','Y','2023-09-10',None),
(4,'rakesh','jaipur','india','Y','2023-06-10',None),
(5,'ayush','NY','USA','Y','2023-06-10',None),
(1,'manish','gurgaon','india','Y','2022-09-25',None),
]

customer_schema= ['id','name','city','country','active','effective_start_date','effective_end_date']

customer_dim_df = spark.createDataFrame(data= customer_dim_data,schema=customer_schema)

sales_data = [

(1,1,'manish','2023-01-16','gurgaon','india',380),
(77,1,'manish','2023-03-11','bangalore','india',300),
(12,3,'nikita','2023-09-20','delhi','india',127),
(54,4,'rakesh','2023-08-10','jaipur','india',321),
(65,5,'ayush','2023-09-07','mosco','russia',765),
(89,6,'rajat','2023-08-10','jaipur','india',321)
]

sales_schema = ['sales_id', 'customer_id','customer_name', 'sales_date', 'food_delivery_address','food_delivery_country', 'food_cost']

sales_df = spark.createDataFrame(data=sales_data,schema=sales_schema)

In [0]:
joined_df = customer_dim_df.join(sales_df, customer_dim_df["id"] == sales_df["customer_id"], 'left')
display(joined_df)

id,name,city,country,active,effective_start_date,effective_end_date,sales_id,customer_id,customer_name,sales_date,food_delivery_address,food_delivery_country,food_cost
1,manish,arwal,india,N,2022-09-15,2022-09-25,77.0,1.0,manish,2023-03-11,bangalore,india,300.0
1,manish,arwal,india,N,2022-09-15,2022-09-25,1.0,1.0,manish,2023-01-16,gurgaon,india,380.0
2,vikash,patna,india,Y,2023-08-12,,,,,,,,
3,nikita,delhi,india,Y,2023-09-10,,12.0,3.0,nikita,2023-09-20,delhi,india,127.0
4,rakesh,jaipur,india,Y,2023-06-10,,54.0,4.0,rakesh,2023-08-10,jaipur,india,321.0
5,ayush,NY,USA,Y,2023-06-10,,65.0,5.0,ayush,2023-09-07,mosco,russia,765.0
1,manish,gurgaon,india,Y,2022-09-25,,77.0,1.0,manish,2023-03-11,bangalore,india,300.0
1,manish,gurgaon,india,Y,2022-09-25,,1.0,1.0,manish,2023-01-16,gurgaon,india,380.0
