In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active Spark session if one exists, otherwise create one named
# "ArchDemo" that runs locally on all available CPU cores.
spark = SparkSession.builder.appName("ArchDemo").master("local[*]").getOrCreate()

# Small in-memory sample of (name, sales) pairs.
data = [
    ("Baburao", 100),
    ("Raju", 200),
    ("Shyam", 300),
]
df = spark.createDataFrame(data, schema=["Name", "Sales"])

# Derive a Bonus column equal to 10% of Sales and print the result.
update_df = df.withColumn("Bonus", df["Sales"] * 0.10)
update_df.show()

# Shut the session down once the demo output has been shown.
spark.stop()


+-------+-----+-----+
|   Name|Sales|Bonus|
+-------+-----+-----+
|Baburao|  100| 10.0|
|   Raju|  200| 20.0|
|  Shyam|  300| 30.0|
+-------+-----+-----+



In [3]:
# Get (or create) the local "ArchDemo" Spark session.
spark = (
    SparkSession.builder
    .appName("ArchDemo")
    .master("local[*]")
    .getOrCreate()
)

# Distribute a small Python list as an RDD and pull it back to the driver.
rdd = spark.sparkContext.parallelize([1, 2, 3, 4])
# Keep the collected list: a bare `rdd.collect()` in the middle of a cell is
# evaluated but never displayed, because only the last expression is echoed.
collected = rdd.collect()

spark.stop()

# Last expression of the cell, so the notebook displays the collected values.
collected

In [4]:
# Data cleaning using PySpark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower, to_date, mean,regexp_replace

# Get or create the "Data Cleaning" session, running locally on all cores.
spark = SparkSession.builder.master("local[*]").appName("Data Cleaning").getOrCreate()

In [5]:
# Sample CSV text held in memory: one header line followed by ten records,
# joined with "\n" and with no trailing newline.
# Note the leading space before "Alice" in the first record.
synthetic_data = "\n".join(
    (
        "id,name,age,address,join_date",
        "1, Alice,23,123 Main St.,2023-01-01",
        "2,Bob,30,456 Oak Ave.,2023-02-15",
        "3,Charlie,27,789 Pine Rd.,2023-03-10",
        "4,Diana,35,321 Elm St.,2023-04-05",
        "5,Ethan,29,654 Maple Blvd.,2023-05-20",
        "6,Fiona,24,987 Birch Ln.,2023-06-12",
        "7,George,32,741 Cedar Ct.,2023-07-08",
        "8,Hannah,28,159 Spruce Dr.,2023-08-22",
        "9,Ivan,26,852 Walnut Way,2023-09-14",
        "10,Julia,31,963 Cherry St.,2023-10-30",
    )
)

In [6]:
# Echo the raw CSV text; as the cell's last expression it is shown as its repr.
synthetic_data

'id,name,age,address,join_date\n1, Alice,23,123 Main St.,2023-01-01\n2,Bob,30,456 Oak Ave.,2023-02-15\n3,Charlie,27,789 Pine Rd.,2023-03-10\n4,Diana,35,321 Elm St.,2023-04-05\n5,Ethan,29,654 Maple Blvd.,2023-05-20\n6,Fiona,24,987 Birch Ln.,2023-06-12\n7,George,32,741 Cedar Ct.,2023-07-08\n8,Hannah,28,159 Spruce Dr.,2023-08-22\n9,Ivan,26,852 Walnut Way,2023-09-14\n10,Julia,31,963 Cherry St.,2023-10-30'

In [7]:
# Write the sample CSV text to a file named "synthetic_data" in the working
# directory. The encoding is pinned so the bytes written do not depend on the
# platform's locale default.
with open("synthetic_data", "w", encoding="utf-8") as f:
    f.write(synthetic_data)

In [9]:
# Load the "synthetic_data" CSV file: the first line is treated as the header
# and column types are inferred from the data. Then print the frame.
df = spark.read.options(header=True, inferSchema=True).csv("synthetic_data")
df.show()

+---+-------+---+---------------+----------+
| id|   name|age|        address| join_date|
+---+-------+---+---------------+----------+
|  1|  Alice| 23|   123 Main St.|2023-01-01|
|  2|    Bob| 30|   456 Oak Ave.|2023-02-15|
|  3|Charlie| 27|   789 Pine Rd.|2023-03-10|
|  4|  Diana| 35|    321 Elm St.|2023-04-05|
|  5|  Ethan| 29|654 Maple Blvd.|2023-05-20|
|  6|  Fiona| 24|  987 Birch Ln.|2023-06-12|
|  7| George| 32|  741 Cedar Ct.|2023-07-08|
|  8| Hannah| 28| 159 Spruce Dr.|2023-08-22|
|  9|   Ivan| 26| 852 Walnut Way|2023-09-14|
| 10|  Julia| 31| 963 Cherry St.|2023-10-30|
+---+-------+---+---------------+----------+



In [10]:
# Normalise the name column: strip surrounding whitespace, then lowercase.
name_normalised = lower(trim(col("name")))
df = df.withColumn("name", name_normalised)
df.show()

+---+-------+---+---------------+----------+
| id|   name|age|        address| join_date|
+---+-------+---+---------------+----------+
|  1|  alice| 23|   123 Main St.|2023-01-01|
|  2|    bob| 30|   456 Oak Ave.|2023-02-15|
|  3|charlie| 27|   789 Pine Rd.|2023-03-10|
|  4|  diana| 35|    321 Elm St.|2023-04-05|
|  5|  ethan| 29|654 Maple Blvd.|2023-05-20|
|  6|  fiona| 24|  987 Birch Ln.|2023-06-12|
|  7| george| 32|  741 Cedar Ct.|2023-07-08|
|  8| hannah| 28| 159 Spruce Dr.|2023-08-22|
|  9|   ivan| 26| 852 Walnut Way|2023-09-14|
| 10|  julia| 31| 963 Cherry St.|2023-10-30|
+---+-------+---+---------------+----------+

