In [1]:
from pyspark.sql import SparkSession

# Spark session
spark = SparkSession.builder.appName("PySparkIntro").getOrCreate()

print("Apache Spark version:", spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/01 16:46:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Apache Spark version: 3.5.4


> `inferSchema=True` allows Spark to guess the data type for each column automatically.

## Reading and Inspecting Data

In [2]:
# Read the CSV file into a DataFrame
df = spark.read.csv("data/sample.csv", header=True, inferSchema=True)

# Display the first few rows of the DataFrame
df.show()

+---+-------+---+------+-----------+
| id|   name|age|salary| department|
+---+-------+---+------+-----------+
|  1|  Alice| 30| 70000|         HR|
|  2|    Bob| 35| 80000|Engineering|
|  3|Charlie| 25| 50000|  Marketing|
|  4|  David| 40| 90000|Engineering|
|  5|    Eva| 28| 60000|         HR|
|  6|  Frank| 32| 75000|  Marketing|
|  7|   Gina| 27| 55000|Engineering|
|  8|  Harry| 31| 70000|         HR|
|  9|    Ivy| 29| 60000|  Marketing|
| 10|   Jack| 33| 80000|Engineering|
| 11|   Kate| 26| 50000|         HR|
| 12|   Lily| 34| 75000|  Marketing|
| 13|   Mike| 28| 60000|Engineering|
| 14|  Nancy| 30| 70000|         HR|
| 15|  Oscar| 32| 80000|  Marketing|
+---+-------+---+------+-----------+



In [3]:
# Print the DataFrame schema to see the inferred column types
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



## Transformations and Aggregations

In [4]:
# Select the 'name' and 'department' columns from the DataFrame
df.select("name", "department").show()

+-------+-----------+
|   name| department|
+-------+-----------+
|  Alice|         HR|
|    Bob|Engineering|
|Charlie|  Marketing|
|  David|Engineering|
|    Eva|         HR|
|  Frank|  Marketing|
|   Gina|Engineering|
|  Harry|         HR|
|    Ivy|  Marketing|
|   Jack|Engineering|
|   Kate|         HR|
|   Lily|  Marketing|
|   Mike|Engineering|
|  Nancy|         HR|
|  Oscar|  Marketing|
+-------+-----------+



In [5]:
# Filter the DataFrame to include only employees with a salary greater than 65000
df.filter(df["salary"] > 65000).show()

+---+-----+---+------+-----------+
| id| name|age|salary| department|
+---+-----+---+------+-----------+
|  1|Alice| 30| 70000|         HR|
|  2|  Bob| 35| 80000|Engineering|
|  4|David| 40| 90000|Engineering|
|  6|Frank| 32| 75000|  Marketing|
|  8|Harry| 31| 70000|         HR|
| 10| Jack| 33| 80000|Engineering|
| 12| Lily| 34| 75000|  Marketing|
| 14|Nancy| 30| 70000|         HR|
| 15|Oscar| 32| 80000|  Marketing|
+---+-----+---+------+-----------+



In [6]:
# Filter the DataFrame to include only employees with a salary less than 65000
df.filter(df["salary"] < 65000).show()

+---+-------+---+------+-----------+
| id|   name|age|salary| department|
+---+-------+---+------+-----------+
|  3|Charlie| 25| 50000|  Marketing|
|  5|    Eva| 28| 60000|         HR|
|  7|   Gina| 27| 55000|Engineering|
|  9|    Ivy| 29| 60000|  Marketing|
| 11|   Kate| 26| 50000|         HR|
| 13|   Mike| 28| 60000|Engineering|
+---+-------+---+------+-----------+



### Adding a new column

In [7]:
from pyspark.sql.functions import col

# Add a new column 'double_salary' that is twice the 'salary'
df_with_new_column = df.withColumn("double_salary", col("salary") * 2)
df_with_new_column.show()

+---+-------+---+------+-----------+-------------+
| id|   name|age|salary| department|double_salary|
+---+-------+---+------+-----------+-------------+
|  1|  Alice| 30| 70000|         HR|       140000|
|  2|    Bob| 35| 80000|Engineering|       160000|
|  3|Charlie| 25| 50000|  Marketing|       100000|
|  4|  David| 40| 90000|Engineering|       180000|
|  5|    Eva| 28| 60000|         HR|       120000|
|  6|  Frank| 32| 75000|  Marketing|       150000|
|  7|   Gina| 27| 55000|Engineering|       110000|
|  8|  Harry| 31| 70000|         HR|       140000|
|  9|    Ivy| 29| 60000|  Marketing|       120000|
| 10|   Jack| 33| 80000|Engineering|       160000|
| 11|   Kate| 26| 50000|         HR|       100000|
| 12|   Lily| 34| 75000|  Marketing|       150000|
| 13|   Mike| 28| 60000|Engineering|       120000|
| 14|  Nancy| 30| 70000|         HR|       140000|
| 15|  Oscar| 32| 80000|  Marketing|       160000|
+---+-------+---+------+-----------+-------------+



In [8]:
# Group the DataFrame by 'department' and compute the average salary
df.groupBy("department").avg("salary").show()

+-----------+-----------+
| department|avg(salary)|
+-----------+-----------+
|Engineering|    73000.0|
|         HR|    64000.0|
|  Marketing|    68000.0|
+-----------+-----------+



## SQL Queries in Spark

In [9]:
# Create a temporary view from the DataFrame
df.createOrReplaceTempView("employees")

# Run an SQL query to count the number of employees in each department
sql_query = """
SELECT department, COUNT(*) as employee_count
FROM employees
GROUP BY department
"""
result = spark.sql(sql_query)
result.show()

+-----------+--------------+
| department|employee_count|
+-----------+--------------+
|Engineering|             5|
|         HR|             5|
|  Marketing|             5|
+-----------+--------------+



## Streaming

In [10]:
# Define the schema based on our sample data
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("salary", IntegerType(), True),
    StructField("department", StringType(), True)
])

# Read streaming data from the 'streaming_data' directory
stream_df = spark.readStream.option("header", True).schema(schema).csv("streaming_data/")  #  This dir must be present

# Let's group by department and count employees
aggregated_stream = stream_df.groupBy("department").count()

# Write the streaming output to the console
query = aggregated_stream.writeStream.outputMode("complete").format("console").start()

# Wait for the streaming query to finish (press Ctrl+C to stop)
query.awaitTermination()

25/02/01 16:46:38 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-adffdc3d-830f-4286-af4e-cd24f9768cd5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/02/01 16:46:38 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/02/01 16:46:42 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+-----+
| department|count|
+-----------+-----+
|Engineering|    5|
|         HR|    5|
|  Marketing|    5|
+-----------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+-----+
| department|count|
+-----------+-----+
|Engineering|   10|
|         HR|   10|
|  Marketing|   10|
+-----------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------+-----+
| department|count|
+-----------+-----+
|Engineering|   15|
|         HR|   15|
|  Marketing|   15|
+-----------+-----+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/ms/Projects/Machine-Learning/learn/PySpark/pyspark-intro/env/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ms/Projects/Machine-Learning/learn/PySpark/pyspark-intro/env/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ms/Projects/Machine-Learning/learn/PySpark/pyspark-intro/env/lib/python3.12/socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

## Simple Linear Regression

Let's predict salary using age 

In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Create a new DataFrame with the necessary columns ('age' as the feature, 'salary' as the label)
assembler = VectorAssembler(inputCols=["age"], outputCol="features")
ml_df = assembler.transform(df)

# Display the new DataFrame with the 'features' column
ml_df.select("age", "salary", "features").show()

# Split the data into training and testing sets (80/20 split)
train_df, test_df = ml_df.randomSplit([0.8, 0.2], seed=42)

# Initialize and train the linear regression model
lr = LinearRegression(featuresCol="features", labelCol="salary")
lr_model = lr.fit(train_df)

# Make predictions on the test set
predictions = lr_model.transform(test_df)
predictions.select("age", "salary", "prediction").show()

# Print the coefficients and intercept for the linear regression model
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

+---+------+--------+
|age|salary|features|
+---+------+--------+
| 30| 70000|  [30.0]|
| 35| 80000|  [35.0]|
| 25| 50000|  [25.0]|
| 40| 90000|  [40.0]|
| 28| 60000|  [28.0]|
| 32| 75000|  [32.0]|
| 27| 55000|  [27.0]|
| 31| 70000|  [31.0]|
| 29| 60000|  [29.0]|
| 33| 80000|  [33.0]|
| 26| 50000|  [26.0]|
| 34| 75000|  [34.0]|
| 28| 60000|  [28.0]|
| 30| 70000|  [30.0]|
| 32| 80000|  [32.0]|
+---+------+--------+



25/02/01 16:49:16 WARN Instrumentation: [bd4e989b] regParam is zero, which might cause numerical instability and overfitting.
25/02/01 16:49:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/02/01 16:49:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/02/01 16:49:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


+---+------+------------------+
|age|salary|        prediction|
+---+------+------------------+
| 25| 50000| 53208.23244552073|
| 27| 55000|58740.920096852395|
| 29| 60000| 64273.60774818408|
| 30| 70000| 67039.95157384992|
+---+------+------------------+

Coefficients: [2766.3438256658396]
Intercept: -15950.363196125269
