In [5]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrame-Operations").getOrCreate()

In [6]:
# Load the synthetic data into a DataFrame
data_file_path = "C:/Users/rammo/OneDrive/Desktop/stocks_data.csv"
df = spark.read.csv(data_file_path, header=True, inferSchema=True)

In [7]:
# Display schema of DataFrame
df.printSchema()

# Show the initial DataFrame
print("Initial DataFrame:")
df.show(5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

Initial DataFrame:
+---+-------+-----------+--------+-------+
| id|   name|   category|quantity|  price|
+---+-------+-----------+--------+-------+
|  1|  phone|Electronics|      10| 899.99|
|  2|macbook|Electronics|       5|1299.99|
|  3|   ipad|Electronics|      15| 499.99|
|  4|samsung|Electronics|       8| 799.99|
|  5|     lg|Electronics|      10| 699.99|
+---+-------+-----------+--------+-------+



Select: Choose specific columns.

In [8]:
# Select specific columns
selected_columns = df.select("id", "name", "price")
print("Selected Columns:")
selected_columns.show(5)

Selected Columns:
+---+-------+-------+
| id|   name|  price|
+---+-------+-------+
|  1|  phone| 899.99|
|  2|macbook|1299.99|
|  3|   ipad| 499.99|
|  4|samsung| 799.99|
|  5|     lg| 699.99|
+---+-------+-------+



Filter: Apply conditions to filter rows.

In [9]:
# Filter rows based on a condition
filtered_data= df.filter(df.quantity > 10)
print("Filtered Data:", filtered_data.count())
filtered_data.show()

Filtered Data: 1
+---+----+-----------+--------+------+
| id|name|   category|quantity| price|
+---+----+-----------+--------+------+
|  3|ipad|Electronics|      15|499.99|
+---+----+-----------+--------+------+



Join: Combine multiple DataFrames based on specified columns.

In [10]:
# Join with another DataFrame
df2 = df.select("id", "category").limit(5)
joined_data = df.join(df2, "id", "inner")
print("Joined Data:")
joined_data.show()

Joined Data:
+---+-------+-----------+--------+-------+-----------+
| id|   name|   category|quantity|  price|   category|
+---+-------+-----------+--------+-------+-----------+
|  1|  phone|Electronics|      10| 899.99|Electronics|
|  2|macbook|Electronics|       5|1299.99|Electronics|
|  3|   ipad|Electronics|      15| 499.99|Electronics|
|  4|samsung|Electronics|       8| 799.99|Electronics|
|  5|     lg|Electronics|      10| 699.99|Electronics|
+---+-------+-----------+--------+-------+-----------+



Sort: Arrange rows based on one or more columns.

In [11]:
df.show()

+---+-------+-----------+--------+-------+
| id|   name|   category|quantity|  price|
+---+-------+-----------+--------+-------+
|  1|  phone|Electronics|      10| 899.99|
|  2|macbook|Electronics|       5|1299.99|
|  3|   ipad|Electronics|      15| 499.99|
|  4|samsung|Electronics|       8| 799.99|
|  5|     lg|Electronics|      10| 699.99|
+---+-------+-----------+--------+-------+



In [12]:
# Sort by a column
sorted_data = df.orderBy("price")
print("Sorted Data:")
sorted_data.show(15)

Sorted Data:
+---+-------+-----------+--------+-------+
| id|   name|   category|quantity|  price|
+---+-------+-----------+--------+-------+
|  3|   ipad|Electronics|      15| 499.99|
|  5|     lg|Electronics|      10| 699.99|
|  4|samsung|Electronics|       8| 799.99|
|  1|  phone|Electronics|      10| 899.99|
|  2|macbook|Electronics|       5|1299.99|
+---+-------+-----------+--------+-------+



In [13]:
# Sort by a column desc
from pyspark.sql.functions import col, desc
sorted_data = df.orderBy(col("price").desc(), col("id").desc())
print("Sorted Data Descending:")
sorted_data.show(10)

Sorted Data Descending:


+---+-------+-----------+--------+-------+
| id|   name|   category|quantity|  price|
+---+-------+-----------+--------+-------+
|  2|macbook|Electronics|       5|1299.99|
|  1|  phone|Electronics|      10| 899.99|
|  4|samsung|Electronics|       8| 799.99|
|  5|     lg|Electronics|      10| 699.99|
|  3|   ipad|Electronics|      15| 499.99|
+---+-------+-----------+--------+-------+



Distinct: Get unique rows.

In [14]:
# Get distinct product category
distinct_rows = df.select("category").distinct()
print("Distinct Product Categories:")
distinct_rows.show()

Distinct Product Categories:
+-----------+
|   category|
+-----------+
|Electronics|
+-----------+



Drop: Remove specified columns.

In [15]:
# Drop columns
dropped_columns = df.drop("quantity", "category")
print("Dropped Columns:")
dropped_columns.show(10)

Dropped Columns:
+---+-------+-------+
| id|   name|  price|
+---+-------+-------+
|  1|  phone| 899.99|
|  2|macbook|1299.99|
|  3|   ipad| 499.99|
|  4|samsung| 799.99|
|  5|     lg| 699.99|
+---+-------+-------+



WithColumn: Add new calculated columns.

In [16]:
# Add a new calculated column
df_with_new_column = df.withColumn("revenue", df.quantity * df.price)
print("DataFrame with New Column:")
df_with_new_column.show(10)

DataFrame with New Column:
+---+-------+-----------+--------+-------+-------+
| id|   name|   category|quantity|  price|revenue|
+---+-------+-----------+--------+-------+-------+
|  1|  phone|Electronics|      10| 899.99| 8999.9|
|  2|macbook|Electronics|       5|1299.99|6499.95|
|  3|   ipad|Electronics|      15| 499.99|7499.85|
|  4|samsung|Electronics|       8| 799.99|6399.92|
|  5|     lg|Electronics|      10| 699.99| 6999.9|
+---+-------+-----------+--------+-------+-------+



Alias: Rename columns for better readability.

In [17]:
# Rename columns using alias
df_with_alias = df.withColumnRenamed("price", "product_price")
print("DataFrame with Aliased Column:")
df_with_alias.show(10)

DataFrame with Aliased Column:
+---+-------+-----------+--------+-------------+
| id|   name|   category|quantity|product_price|
+---+-------+-----------+--------+-------------+
|  1|  phone|Electronics|      10|       899.99|
|  2|macbook|Electronics|       5|      1299.99|
|  3|   ipad|Electronics|      15|       499.99|
|  4|samsung|Electronics|       8|       799.99|
|  5|     lg|Electronics|      10|       699.99|
+---+-------+-----------+--------+-------------+



In [18]:
# Stop the SparkSession
spark.stop()