# PySpark DataFrame Basics 📊

This notebook covers the fundamental DataFrame operations in PySpark.

In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, sum, max, min

# Initialize Spark.
# The master URL is read from the SPARK_MASTER_URL environment variable so the
# notebook is not tied to a single cluster. When the variable is unset or empty,
# the original standalone master address is used.
SPARK_MASTER_URL = os.environ.get('SPARK_MASTER_URL') or 'spark://spark-master:7077'

spark = (
    SparkSession.builder
    .appName('DataFrame Basics')
    .master(SPARK_MASTER_URL)
    .getOrCreate()
)

print('Spark session initialized!')

## Creating DataFrames

In [None]:
# Column names, listed in the same order as the fields of each tuple in `data`
columns = ['id', 'product_name', 'category', 'price', 'quantity']

# Sample product rows: one tuple per product
data = [
    (1, 'iPhone 14', 'Electronics', 999, 150),
    (2, 'Samsung Galaxy', 'Electronics', 899, 200),
    (3, 'MacBook Pro', 'Electronics', 2399, 75),
    (4, 'Dell XPS', 'Electronics', 1299, 100),
    (5, 'iPad Air', 'Electronics', 599, 120),
    (6, 'AirPods Pro', 'Audio', 249, 300),
    (7, 'Sony Headphones', 'Audio', 349, 150),
    (8, 'Kindle', 'Books', 139, 250),
    (9, 'Fire Tablet', 'Electronics', 149, 180),
    (10, 'Echo Dot', 'Audio', 49, 500),
]

# Build the DataFrame from the in-memory rows, then display it
df = spark.createDataFrame(data, schema=columns)
df.show()

## Basic DataFrame Operations

In [None]:
# Print the DataFrame's schema
df.printSchema()

# List the column names
print('Columns: {}'.format(df.columns))

# Report how many rows (products) the DataFrame holds
print('Total products: {}'.format(df.count()))

# Display summary statistics for the DataFrame
df.describe().show()

## Selecting and Filtering

In [None]:
# Project only the product name and price columns
df.select(col('product_name'), col('price')).show()

# Keep only the products priced above 500
expensive_products = df.where(col('price') > 500)
expensive_products.show()

# Combine two conditions with `&`: Electronics products with more than 100 units.
# Each comparison is parenthesised because `&` binds tighter than `==` and `>`.
electronics_high_stock = df.where(
    (df['category'] == 'Electronics') & (df['quantity'] > 100)
)
electronics_high_stock.show()

## Adding and Modifying Columns

In [None]:
# Derive total_value as price multiplied by quantity
df_with_value = df.withColumn('total_value', df['price'] * df['quantity'])
df_with_value.show()

# Rename product_name to name and show the first 5 rows.
# This starts from `df`, so the result does not include total_value.
df_renamed = df.withColumnRenamed(existing='product_name', new='name')
df_renamed.show(n=5)

## GroupBy and Aggregations

In [None]:
# Number of products in each category
category_counts = df.groupBy('category').count()
category_counts.show()

# Several aggregates per category in one agg() call.
# Note: sum, max and min here are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python builtins of the same name.
category_stats = df.groupBy('category').agg(
    count('*').alias('product_count'),
    avg(col('price')).alias('avg_price'),
    sum(col('quantity')).alias('total_quantity'),
    max(col('price')).alias('max_price'),
    min(col('price')).alias('min_price'),
)
category_stats.show()

## Sorting

In [None]:
# Most expensive products first
df.orderBy('price', ascending=False).show()

# Order by category first, then by price within each category
df.orderBy('category', 'price').show()

## Distinct and Drop Duplicates

In [None]:
# Unique values of the category column
df.select(df['category']).distinct().show()

# Keep one row per category, retaining all columns.
# NOTE(review): which product row survives for each category looks unspecified —
# confirm before relying on a particular row being kept.
df.dropDuplicates(['category']).show()

## Saving DataFrames

In [None]:
# Optional: persist the DataFrame to disk. The writes are left commented out;
# uncomment a line to run it. Both use 'overwrite' mode.

# CSV with a header row
# df.write.csv('/home/jovyan/data/products.csv', mode='overwrite', header=True)

# Parquet
# df.write.parquet('/home/jovyan/data/products.parquet', mode='overwrite')

print('DataFrame operations completed!')