In [1]:
pip install pyspark



The following command must be run outside of the IPython shell:

    $ pip install pyspark

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("Square Integers Example")
    .getOrCreate()
)

# Five single-field rows holding the integers 1 through 5.
data = [(value,) for value in range(1, 6)]

# One-column DataFrame; the column is called "number".
df = spark.createDataFrame(data, schema=["number"])

# Keep every existing column and append "square" = number raised to the power 2.
df_squared = df.select("*", (col("number") ** 2).alias("square"))

# Print the resulting table.
df_squared.show()

# Shut the session down once the example is finished.
spark.stop()




+------+------+
|number|square|
+------+------+
|     1|   1.0|
|     2|   4.0|
|     3|   9.0|
|     4|  16.0|
|     5|  25.0|
+------+------+



In [10]:
from pyspark.sql import SparkSession
# Import the functions module under an alias instead of `from ... import max`:
# importing `max` by name would rebind Python's builtin max() for every cell
# that runs afterwards in the same kernel.
from pyspark.sql import functions as F

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Find Maximum Example") \
    .getOrCreate()

# Sample data: a list of integers
data = [(1,), (3,), (2,), (8,), (5,)]

# Create a DataFrame from the sample data
df = spark.createDataFrame(data, ["number"])

# Aggregate the "number" column with Spark's max; the result is a single row
# with a single field, which is pulled out with [0][0].
max_value = df.select(F.max("number")).collect()[0][0]

# Print the maximum value
print(f"The maximum value is: {max_value}")

# Stop the SparkSession
spark.stop()




The maximum value is: 8


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("Find Average Example")
    .getOrCreate()
)

# Single-field rows holding the sample integers.
data = [(value,) for value in (1, 3, 2, 8, 5)]

# One-column DataFrame; the column is called "number".
df = spark.createDataFrame(data, schema=["number"])

# Global aggregation: the result has exactly one row with one field,
# the mean of "number".
average_value = df.agg(avg("number")).first()[0]

# Report the computed mean.
print(f"The average value is: {average_value}")

# Shut the session down once the example is finished.
spark.stop()




The average value is: 3.8


In [6]:
import os

from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Read CSV Example") \
    .getOrCreate()

# Location of the CSV input. Set the CSV_FILE_PATH environment variable to run
# this cell on another machine; the original location is kept as the default.
# NOTE(review): the default has no .csv extension - presumably a directory of
# CSV files rather than a single file; confirm.
csv_file_path = os.environ.get(
    "CSV_FILE_PATH", "/home/lplab/Documents/220962085_BDA/Week1"
)

try:
    # Read the CSV input: the first line is the header, and column types are
    # inferred from the data.
    df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

    # Show the DataFrame
    df.show()

    # Print the schema of the DataFrame
    df.printSchema()
finally:
    # Stop the session even when the read fails (e.g. the path does not exist),
    # so a failed run does not leave it open.
    spark.stop()


+----+---------------------------+--------------------+--------------------+------------------+-------------+--------------------+--------------------+-------+----------------------+
|Year|Industry_aggregation_NZSIOC|Industry_code_NZSIOC|Industry_name_NZSIOC|             Units|Variable_code|       Variable_name|   Variable_category|  Value|Industry_code_ANZSIC06|
+----+---------------------------+--------------------+--------------------+------------------+-------------+--------------------+--------------------+-------+----------------------+
|2023|                    Level 1|               99999|      All industries|Dollars (millions)|          H01|        Total income|Financial perform...| 930995|  ANZSIC06 division...|
|2023|                    Level 1|               99999|      All industries|Dollars (millions)|          H04|Sales, government...|Financial perform...| 821630|  ANZSIC06 division...|
|2023|                    Level 1|               99999|      All industries|Dollars (

In [7]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("Display DataFrame Example")
    .getOrCreate()
)

# Column names, followed by one tuple per row in the same field order.
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 29),
    (2, "Bob", 31),
    (3, "Cathy", 25),
]

# Build the DataFrame from the rows and column names above.
df = spark.createDataFrame(data, schema=columns)

# Print the rows as a table (show() prints at most 20 rows by default).
print("First few rows:")
df.show()

# Print each column's name, type and nullability.
print("Schema of the DataFrame:")
df.printSchema()

# Shut the session down once the example is finished.
spark.stop()




First few rows:
+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 29|
|  2|  Bob| 31|
|  3|Cathy| 25|
+---+-----+---+

Schema of the DataFrame:
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [8]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session used by this example.
spark = (
    SparkSession.builder
    .appName("Summary Statistics Example")
    .getOrCreate()
)

# Column names, followed by one tuple per row in the same field order.
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 29),
    (2, "Bob", 31),
    (3, "Cathy", 25),
    (4, "David", 40),
    (5, "Eve", 35),
]

# Build the DataFrame from the rows and column names above.
df = spark.createDataFrame(data, schema=columns)

# Narrow to the numeric "age" column, then compute count, mean, stddev,
# min and max for it.
summary_stats = df.select("age").describe()

# Print the statistics table.
summary_stats.show()

# Shut the session down once the example is finished.
spark.stop()




+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|                 5|
|   mean|              32.0|
| stddev|5.7445626465380295|
|    min|                25|
|    max|                40|
+-------+------------------+

