
# **Running PySpark in Colab**

To run Spark in Colab, you need a proper setup. The next cell creates the setup required for running Spark/PySpark:

In [None]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# set your spark folder to your system path environment.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark


In [None]:
# findspark.init() must run before pyspark is imported, so the order of the
# next three statements matters.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Build a session with a local master, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
path = "/content/BostonHousing.csv"

# (1) Read the CSV file: the first line is treated as the header and the
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)


# (2) Show the first 5 rows of the dataset
df.show(5)

In [None]:
# (3) Count the number of observations (rows) in the dataset and report it
observation_count = df.count()
print(f"Number of observations: {observation_count}")


Number of observations: 506


In [None]:
# (4) Show the schema of the dataset (column names, types and nullability)
df.printSchema()


root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [None]:
# (5) Drop the column "b" from the dataset, then print the schema to confirm it is gone
df = df.drop("b")
df.printSchema()


root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [None]:

# (6) Round all of the numerical columns to two decimal places
# Imported under an alias so that Python's built-in round() is not shadowed.
from pyspark.sql.functions import round as spark_round
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType

# field.dataType is a DataType instance (e.g. DoubleType()), not a string, so
# the numeric columns have to be selected with isinstance(). Comparing the
# instance against type-name strings matches nothing and silently skips the
# rounding altogether.
NUMERIC_TYPES = (IntegerType, LongType, DoubleType, FloatType)
numerical_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, NUMERIC_TYPES)
]

for column in numerical_columns:
    df = df.withColumn(column, spark_round(df[column], 2))


In [None]:
# (7) Create a new column (Age10) holding the 'age' column increased by 10%
from pyspark.sql.functions import col

df = df.withColumn("Age10", col("age") * 1.10)

# (8) Plot a histogram of the Age10 column on a 2D plot
import matplotlib.pyplot as plt

# Convert only the Age10 column of the Spark DataFrame to pandas for plotting
age10_pd = df.select("Age10").toPandas()

# Plot the histogram with 30 bins and black bar edges
plt.hist(age10_pd["Age10"], bins=30, edgecolor='black')
plt.xlabel('Age10')
plt.ylabel('Frequency')
plt.title('Histogram of Age10')
plt.show()

# (9) Provide summary statistics of all columns (count, mean, stddev, min, max);
# this includes the Age10 column added above
summary_statistics = df.describe()
summary_statistics.show()

# (10) Convert the whole Spark DataFrame to a pandas DataFrame
pandas_df = df.toPandas()

# Show the last 5 rows of the pandas DataFrame (tail() returns 5 rows by default)
print(pandas_df.tail())