In [None]:
import pyspark

In [None]:
# Location of the CSV file that the read cell below loads.
path = "/content/sample_data/california_housing_test.csv"

In [None]:
# Start (or reuse) the Spark session that every later cell works through.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("PySpark Tutorial")
    .getOrCreate()
)

In [None]:
# Load the CSV at `path` into a DataFrame: the first line supplies the column
# names, column types are inferred from the data, and fields are comma-separated.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load(path)
)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
+---------+--------+----

In [None]:
# Total number of rows in df.
df.count()

3000

In [None]:
# Column names from index 3 onward (total_rooms through median_house_value).
df.columns[3:]

['total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [None]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [None]:
# Fetch the first four rows as a list of Row objects (not a formatted table).
df.head(n=4)

[Row(longitude=-122.05, latitude=37.37, housing_median_age=27.0, total_rooms=3885.0, total_bedrooms=661.0, population=1537.0, households=606.0, median_income=6.6085, median_house_value=344700.0),
 Row(longitude=-118.3, latitude=34.26, housing_median_age=43.0, total_rooms=1510.0, total_bedrooms=310.0, population=809.0, households=277.0, median_income=3.599, median_house_value=176500.0),
 Row(longitude=-117.81, latitude=33.78, housing_median_age=27.0, total_rooms=3589.0, total_bedrooms=507.0, population=1484.0, households=495.0, median_income=5.7934, median_house_value=270500.0),
 Row(longitude=-118.36, latitude=33.82, housing_median_age=28.0, total_rooms=67.0, total_bedrooms=15.0, population=49.0, households=11.0, median_income=6.1359, median_house_value=330000.0)]

In [None]:
# Summary statistics (count, mean, stddev, ...) for every column of df.
df.describe().show()

+-------+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|summary|          longitude|          latitude|housing_median_age|      total_rooms|    total_bedrooms|        population|        households|     median_income|median_house_value|
+-------+-------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|  count|               3000|              3000|              3000|             3000|              3000|              3000|              3000|              3000|              3000|
|   mean|-119.58920000000029| 35.63538999999999|28.845333333333333|2599.578666666667| 529.9506666666666|1402.7986666666666|           489.912| 3.807271799999998|        205846.275|
| stddev| 1.9949362939550166|2.1296695233438334|12.555395554955757|2155.593331625582|415.654368

In [None]:
# List all column names of df, in order.
df.columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [None]:
# Keep only the columns at positions 2-5 (housing_median_age .. population) and
# pull the first four rows back to the driver. Despite its name, new_df is a
# plain Python list of Row objects here, not a DataFrame.
new_df = (
    df.select(*df.columns[2:6])
    .take(4)
)

In [None]:
# Show each collected Row as a {column name: value} dictionary, one per line.
for record in new_df:
    print(record.asDict())

{'housing_median_age': 27.0, 'total_rooms': 3885.0, 'total_bedrooms': 661.0, 'population': 1537.0}
{'housing_median_age': 43.0, 'total_rooms': 1510.0, 'total_bedrooms': 310.0, 'population': 809.0}
{'housing_median_age': 27.0, 'total_rooms': 3589.0, 'total_bedrooms': 507.0, 'population': 1484.0}
{'housing_median_age': 28.0, 'total_rooms': 67.0, 'total_bedrooms': 15.0, 'population': 49.0}


In [None]:
# Re-display the first five rows of the full DataFrame before adding a derived column.
df.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
+---------+--------+----

In [None]:
from pyspark.sql.functions import when

In [None]:
# Add a "Value_Type" label to every row: "Costly" when median_house_value is at
# least 50000, otherwise "Less Costly". From here on new_df is a DataFrame.
new_df = df.withColumn(
    "Value_Type",
    when(df["median_house_value"] >= 50000, "Costly").otherwise("Less Costly"),
)
new_df.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+----------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|Value_Type|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+----------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|    Costly|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|    Costly|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|    Costly|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|    Costly|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|   

In [None]:
# Show only the rows labelled "Less Costly".
less_costly_mask = new_df["Value_Type"] == "Less Costly"
new_df.filter(less_costly_mask).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+-----------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value| Value_Type|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+-----------+
|  -119.42|   35.97|              21.0|      554.0|         121.0|     426.0|     122.0|       2.3516|           47500.0|Less Costly|
|  -122.14|   40.07|              31.0|     2053.0|         465.0|    1193.0|     447.0|       1.4923|           44400.0|Less Costly|
|   -121.3|   37.95|               9.0|      674.0|         242.0|     575.0|     193.0|       2.2024|           45000.0|Less Costly|
|  -119.58|    36.1|              21.0|     1382.0|         327.0|    1469.0|     355.0|       1.3967|           46500.0|Less Costly|
|  -119.02|   36.06|              41.0|     2279.0|         53

In [None]:
# Rename the derived label column from "Value_Type" to "Value_Type_New" and preview
# the result. new_df is rebound, so the old column name is no longer available on it.
new_df = new_df.withColumnRenamed(existing="Value_Type", new="Value_Type_New")
new_df.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|Value_Type_New|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|        Costly|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|        Costly|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|        Costly|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|        Costly|
|  -119.67|   36.33|              19.0|  

In [None]:
%%timeit
new_df.drop("Value_Type_New").show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
+---------+--------+----