In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('usecase_3').getOrCreate()

In [4]:
sc = spark.sparkContext

In [5]:
sc.setLogLevel('ERROR')

#### HVAC Data

In [6]:
hvac_df = spark.read.format('csv').options(header=True, inferSchema=True).load('Sensor data/HVAC.csv')

In [8]:
hvac_df.show(5)

+------+-------+----------+----------+------+---------+----------+
|  Date|   Time|TargetTemp|ActualTemp|System|SystemAge|BuildingID|
+------+-------+----------+----------+------+---------+----------+
|6/1/13|0:00:01|        66|        58|    13|       20|         4|
|6/2/13|1:00:01|        69|        68|     3|       20|        17|
|6/3/13|2:00:01|        70|        73|    17|       20|        18|
|6/4/13|3:00:01|        67|        63|     2|       23|        15|
|6/5/13|4:00:01|        68|        74|    16|        9|         3|
+------+-------+----------+----------+------+---------+----------+
only showing top 5 rows



In [9]:
hvac_df.createOrReplaceTempView('HVAC') 

In [94]:
hvac_1 = spark.sql('select *, CASE WHEN ABS(TargetTemp-ActualTemp)>5 THEN 1 ELSE 0 END AS tempchange FROM HVAC')

In [95]:
hvac_1.show(5)

+------+-------+----------+----------+------+---------+----------+----------+
|  Date|   Time|TargetTemp|ActualTemp|System|SystemAge|BuildingID|tempchange|
+------+-------+----------+----------+------+---------+----------+----------+
|6/1/13|0:00:01|        66|        58|    13|       20|         4|         1|
|6/2/13|1:00:01|        69|        68|     3|       20|        17|         0|
|6/3/13|2:00:01|        70|        73|    17|       20|        18|         0|
|6/4/13|3:00:01|        67|        63|     2|       23|        15|         0|
|6/5/13|4:00:01|        68|        74|    16|        9|         3|         1|
+------+-------+----------+----------+------+---------+----------+----------+
only showing top 5 rows



In [96]:
hvac_1.createOrReplaceTempView('HVAC1')

#### Building Data

In [97]:
building_df = spark.read.format('csv').options(header=True, inferSchema=True).load('Sensor data/building.csv')

In [98]:
building_df.show(5)

+----------+-----------+-----------+-----------+---------+
|BuildingID|BuildingMgr|BuildingAge|HVACproduct|  Country|
+----------+-----------+-----------+-----------+---------+
|         1|         M1|         25|     AC1000|      USA|
|         2|         M2|         27|     FN39TG|   France|
|         3|         M3|         28|     JDNS77|   Brazil|
|         4|         M4|         17|     GG1919|  Finland|
|         5|         M5|          3|    ACMAX22|Hong Kong|
+----------+-----------+-----------+-----------+---------+
only showing top 5 rows



In [99]:
building_df.createOrReplaceTempView('BUILDING')

In [100]:
final_df = spark.sql('SELECT h.*, b.BuildingAge, b.HVACproduct, b.Country FROM HVAC1 h JOIN BUILDING b on h.BuildingID = b.BuildingID WHERE h.tempchange=1')

In [101]:
final_df.createOrReplaceTempView('BUILDINGS_HVAC')

In [102]:
final_df.select('tempchange','Country').show(5)

+----------+-------+
|tempchange|Country|
+----------+-------+
|         1|Finland|
|         1| Brazil|
|         1|Finland|
|         1| France|
|         1|Finland|
+----------+-------+
only showing top 5 rows



### Country with more temperature variation

In [105]:
spark.sql("SELECT Country, count(tempchange) as tempchange FROM BUILDINGS_HVAC GROUP BY Country ORDER BY tempchange DESC").show(5)

+---------+----------+
|  Country|tempchange|
+---------+----------+
|  Finland|       473|
|   France|       251|
|Hong Kong|       248|
|   Turkey|       243|
|Indonesia|       243|
+---------+----------+
only showing top 5 rows



### Closing Spark Session

In [106]:
spark.stop()