# Setting up your spark context
We are going to use the [PySpark API](https://spark.apache.org/docs/2.3.1/quick-start.html); Spark can also be used from Scala, R, and other languages.

## In this Notebook
1. create a context
2. first data frame steps
3. read data into data frame from file

In [53]:
# Initialize the Spark environment (takes ~1 min).
import pyspark
from pyspark.sql import SparkSession

# Build a local SparkSession named "odl". getOrCreate() hands back the session
# that is already running if there is one, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .master("local")
    .appName("odl")
    .getOrCreate()
)

# Legacy entry points derived from the session. `sparkContext` is a property
# (not a method). A later cell displays `sqlc`, so it has to be assigned here
# for the notebook to run top-to-bottom on a fresh kernel.
# NOTE: SQLContext is deprecated in Spark 3.x in favour of SparkSession; it is
# only kept because that later cell refers to it.
sc = spark.sparkContext
sqlc = pyspark.sql.SQLContext(sc, spark)

 

In [25]:
import pyspark.sql.functions as sf

In [54]:
# Display the SparkSession as the cell output to confirm it was created.
spark

In [55]:
# Display the SQLContext; `sqlc` must already have been assigned by an earlier cell.
sqlc

<pyspark.sql.context.SQLContext at 0x7f2f867bf080>

# DataFrames
Dataframes are a set of instructions to create a dataset

* NB: In R and Python, dataframes actually contain the data and take up lots of space in memory
* NB: In Spark, dataframes are a lazily evaluated set of instructions and take up no space in memory until an action such as `show()` or `take()` runs

# Reading in data from a file

In [74]:
# Sean
import os  # kept: other cells may rely on `os` being imported here


def read_csv_with_header(path):
    """Read a CSV file into a Spark DataFrame.

    The first row of the file is used for the column names and Spark infers
    the column types from the data.

    Args:
        path: Location of the CSV file, passed unchanged to the Spark reader.

    Returns:
        The DataFrame produced by the session's reader for ``path``.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(path)
    )


# The three source extracts; the paths are relative, so the files are expected
# to sit where the Spark session resolves relative paths (presumably the
# notebook's working directory with a local master - verify).
df_ass = read_csv_with_header("Real_Estate_Current_Assessments.csv")
df_detail = read_csv_with_header("Parcel_Area_Details.csv")
df_resdet = read_csv_with_header("Real_Estate_Residential_Details.csv")



In [75]:
# Peek at the first two rows of each imported table as a sanity check.
for frame in (df_ass, df_detail, df_resdet):
    print(frame.take(2))

[Row(ParcelNumber='130015000', CurrentAssessedValue=712400, OBJECTID=22485233, st_number='2110', st_name='MINOR RD', st_unit=None, LEGALDESCR='LOT 15 MERRYDEN', LOTSQFT=17380.44), Row(ParcelNumber='090001000', CurrentAssessedValue=950500, OBJECTID=22485234, st_number='1702', st_name='GORDON AVE', st_unit=None, LEGALDESCR='PT LOTS 1 & 2 BK 1 PRESTON HGT', LOTSQFT=0.0)]
[Row(OBJECTID=1, Assessment=400800, FileType='R', GeoParcelIdentificationNumber=3, IsMultiParcelPolygon=None, Label='104', LegalDescription='LOT 63A&PAR X-11 GREENBRIER HG', LotSquareFeet=22172.04, MapPage='42C', ModifiedDate=None, OwnerName='PHAM, HUNG QUANG & SIMONETTA LIUTI', OwnerAddress='2313 GLENN COURT', OwnerCityState='CHARLOTTESVILLE VA', OwnerZipCode='22901', ParcelNumber='42C104000', StreetName='GLENN CT', StreetNumber='2313', TaxYear='2018 Value:', Text=' ', Unit=None, Zoning='R-1'), Row(OBJECTID=6, Assessment=357000, FileType='R', GeoParcelIdentificationNumber=11, IsMultiParcelPolygon=None, Label='102', Legal

In [76]:
# Sean
# Print the inferred column names and types of every imported table.
for frame in (df_ass, df_detail, df_resdet):
    frame.printSchema()

root
 |-- ParcelNumber: string (nullable = true)
 |-- CurrentAssessedValue: integer (nullable = true)
 |-- OBJECTID: integer (nullable = true)
 |-- st_number: string (nullable = true)
 |-- st_name: string (nullable = true)
 |-- st_unit: string (nullable = true)
 |-- LEGALDESCR: string (nullable = true)
 |-- LOTSQFT: double (nullable = true)

root
 |-- OBJECTID: integer (nullable = true)
 |-- Assessment: integer (nullable = true)
 |-- FileType: string (nullable = true)
 |-- GeoParcelIdentificationNumber: integer (nullable = true)
 |-- IsMultiParcelPolygon: integer (nullable = true)
 |-- Label: string (nullable = true)
 |-- LegalDescription: string (nullable = true)
 |-- LotSquareFeet: double (nullable = true)
 |-- MapPage: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)
 |-- OwnerName: string (nullable = true)
 |-- OwnerAddress: string (nullable = true)
 |-- OwnerCityState: string (nullable = true)
 |-- OwnerZipCode: string (nullable = true)
 |-- ParcelNumber: st

In [79]:
# Count the non-null values of a key column in each table.
# DataFrame.agg(...) is shorthand for groupBy().agg(...): one aggregate over
# the whole frame, returned as a single-row DataFrame with column `cnt`.
acnt = df_ass.agg(sf.count('OBJECTID').alias('cnt'))
acnt.show()

dcnt = df_detail.agg(sf.count('OBJECTID').alias('cnt'))
dcnt.show()

# The residential-detail table is counted on ParcelNumber instead.
rdcnt = df_resdet.agg(sf.count('ParcelNumber').alias('cnt'))
rdcnt.show()

+-----+
|  cnt|
+-----+
|15561|
+-----+

+-----+
|  cnt|
+-----+
|13784|
+-----+

+-----+
|  cnt|
+-----+
|14521|
+-----+



In [81]:
# Sean
# Expose each DataFrame to Spark SQL under a short name so that spark.sql(...)
# queries can refer to them. createOrReplaceTempView is the replacement for
# registerTempTable (deprecated since Spark 2.0); re-running the cell simply
# replaces the existing views.
df_ass.createOrReplaceTempView("assessments")
df_detail.createOrReplaceTempView("detail")
df_resdet.createOrReplaceTempView("resdetail")

In [137]:
# Build `flattable`: one wide table combining the three temp tables
# assessments (ass), detail (det) and resdetail (rd), inner-joined on
# ParcelNumber.
#  * ParcelNumber, OBJECTID and the st_* address columns are taken from
#    assessments; the st_* columns are renamed StreetNumber / StreetName / Unit.
#  * The remaining columns are unqualified, so Spark resolves each one from
#    whichever joined table defines it.
#  * The trailing backslashes inside the triple-quoted string are Python line
#    continuations: the newline after them is dropped, so the FROM/JOIN clauses
#    reach Spark as a single line.
# NOTE(review): an inner join drops parcels missing from any of the tables and
# would duplicate rows if ParcelNumber repeats within a table - confirm that
# ParcelNumber is unique per table.
# Earlier attempt (left join on OBJECTID), kept for reference:
#flattable = spark.sql(""" SELECT * FROM assessments ass left join detail det on ass.OBJECTID = det.OBJECTID """)
flattable = spark.sql(""" 
SELECT ass.ParcelNumber,
 CurrentAssessedValue,
 ass.OBJECTID,
 ass.st_number StreetNumber,
 ass.st_name StreetName,
 ass.st_unit Unit,
 LEGALDESCR,
 LOTSQFT,
 Assessment,
 FileType,
 GeoParcelIdentificationNumber,
 IsMultiParcelPolygon,
 Label,
 LegalDescription,
 LotSquareFeet,
 MapPage,
 ModifiedDate,
 OwnerName,
 OwnerAddress,
 OwnerCityState,
 OwnerZipCode,
 TaxYear,
 Text,
 Zoning,
 RecordID_Int,
 UseCode,
 Style,
 Grade,
 Roof,
 Flooring,
 Heating,
 Fireplace,
 YearBuilt,
 TotalRooms,
 Bedrooms,
 HalfBathrooms,
 FullBathrooms,
 BasementGarage,
 Basement,
 FinishedBasement,
 BasementType,
 ExternalWalls,
 NumberOfStories,
 SquareFootageFinishedLiving

FROM assessments ass \
inner join detail det on ass.ParcelNumber = det.ParcelNumber \
inner join resdetail rd on ass.ParcelNumber = rd.ParcelNumber""")



In [138]:
# Inspect the joined table: its schema, the number of rows with a non-null
# LEGALDESCR, and a two-row sample.
flattable.printSchema()

# DataFrame.agg(...) is shorthand for groupby().agg(...).
legaldescr_count = sf.count('LEGALDESCR').alias('cnt')
fcnt = flattable.agg(legaldescr_count)
fcnt.show()

sample_rows = flattable.take(2)
print(sample_rows)

root
 |-- ParcelNumber: string (nullable = true)
 |-- CurrentAssessedValue: integer (nullable = true)
 |-- OBJECTID: integer (nullable = true)
 |-- StreetNumber: string (nullable = true)
 |-- StreetName: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- LEGALDESCR: string (nullable = true)
 |-- LOTSQFT: double (nullable = true)
 |-- Assessment: integer (nullable = true)
 |-- FileType: string (nullable = true)
 |-- GeoParcelIdentificationNumber: integer (nullable = true)
 |-- IsMultiParcelPolygon: integer (nullable = true)
 |-- Label: string (nullable = true)
 |-- LegalDescription: string (nullable = true)
 |-- LotSquareFeet: double (nullable = true)
 |-- MapPage: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)
 |-- OwnerName: string (nullable = true)
 |-- OwnerAddress: string (nullable = true)
 |-- OwnerCityState: string (nullable = true)
 |-- OwnerZipCode: string (nullable = true)
 |-- TaxYear: string (nullable = true)
 |-- Text: string (nullabl

In [139]:
# Save imported data as Parquet file.
# The format is named explicitly instead of relying on the session's default
# data source, and mode("overwrite") replaces output left by a previous run:
# with the default save mode the write fails once the path exists, which makes
# the notebook impossible to re-run from the top.
flattable.write.mode("overwrite").parquet("AssessmentDetailResdetail.parquet")

In [140]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# Correlation between the assessed value and the finished living area,
# computed by Spark as a single aggregate and collected to the driver.
# To check another predictor, swap the second column for "LOTSQFT" or
# "TotalRooms" (the variants this cell previously kept commented out).
corr_expr = sf.corr("CurrentAssessedValue", "SquareFootageFinishedLiving").alias('c')
r3 = flattable.agg(corr_expr).collect()
r3

[Row(c=0.17893087842193411)]

### Do Feature Transformation

In [141]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

#stringIndexer = StringIndexer(inputCol="Zoning", outputCol="ZoningIndex")
#model = stringIndexer.fit(flattable)
#indexed = model.transform(flattable)

#encoder = OneHotEncoder(inputCol="ZoningIndex", outputCol="ZoningVec")
#encoder.transform(indexed).head(2)

# Categorical columns that are turned into features.
cols = [
    'UseCode', 'Style', 'Grade', 'Roof',
    'Flooring', 'Heating', 'BasementType', 'ExternalWalls',
]

# For every categorical column build a StringIndexer writing <col>_indexed and
# a OneHotEncoder that reads it and writes <col>_indexed_encoded.
indexers = []
encoders = []
for col_name in cols:
    indexed_col = "{0}_indexed".format(col_name)
    encoded_col = "{0}_encoded".format(indexed_col)
    indexers.append(StringIndexer(inputCol=col_name, outputCol=indexed_col))
    encoders.append(OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col))

# Concatenate all encoded columns into a single "features" vector column.
encoded_cols = [encoder.getOutputCol() for encoder in encoders]
assembler = VectorAssembler(inputCols=encoded_cols, outputCol="features")

# Stage order matters: all indexers first, then the encoders, then the assembler.
pipeline = Pipeline(stages=indexers + encoders + [assembler])
fitted_pipeline = pipeline.fit(flattable)
fitted_pipeline.transform(flattable).show()


+------------+--------------------+--------+------------+-----------+----+--------------------+---------+----------+--------+-----------------------------+--------------------+-----+--------------------+-------------+-------+-------------------+--------------------+--------------------+------------------+------------+-----------+----+------+------------+--------------------+-----------+-----+--------+----------+------------------+---------+---------+----------+--------+-------------+-------------+--------------+--------+----------------+----------------+-------------+---------------+---------------------------+---------------+-------------+-------------+------------+----------------+---------------+--------------------+---------------------+-----------------------+---------------------+---------------------+--------------------+------------------------+-----------------------+----------------------------+-----------------------------+--------------------+
|ParcelNumber|CurrentAssessedV

In [148]:
# View counts of categorical items

# Query template: {0} is replaced by the column name. It joins the three temp
# tables on ParcelNumber and lists each value of the column with its count,
# most frequent first. The backslashes are line continuations inside the string.
count_query = """SELECT {0}, count({0}) 
    FROM assessments ass \
    inner join detail det on ass.ParcelNumber = det.ParcelNumber \
    inner join resdetail rd on ass.ParcelNumber = rd.ParcelNumber
    group by {0} \
    order by count({0}) desc"""

# Run the query once per categorical column and show the result.
for c in cols:
    temp = spark.sql(count_query.format(c))
    temp.show()


+--------------------+--------------+
|             UseCode|count(UseCode)|
+--------------------+--------------+
|       Single Family|          7829|
|Single Family Att...|          1228|
|         Condominium|          1074|
|         Vacant Land|           975|
|              Duplex|           845|
|Single Family-1 C...|           802|
|Single Family-2 C...|            65|
|         Parking Lot|            63|
|Fraternity\Sorori...|            51|
|   Condo Common Area|            35|
|Single Family-3 C...|            22|
|Apartments 1-10 u...|            10|
|Apartments over 2...|             8|
|              Office|             7|
|     Office Building|             6|
|       Rooming House|             5|
|Apartments 11-20 ...|             5|
|            Quadplex|             5|
|         Common Area|             4|
|  Condominium - Flex|             3|
+--------------------+--------------+
only showing top 20 rows

+------------------+------------+
|             Style|count(St