In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import desc

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Keep a handle to the SparkContext underlying the session.
sc = spark.sparkContext

In [4]:
# Load the CSV: the first row supplies the column names and Spark infers each column's type.
df = spark.read.csv('final_data_set.csv', header=True, inferSchema=True)

In [5]:
# Remove the '_c0' column (presumably an unnamed index column saved with the CSV — confirm).
df = df.drop('_c0')

In [6]:
# Print column names, inferred types and nullability.
df.printSchema()

root
 |-- price: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- grade: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- sqft_living15: integer (nullable = true)
 |-- sqft_lot15: integer (nullable = true)
 |-- walk_score: integer (nullable = true)
 |-- transit_score: integer (nullable = true)
 |-- pers_crime_score: integer (nullable = true)
 |-- prop_crime_score: integer (nullable = true)
 |-- income: integer (nullable = true)
 |-- renovated_yrs_ago: integer (nullable = true)
 |-- log_price: double (nullable = true)



In [7]:
# Peek at the first three rows.
df.show(n=3)

+------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+-------------+----------+----------+-------------+----------------+----------------+------+-----------------+------------------+
| price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|sqft_living15|sqft_lot15|walk_score|transit_score|pers_crime_score|prop_crime_score|income|renovated_yrs_ago|         log_price|
+------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+-------------+----------+----------+-------------+----------------+----------------+------+-----------------+------------------+
|221900|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|      60|         1340|      5650|        46|           46|               2|               3| 71524|               60|12.309982108920686|


In [8]:
# Count, mean, stddev, min and max for the price column.
df.describe('price').show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|              5852|
|   mean| 553231.8257006152|
| stddev|247486.31361324288|
|    min|             90000|
|    max|           1570000|
+-------+------------------+



In [9]:
# Compute summary statistics with Spark and pull the (small) result into a pandas DataFrame.
x = df.describe().toPandas()

In [10]:
# Display the pandas summary table.
x

Unnamed: 0,summary,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_built,sqft_living15,sqft_lot15,walk_score,transit_score,pers_crime_score,prop_crime_score,income,renovated_yrs_ago,log_price
0,count,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,...,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0,5852.0
1,mean,553231.8257006152,3.126794258373206,1.9082792207792207,1755.2424812030076,5288.316131237184,1.527682843472317,0.0018796992481203,0.2441900205058099,3.4448051948051948,...,61.30587833219412,1647.2812713602186,5202.725563909775,67.68113465481886,52.604750512645246,1.960868079289132,2.099794941900205,81775.06818181818,57.034005468216,13.13247397082676
2,stddev,247486.31361324288,1.071525033061304,0.7756144290599073,721.24214276076,5906.49018430074,0.6112397050160592,0.0433184329838561,0.7471306086749057,0.7036749588932281,...,35.04861598472822,459.8347984633792,6063.99439546608,18.92226538694838,11.10841208248582,0.831366665959762,0.8804568112835992,26074.238176516246,35.831109923801336,0.4261721665779862
3,min,90000.0,0.0,0.0,370.0,520.0,1.0,0.0,0.0,1.0,...,0.0,460.0,651.0,0.0,0.0,1.0,1.0,12269.0,0.0,11.407564949312402
4,max,1570000.0,33.0,7.5,6070.0,219978.0,3.5,1.0,4.0,5.0,...,115.0,5600.0,216928.0,99.0,95.0,4.0,4.0,199542.0,115.0,14.266586177324491


In [11]:
# Fetch the first two rows to the driver as a list of Row objects.
df.head(2)

[Row(price=221900, bedrooms=3, bathrooms=1.0, sqft_living=1180, sqft_lot=5650, floors=1.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=1180, sqft_basement=0, yr_built=60, sqft_living15=1340, sqft_lot15=5650, walk_score=46, transit_score=46, pers_crime_score=2, prop_crime_score=3, income=71524, renovated_yrs_ago=60, log_price=12.309982108920686),
 Row(price=538000, bedrooms=3, bathrooms=2.25, sqft_living=2570, sqft_lot=7242, floors=2.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=2170, sqft_basement=400, yr_built=64, sqft_living15=1690, sqft_lot15=7639, walk_score=64, transit_score=50, pers_crime_score=2, prop_crime_score=3, income=56208, renovated_yrs_ago=24, log_price=13.195613839143922)]

In [12]:
# Expose the DataFrame to Spark SQL under the name 'df'.
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run safely because it overwrites any existing view with the same name.
df.createOrReplaceTempView('df')

# Average sale price for each bedroom count.
bedrooms = spark.sql(r'''SELECT avg(  price), bedrooms FROM df GROUP BY bedrooms''')

bedrooms.show()

+-----------------+--------+
|       avg(price)|bedrooms|
+-----------------+--------+
|335107.0202020202|       1|
|724756.8928571428|       6|
|532000.7742316785|       3|
|707386.0604229607|       5|
|893999.8333333334|       9|
|685722.4777358491|       4|
|         715600.0|       8|
|         685830.0|       7|
|         660000.0|      10|
|         520000.0|      11|
|         640000.0|      33|
|435478.5604699378|       2|
|         691500.0|       0|
+-----------------+--------+



In [13]:
# First ten rows of just the price and bedrooms columns.
df.select(['price', 'bedrooms']).show(n=10)

+------+--------+
| price|bedrooms|
+------+--------+
|221900|       3|
|538000|       3|
|180000|       2|
|662500|       3|
|468000|       2|
|530000|       5|
|650000|       4|
|485000|       4|
|385000|       4|
|937000|       3|
+------+--------+
only showing top 10 rows



In [14]:
# Number of homes for each bedroom count.
(df.groupBy('bedrooms')
   .count()
   .show())

+--------+-----+
|bedrooms|count|
+--------+-----+
|       1|   99|
|       6|   84|
|       3| 2538|
|       5|  331|
|       9|    6|
|       4| 1325|
|       8|    5|
|       7|   12|
|      10|    1|
|      11|    1|
|      33|    1|
|       2| 1447|
|       0|    2|
+--------+-----+



In [15]:
# Price per square foot of living space, shown as an extra column.
# withColumn returns a new DataFrame; assign the result to keep the column.
df.withColumn('price_per_sq_ft', df.price / df.sqft_living).show(n=5)

+------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+-------------+----------+----------+-------------+----------------+----------------+------+-----------------+------------------+------------------+
| price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|sqft_living15|sqft_lot15|walk_score|transit_score|pers_crime_score|prop_crime_score|income|renovated_yrs_ago|         log_price|   price_per_sq_ft|
+------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+-------------+----------+----------+-------------+----------------+----------------+------+-----------------+------------------+------------------+
|221900|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|      60|         1340|      5650|        46|           46|               2|     

In [16]:
# Flag homes whose condition rating is below 4. The result is only displayed here,
# not saved back to df.
# The return type is declared explicitly: without it udf() defaults to StringType,
# so new_column would be a string column rather than a real boolean.
# A null condition is passed through as null instead of raising on `None < 4`.
condition_udf = udf(
    lambda condition: None if condition is None else condition < 4,
    BooleanType(),
)
df.withColumn('new_column', condition_udf(df['condition'])).show(5)

+------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+-------------+----------+----------+-------------+----------------+----------------+------+-----------------+------------------+----------+
| price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|sqft_living15|sqft_lot15|walk_score|transit_score|pers_crime_score|prop_crime_score|income|renovated_yrs_ago|         log_price|new_column|
+------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+-------------+----------+----------+-------------+----------------+----------------+------+-----------------+------------------+----------+
|221900|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|      60|         1340|      5650|        46|           46|               2|               3| 71524|     

In [17]:
# Ten most common bathroom counts, most frequent first.
(df.groupBy('bathrooms')
   .count()
   .orderBy(desc('count'))
   .show(10))

+---------+-----+
|bathrooms|count|
+---------+-----+
|      1.0| 1629|
|     1.75|  837|
|      2.5|  779|
|      2.0|  667|
|      1.5|  538|
|     2.25|  430|
|      3.0|  252|
|     2.75|  242|
|      3.5|  181|
|     3.25|  176|
+---------+-----+
only showing top 10 rows



#### Train-test split

In [29]:
# Feature matrix: every predictor column, listed explicitly.
X = df.select(
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built',
    'sqft_living15', 'sqft_lot15',
    'walk_score', 'transit_score',
    'pers_crime_score', 'prop_crime_score',
    'income', 'renovated_yrs_ago',
)

# Target: log-transformed sale price.
y = df.select('log_price')

In [32]:
# Split the full DataFrame once so features and target always come from the same rows.
# The previous version split X a second time (with a different seed) to produce
# y_train/y_test, so the "targets" were feature rows that did not line up with
# X_train/X_test.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=41)

# Project the feature and target columns out of each half.
X_train, X_test = train_df.select(X.columns), test_df.select(X.columns)
y_train, y_test = train_df.select(y.columns), test_df.select(y.columns)

In [50]:
# Print the physical plan Spark will use to compute df.
df.explain()

== Physical Plan ==
*(1) FileScan csv [price#11,bedrooms#12,bathrooms#13,sqft_living#14,sqft_lot#15,floors#16,waterfront#17,view#18,condition#19,grade#20,sqft_above#21,sqft_basement#22,yr_built#23,sqft_living15#24,sqft_lot15#25,walk_score#26,transit_score#27,pers_crime_score#28,prop_crime_score#29,income#30,renovated_yrs_ago#31,log_price#32] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/brenner/project_luther/final_data_set.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<price:int,bedrooms:int,bathrooms:double,sqft_living:int,sqft_lot:int,floors:double,waterfr...
