<a href="https://colab.research.google.com/github/Vasugi2003/Big-Data-Analytics/blob/main/House_price_prediction_Decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=ff5447dfe763032face2c557e2eeef3e02776c3cd894164a448554c537450c6d
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the SparkSession used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("HousePriceClassification")
    .getOrCreate()
)

In [None]:
# Location of the raw CSV inside the Colab runtime.
filepath = "/content/houseprice.csv"

# Read the file with its first row as the header and let Spark infer column types.
df1 = spark.read.options(header=True, inferSchema=True).csv(filepath)

# Preview the first rows of the loaded data.
df1.show()


+-------------------+---------+--------+---------+-----------+--------+------+----------+----+---------+----------+-------------+--------+------------+--------------------+----------------+--------+-------+
|               date|    price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|sqft_above|sqft_basement|yr_built|yr_renovated|              street|            city|statezip|country|
+-------------------+---------+--------+---------+-----------+--------+------+----------+----+---------+----------+-------------+--------+------------+--------------------+----------------+--------+-------+
|2014-05-02 00:00:00| 313000.0|     3.0|      1.5|       1340|    7912|   1.5|         0|   0|        3|      1340|            0|    1955|        2005|18810 Densmore Ave N|       Shoreline|WA 98133|    USA|
|2014-05-02 00:00:00|2384000.0|     5.0|      2.5|       3650|    9050|   2.0|         0|   4|        5|      3370|          280|    1921|           0|     709 W Blaine St|

In [None]:
# Print the column names and the types Spark inferred when reading the CSV.
df1.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- yr_renovated: integer (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- statezip: string (nullable = true)
 |-- country: string (nullable = true)



In [None]:

# Prices strictly above this threshold are labelled 1 ("expensive"), all others 0.
PRICE_THRESHOLD = 600000

# Replace the continuous price with a binary integer label.
# `when` / `col` are reached through the `F` alias imported at the top of the
# notebook; the bare names were never imported and raise NameError on a fresh kernel.
# NOTE: this overwrites `price` in place, so the cell must not be re-run without
# reloading df1 first (a second run would compare 0/1 against the threshold).
df1 = df1.withColumn(
    "price",
    F.when(F.col("price") > PRICE_THRESHOLD, 1).otherwise(0).cast(IntegerType()),
)

# Show the resulting DataFrame
df1.show()


+-------------------+-----+--------+---------+-----------+--------+------+----------+----+---------+----------+-------------+--------+------------+--------------------+----------------+--------+-------+
|               date|price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|sqft_above|sqft_basement|yr_built|yr_renovated|              street|            city|statezip|country|
+-------------------+-----+--------+---------+-----------+--------+------+----------+----+---------+----------+-------------+--------+------------+--------------------+----------------+--------+-------+
|2014-05-02 00:00:00|    0|     3.0|      1.5|       1340|    7912|   1.5|         0|   0|        3|      1340|            0|    1955|        2005|18810 Densmore Ave N|       Shoreline|WA 98133|    USA|
|2014-05-02 00:00:00|    1|     5.0|      2.5|       3650|    9050|   2.0|         0|   4|        5|      3370|          280|    1921|           0|     709 W Blaine St|         Seattle|WA 

In [None]:
# Columns fed to the model as predictors.
inputColumns = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'sqft_basement',
]

# Name of the assembled feature-vector column.
outputColumn = "features"

In [None]:
# Cast every predictor column to integer, replacing it under the same name.
# NOTE(review): bathrooms and floors hold fractional values (e.g. 1.5 in the
# preview above), so this cast drops their fractional part.
target_type = IntegerType()
for feature_name in inputColumns:
    df1 = df1.withColumn(feature_name, F.col(feature_name).cast(target_type))

In [None]:
# Index the binary price label into `priceIndex`.
# alphabetAsc orders the labels as "0" < "1", so 0 -> 0.0 and 1 -> 1.0 regardless
# of class frequencies; predictions therefore stay directly comparable with the
# original `price` column.
price_indexer = StringIndexer(
    inputCol="price",
    outputCol="priceIndex",
    stringOrderType="alphabetAsc",
)

# Combine the predictor columns into a single feature vector column.
vector_assembler = VectorAssembler(inputCols=inputColumns, outputCol=outputColumn)

# Train on the indexed label so the indexer stage is actually consumed
# (previously the classifier read `price` and `priceIndex` was never used).
dt_model = DecisionTreeClassifier(labelCol="priceIndex", featuresCol=outputColumn)

In [None]:
# Order matters: index the label, assemble the features, then fit the tree.
stages = [
    price_indexer,
    vector_assembler,
    dt_model,
]

pipeline = Pipeline().setStages(stages)

# 80/20 train/test split with a fixed seed for repeatability.
train_df2, test_df2 = df1.randomSplit(weights=[0.8, 0.2], seed=11)

In [None]:
# Fit all pipeline stages on the training split.
final_pipeline = pipeline.fit(dataset=train_df2)

# Apply the fitted pipeline to the held-out split to obtain predictions.
test_predictions_from_pipeline = final_pipeline.transform(dataset=test_df2)

In [None]:
# Compare the true label with the model's prediction for the first five test rows.
test_predictions_from_pipeline.select(["price", "prediction"]).show(n=5)

+-----+----------+
|price|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



# Score the test-set predictions against the binary price label using accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluator.evaluate(test_predictions_from_pipeline)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7871396895787139
