In [None]:
import sys
from random import random
from operator import add
from pyspark.sql import SparkSession
import pyspark as ps

In [None]:
# Obtain the active SparkSession, creating one if none exists yet.
# NOTE(review): the app name 'PythonPi' looks carried over from another example — confirm it is intended.
spark = (
    SparkSession.builder
    .appName('PythonPi')
    .getOrCreate()
)

In [None]:
# CSV file to load.
path = '/content/kc_house_data.csv'

# Treat the first row as column names and let Spark infer each column's type.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(path)
)

In [None]:
# Display the type of the object returned by the CSV reader.
type(df)

In [None]:
# Preview the first 10 rows of the loaded data.
df.show(n=10)

+----------+---------------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|        id|           date|    price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|    lat|    long|sqft_living15|sqft_lot15|
+----------+---------------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|7129300520|20141013T000000| 221900.0|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|    1955|           0|  98178|47.5112|-122.257|         1340|      5650|
|6414100192|20141209T000000| 538000.0|       3|     2.25|       2570|    7242|   2.0|         0|   0|        3|    7|      2170|          40

In [None]:
# List (column name, type) pairs as inferred when the CSV was read.
df.dtypes

[('id', 'bigint'),
 ('date', 'string'),
 ('price', 'double'),
 ('bedrooms', 'int'),
 ('bathrooms', 'double'),
 ('sqft_living', 'int'),
 ('sqft_lot', 'int'),
 ('floors', 'double'),
 ('waterfront', 'int'),
 ('view', 'int'),
 ('condition', 'int'),
 ('grade', 'int'),
 ('sqft_above', 'int'),
 ('sqft_basement', 'int'),
 ('yr_built', 'int'),
 ('yr_renovated', 'int'),
 ('zipcode', 'int'),
 ('lat', 'double'),
 ('long', 'double'),
 ('sqft_living15', 'int'),
 ('sqft_lot15', 'int')]

In [None]:
# Number of rows in the DataFrame. `count` is a method: without the call
# parentheses the cell only displays the bound-method object, not the total.
df.count()

In [None]:
from pyspark.sql.types import IntegerType, FloatType

# Cast every column whose type is 'int' to float, leaving all other columns
# (including the 'bigint' id) untouched and keeping the original column order.
# A single select() is used instead of calling withColumn() in a loop: each
# withColumn() adds another projection to the query plan, which the PySpark
# documentation advises against when changing many columns.
df = df.select([
    df[col_name].cast(FloatType()).alias(col_name) if data_type == 'int' else df[col_name]
    for col_name, data_type in df.dtypes
])

In [None]:
# Re-inspect the column types after the int-to-float cast.
df.dtypes

[('id', 'bigint'),
 ('date', 'string'),
 ('price', 'double'),
 ('bedrooms', 'float'),
 ('bathrooms', 'double'),
 ('sqft_living', 'float'),
 ('sqft_lot', 'float'),
 ('floors', 'double'),
 ('waterfront', 'float'),
 ('view', 'float'),
 ('condition', 'float'),
 ('grade', 'float'),
 ('sqft_above', 'float'),
 ('sqft_basement', 'float'),
 ('yr_built', 'float'),
 ('yr_renovated', 'float'),
 ('zipcode', 'float'),
 ('lat', 'double'),
 ('long', 'double'),
 ('sqft_living15', 'float'),
 ('sqft_lot15', 'float')]

In [None]:
# Print the schema as a tree, including each column's nullability.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- price: double (nullable = true)
 |-- bedrooms: float (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: float (nullable = true)
 |-- sqft_lot: float (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: float (nullable = true)
 |-- view: float (nullable = true)
 |-- condition: float (nullable = true)
 |-- grade: float (nullable = true)
 |-- sqft_above: float (nullable = true)
 |-- sqft_basement: float (nullable = true)
 |-- yr_built: float (nullable = true)
 |-- yr_renovated: float (nullable = true)
 |-- zipcode: float (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- sqft_living15: float (nullable = true)
 |-- sqft_lot15: float (nullable = true)



ORDENAMIENTO DE RESULTADOS

In [None]:
import pyspark.sql.functions as F

# Show the first 10 rows ordered by zipcode, highest first.
# The functions module is imported as `F`; the previous `F1` alias was never
# defined and raised NameError on a fresh kernel.
df.sort(F.col('zipcode').desc()).show(10)

+----------+---------------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|        id|           date|    price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|    lat|    long|sqft_living15|sqft_lot15|
+----------+---------------+---------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|2021200370|20140901T000000|1100000.0|     3.0|      2.0|     3010.0|  5000.0|   2.0|       0.0| 2.0|      5.0|  9.0|    1890.0|       1120.0|  1931.0|         0.0|98199.0|47.6347|-122.396|       2688.0|    5000.0|
|2864600105|20140624T000000| 819000.0|     3.0|      3.5|     2130.0|  6150.0|   2.0|       0.0| 2.0|      5.0|  8.0|    1530.0|        600.

In [None]:
# Conversion factor from square feet to square metres
SQFT_TO_M2 = 0.092903

# Per-zipcode statistics and number of houses, computed in ONE aggregation.
# Previously the count came from a second groupBy that was joined back on
# 'zipcode'; both aggregations scanned the same data with the same grouping
# key, so the extra pass and the join were redundant.
# Note: an inner join never matches null keys, so a null-zipcode group (if any
# existed) would have been dropped before; it is now kept.
stats_conteo = (
    df.groupBy('zipcode')
      .agg(
          F.round(F.avg('price'), 2).alias('PrecioPromedio'),
          F.round(F.avg(df['sqft_living'] * SQFT_TO_M2), 2).alias('Promedio_m2'),
          F.count('*').alias('count'),
      )
)

# The original intermediate names are kept as lazy projections of the combined
# result so that any later cell referring to them keeps working.
stats_zipcode = stats_conteo.drop('count')
zipcode_counts = stats_conteo.select('zipcode', 'count')

# Sort so the zipcodes with the most houses come first
stats_ordenado = stats_conteo.orderBy(F.col('count').desc())

print("Estadísticas por Zipcode (ordenado por número de casas):")
stats_ordenado.show()


Estadísticas por Zipcode (ordenado por número de casas):
+-------+--------------+-----------+-----+
|zipcode|PrecioPromedio|Promedio_m2|count|
+-------+--------------+-----------+-----+
|98103.0|     584919.21|     153.37|  602|
|98038.0|      366867.6|     199.53|  590|
|98115.0|     619900.55|      170.5|  583|
|98052.0|     645231.46|     219.59|  574|
|98117.0|     576795.01|      157.2|  553|
|98042.0|     311632.11|     184.16|  548|
|98034.0|     521652.86|     183.82|  545|
|98118.0|     417637.43|      156.7|  508|
|98023.0|     286732.79|     184.85|  499|
|98006.0|     859684.78|     268.33|  498|
|98133.0|      386997.4|     149.99|  494|
|98059.0|     493552.53|     223.31|  468|
|98058.0|     353608.64|     190.42|  455|
|98155.0|      423725.7|     165.98|  446|
|98074.0|     685605.78|     245.81|  441|
|98033.0|     803719.52|     221.23|  432|
|98027.0|     616990.59|     233.61|  412|
|98125.0|     469455.77|     162.16|  410|
|98056.0|     420890.55|     187.39|  40

In [None]:
# Average price, rounded to 2 decimals, for every
# (zipcode, bedrooms, bathrooms) combination.
df_agrupado = (
    df.groupBy(['zipcode', 'bedrooms', 'bathrooms'])
      .agg(F.round(F.avg(F.col('price')), 2).alias('PrecioPromedio'))
)

df_agrupado.show()

+-------+--------+---------+--------------+
|zipcode|bedrooms|bathrooms|PrecioPromedio|
+-------+--------+---------+--------------+
|98119.0|     3.0|      1.0|     681881.25|
|98040.0|     3.0|      2.5|      889000.0|
|98030.0|     4.0|      2.5|     347197.22|
|98042.0|     4.0|     2.25|     371188.46|
|98122.0|     4.0|      3.0|      664125.0|
|98052.0|     3.0|      2.0|     517635.36|
|98058.0|     4.0|     3.25|      583000.0|
|98065.0|     2.0|      2.5|      786000.0|
|98178.0|     1.0|     0.75|      231000.0|
|98040.0|     5.0|     2.75|     1225587.0|
|98119.0|     3.0|      2.5|     701354.73|
|98112.0|     5.0|      1.5|      766000.0|
|98058.0|     5.0|      3.0|      409890.0|
|98033.0|     5.0|     4.25|     1400000.0|
|98199.0|     4.0|     3.25|     1496825.0|
|98038.0|     2.0|      2.5|     296816.67|
|98040.0|     5.0|     1.75|      840000.0|
|98117.0|     5.0|     1.75|      823000.0|
|98075.0|     6.0|      3.0|      624500.0|
|98103.0|     3.0|     3.75|    