In [1]:
from pyspark.sql import SparkSession

#Crear una sesi√≥n de Spark
spark = (SparkSession.builder
    .appName('KNN Escalable')
    .master("local[*]")
    .getOrCreate())

spark.sparkContext.setLogLevel('WARN')
print("Spark: "+spark.version)

Spark: 3.5.1


Load Dataset

In [2]:
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

#Crear el esquema
schema = StructType([
    StructField("Size",DoubleType(),True),
    StructField("Weight",IntegerType(),True),
    StructField("Brix",DoubleType(),True),
    StructField("pH",DoubleType(),True),
    StructField("Softness",DoubleType(),True),
    StructField("HarvestTime",IntegerType(),True),
    StructField("Ripeness",DoubleType(),True),
    StructField("Color",StringType(),True),
    StructField("Variety",StringType(),True),
    StructField("Blemishes",StringType(),True),
    StructField("Quality",DoubleType(),True),
])

spark.sparkContext.addFile("https://raw.githubusercontent.com/Royland97/dataset/main/orange.csv")
df = spark.read.schema(schema).csv(SparkFiles.get("orange.csv"), header=True, inferSchema=True)
df.printSchema()
df.show()

root
 |-- Size: double (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Brix: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- Softness: double (nullable = true)
 |-- HarvestTime: integer (nullable = true)
 |-- Ripeness: double (nullable = true)
 |-- Color: string (nullable = true)
 |-- Variety: string (nullable = true)
 |-- Blemishes: string (nullable = true)
 |-- Quality: double (nullable = true)

+----+------+----+---+--------+-----------+--------+-------------+----------------+-------------+-------+
|Size|Weight|Brix| pH|Softness|HarvestTime|Ripeness|        Color|         Variety|    Blemishes|Quality|
+----+------+----+---+--------+-----------+--------+-------------+----------------+-------------+-------+
| 7.5|   180|12.0|3.2|     2.0|         10|     4.0|       Orange|        Valencia|            N|    4.0|
| 8.2|   220|10.5|3.4|     3.0|         14|     4.5|  Deep Orange|           Navel|            N|    4.5|
| 6.8|   150|14.0|3.0|     1.0|        

Data

In [3]:
df = df.select("Weight","Variety")
rdd = df.rdd.map(lambda row: (row["Weight"],row["Variety"]))
rdd.collect()

[(180, 'Valencia'),
 (220, 'Navel'),
 (150, 'Cara Cara'),
 (250, 'Blood Orange'),
 (210, 'Hamlin'),
 (126, 'Navel'),
 (160, 'Tangelo (Hybrid)'),
 (130, 'Murcott (Hybrid)'),
 (240, 'Moro (Blood)'),
 (190, 'Jaffa'),
 (270, 'Cara Cara'),
 (183, 'Valencia'),
 (200, 'Clementine'),
 (140, 'Washington Navel'),
 (260, 'Star Ruby'),
 (120, 'Tangerine'),
 (230, 'Ambiance'),
 (218, 'Cara Cara'),
 (247, 'Clementine'),
 (170, 'Jaffa'),
 (300, 'Blood Orange'),
 (205, 'Murcott (Hybrid)'),
 (180, 'California Valencia'),
 (280, 'Moro (Blood)'),
 (190, 'Honey Tangerine'),
 (215, 'Navel (Late Season)'),
 (140, 'Clementine (Seedless)'),
 (111, 'Cara Cara'),
 (177, 'Temple'),
 (240, 'Cara Cara'),
 (220, 'Hamlin'),
 (150, 'Minneola (Hybrid)'),
 (185, 'Temple'),
 (245, 'Moro (Blood)'),
 (110, 'Satsuma Mandarin'),
 (225, 'Midsweet (Hybrid)'),
 (165, 'California Valencia'),
 (285, 'Cara Cara'),
 (200, 'Navel (Early Season)'),
 (175, 'Ambiance'),
 (260, 'Star Ruby'),
 (125, 'Tangerine'),
 (190, 'Ortanique (Hybr

KNN Escalable

In [14]:
import numpy as np

def distancia(x1, x2):
    return float(np.sqrt(np.sum((x1 - x2) ** 2)))

xample = 230
k = 10

rdd_distancia = rdd.map(lambda x: (distancia(x[0], xample), x[1]))

k_vecinos = rdd_distancia.sortByKey(lambda x: x[0]).take(k)
rdd_vecinos = spark.sparkContext.parallelize(k_vecinos)

label_count = (rdd_vecinos
           .map(lambda x: (x[1], 1))
           .reduceByKey(lambda a, b: a + b))

variety = label_count.sortBy(lambda x: x[1]).first()[0]
print("Variety: " + variety)

Variety: Honey Tangerine
