In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Preprocessing with Discretization')
    .getOrCreate()
)

# Restrict Spark's logging to warnings and errors so cell output stays readable.
spark.sparkContext.setLogLevel('WARN')
print(f"Spark: {spark.version}")

Spark: 3.5.1


Load Dataset

In [None]:
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Declared column names and types for orange.csv.
schema = StructType([
    StructField("Size", DoubleType(), True),
    StructField("Weight", IntegerType(), True),
    StructField("Brix", DoubleType(), True),
    StructField("pH", DoubleType(), True),
    StructField("Softness", DoubleType(), True),
    StructField("HarvestTime", IntegerType(), True),
    StructField("Ripeness", DoubleType(), True),
    StructField("Color", StringType(), True),
    StructField("Variety", StringType(), True),
    StructField("Blemishes", StringType(), True),
    StructField("Quality", DoubleType(), True),
])

# Register the remote CSV with Spark, then read it from the path SparkFiles reports.
spark.sparkContext.addFile("https://raw.githubusercontent.com/Royland97/dataset/main/orange.csv")

# The explicit schema above already fixes every column type, so inferSchema is
# not requested: asking for both is contradictory and the explicit schema wins.
df = spark.read.schema(schema).csv(SparkFiles.get("orange.csv"), header=True)
df.printSchema()
df.show()

root
 |-- Size: double (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Brix: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- Softness: double (nullable = true)
 |-- HarvestTime: integer (nullable = true)
 |-- Ripeness: double (nullable = true)
 |-- Color: string (nullable = true)
 |-- Variety: string (nullable = true)
 |-- Blemishes: string (nullable = true)
 |-- Quality: double (nullable = true)

+----+------+----+---+--------+-----------+--------+-------------+----------------+-------------+-------+
|Size|Weight|Brix| pH|Softness|HarvestTime|Ripeness|        Color|         Variety|    Blemishes|Quality|
+----+------+----+---+--------+-----------+--------+-------------+----------------+-------------+-------+
| 7.5|   180|12.0|3.2|     2.0|         10|     4.0|       Orange|        Valencia|            N|    4.0|
| 8.2|   220|10.5|3.4|     3.0|         14|     4.5|  Deep Orange|           Navel|            N|    4.5|
| 6.8|   150|14.0|3.0|     1.0|        

Equal-width discretization for the Weight attribute

In [68]:
from math import floor

# Drop null weights up front: the column is declared nullable, and None can be
# neither compared nor subtracted in the steps below.
df_weight = df.select("Weight").dropna()
rdd = df_weight.rdd.map(lambda row: row["Weight"])

# Range of the attribute.
min_value = rdd.min()
max_value = rdd.max()

print(min_value)
print(max_value)

k = 5  # number of equal-width bins
amplitud = (max_value - min_value) / k  # width of each bin


def equal_width_bin(x):
    """Return the 0-based index (0 .. k-1) of the equal-width bin containing x."""
    if amplitud == 0:
        # Every value is identical, so there is only one (degenerate) bin.
        return 0
    # Clamp to k - 1 so max_value, and any value pushed onto the upper edge by
    # floating-point rounding, lands in the last bin instead of a bin k.
    return min(k - 1, floor((x - min_value) / amplitud))


# Pair each weight with its bin index.
rdd_discretizado = rdd.map(lambda x: (x, equal_width_bin(x)))
rdd_discretizado.collect()

100
300


[(180, 2),
 (220, 3),
 (150, 1),
 (250, 3),
 (210, 2),
 (126, 0),
 (160, 1),
 (130, 0),
 (240, 3),
 (190, 2),
 (270, 4),
 (183, 2),
 (200, 2),
 (140, 1),
 (260, 4),
 (120, 0),
 (230, 3),
 (218, 2),
 (247, 3),
 (170, 1),
 (300, 4),
 (205, 2),
 (180, 2),
 (280, 4),
 (190, 2),
 (215, 2),
 (140, 1),
 (111, 0),
 (177, 1),
 (240, 3),
 (220, 3),
 (150, 1),
 (185, 2),
 (245, 3),
 (110, 0),
 (225, 3),
 (165, 1),
 (285, 4),
 (200, 2),
 (175, 1),
 (260, 4),
 (125, 0),
 (190, 2),
 (210, 2),
 (135, 0),
 (185, 2),
 (240, 3),
 (215, 2),
 (275, 4),
 (160, 1),
 (230, 3),
 (105, 0),
 (220, 3),
 (290, 4),
 (185, 2),
 (170, 1),
 (255, 3),
 (120, 0),
 (180, 2),
 (205, 2),
 (130, 0),
 (235, 3),
 (200, 2),
 (165, 1),
 (240, 3),
 (140, 1),
 (215, 2),
 (150, 1),
 (280, 4),
 (190, 2),
 (220, 3),
 (100, 0),
 (210, 2),
 (175, 1),
 (270, 4),
 (135, 0),
 (235, 3),
 (190, 2),
 (225, 3),
 (145, 1),
 (250, 3),
 (115, 0),
 (210, 2),
 (185, 2),
 (240, 3),
 (140, 1),
 (200, 2),
 (170, 1),
 (205, 2),
 (160, 1),
 (280, 4),

Equal-frequency discretization for the HarvestTime attribute

In [None]:
# Drop null harvest times up front: the column is declared nullable, and None
# cannot be compared against the quantile cut points below.
df_htime = df.select("HarvestTime").dropna()
rdd = df_htime.rdd.map(lambda row: row["HarvestTime"])

# Sort ascending so the collected result lists the values in order.
rdd_ordenado = rdd.sortBy(lambda x: x)

# Quartile cut points computed on the DataFrame (relative error 0.01).
quantiles = df_htime.approxQuantile("HarvestTime", [0.25, 0.5, 0.75], 0.01)
q1, q2, q3 = quantiles


def quartile_bin(x):
    """Return the bin index (0-3) for x; each cut point belongs to the lower bin."""
    if x <= q1:
        return 0
    if x <= q2:
        return 1
    if x <= q3:
        return 2
    return 3


# Pair each harvest time with its bin index.
rdd_discretizado = rdd_ordenado.map(lambda x: (x, quartile_bin(x)))

rdd_discretizado.collect()

[(4, 0),
 (4, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (6, 0),
 (6, 0),
 (6, 0),
 (6, 0),
 (6, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (8, 0),
 (8, 0),
 (8, 0),
 (8, 0),
 (8, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (14, 1),
 (14, 1),
 (14, 1),
 (14, 1),
 (14, 1),
 (14, 1),
