In [5]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName('Preprocessing Techniques').master("local[*]").getOrCreate()

# Limit Spark's own logging to warnings and above, then report the version in use.
spark.sparkContext.setLogLevel('WARN')
print(f"Spark: {spark.version}")

Spark: 4.0.1


Load Dataset

In [6]:
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the orange CSV: column names and types are fixed here
# rather than being inferred from the data.
schema = StructType([
    StructField("Size", DoubleType(), True),
    StructField("Weight", IntegerType(), True),
    StructField("Brix", DoubleType(), True),
    StructField("pH", DoubleType(), True),
    StructField("Softness", DoubleType(), True),
    StructField("HarvestTime", IntegerType(), True),
    StructField("Ripeness", DoubleType(), True),
    StructField("Color", StringType(), True),
    StructField("Variety", StringType(), True),
    StructField("Blemishes", StringType(), True),
    StructField("Quality", DoubleType(), True),
])

# Register the remote CSV with the SparkContext, then read the copy that
# SparkFiles resolves for it.
spark.sparkContext.addFile("https://raw.githubusercontent.com/Royland97/dataset/main/orange.csv")

# inferSchema is deliberately not passed: an explicit schema is supplied, so
# asking Spark to infer one as well is redundant and misleading.
df = spark.read.schema(schema).csv(SparkFiles.get("orange.csv"), header=True)
df.printSchema()
df.show()

root
 |-- Size: double (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Brix: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- Softness: double (nullable = true)
 |-- HarvestTime: integer (nullable = true)
 |-- Ripeness: double (nullable = true)
 |-- Color: string (nullable = true)
 |-- Variety: string (nullable = true)
 |-- Blemishes: string (nullable = true)
 |-- Quality: double (nullable = true)

+----+------+----+---+--------+-----------+--------+-------------+----------------+-------------+-------+
|Size|Weight|Brix| pH|Softness|HarvestTime|Ripeness|        Color|         Variety|    Blemishes|Quality|
+----+------+----+---+--------+-----------+--------+-------------+----------------+-------------+-------+
| 7.5|   180|12.0|3.2|     2.0|         10|     4.0|       Orange|        Valencia|            N|    4.0|
| 8.2|   220|10.5|3.4|     3.0|         14|     4.5|  Deep Orange|           Navel|            N|    4.5|
| 6.8|   150|14.0|3.0|     1.0|        

Normalización

In [None]:
from math import floor

# Min-max normalization of the Weight column: x' = (x - min) / (max - min).
df_weight = df.select("Weight")
rdd = df_weight.rdd.map(lambda row: row["Weight"])

# Built-in RDD actions replace the hand-written comparison reduces.
vmin = rdd.min()
vmax = rdd.max()

# If every value is identical the range is zero; map all values to 0.0
# instead of raising ZeroDivisionError when the RDD is evaluated.
value_range = vmax - vmin
norm_rdd = rdd.map(lambda x: (x - vmin) / value_range if value_range else 0.0)
norm_rdd.collect()

[0.4,
 0.6,
 0.25,
 0.75,
 0.55,
 0.13,
 0.3,
 0.15,
 0.7,
 0.45,
 0.85,
 0.415,
 0.5,
 0.2,
 0.8,
 0.1,
 0.65,
 0.59,
 0.735,
 0.35,
 1.0,
 0.525,
 0.4,
 0.9,
 0.45,
 0.575,
 0.2,
 0.055,
 0.385,
 0.7,
 0.6,
 0.25,
 0.425,
 0.725,
 0.05,
 0.625,
 0.325,
 0.925,
 0.5,
 0.375,
 0.8,
 0.125,
 0.45,
 0.55,
 0.175,
 0.425,
 0.7,
 0.575,
 0.875,
 0.3,
 0.65,
 0.025,
 0.6,
 0.95,
 0.425,
 0.35,
 0.775,
 0.1,
 0.4,
 0.525,
 0.15,
 0.675,
 0.5,
 0.325,
 0.7,
 0.2,
 0.575,
 0.25,
 0.9,
 0.45,
 0.6,
 0.0,
 0.55,
 0.375,
 0.85,
 0.175,
 0.675,
 0.45,
 0.625,
 0.225,
 0.75,
 0.075,
 0.55,
 0.425,
 0.7,
 0.2,
 0.5,
 0.35,
 0.525,
 0.3,
 0.9,
 0.15,
 0.65,
 0.45,
 0.55,
 0.1,
 0.075,
 0.6,
 0.25,
 0.29,
 0.46,
 0.135,
 0.09,
 0.6,
 0.51,
 0.15,
 0.115,
 0.215,
 0.845,
 0.1,
 0.26,
 0.85,
 0.42,
 0.185,
 0.68,
 0.895,
 0.965,
 0.545,
 0.73,
 0.765,
 0.99,
 0.345,
 0.8,
 0.76,
 0.985,
 0.715,
 0.04,
 0.085,
 0.67,
 0.88,
 0.845,
 0.67,
 0.275,
 0.49,
 0.99,
 0.11,
 0.965,
 0.215,
 0.785,
 0.285,
 0.4,

Estandarización

In [7]:
from math import floor

# Z-score standardization of the Weight column: z = (x - mean) / std.
df_weight = df.select("Weight")
rdd = df_weight.rdd.map(lambda row: row["Weight"])

# Count once and reuse the result; every count() call is a separate Spark job.
n_values = rdd.count()

# Mean, then population variance (divides by n, not n - 1) and its square root.
m = rdd.reduce(lambda x, y: x + y) / n_values
s = rdd.map(lambda x: (x - m) ** 2).reduce(lambda x, y: x + y) / n_values
std = s ** 0.5

# A constant column has zero standard deviation; map its values to 0.0
# instead of raising ZeroDivisionError when the RDD is evaluated.
std_rdd = rdd.map(lambda x: (x - m) / std if std else 0.0)
std_rdd.collect()

[-0.4459878891999886,
 0.26393999255164463,
 -0.9784338005137135,
 0.7963859038653696,
 0.08645802211373632,
 -1.4043905295646935,
 -0.8009518300758053,
 -1.3333977413895302,
 0.6189039334274613,
 -0.2685059187620803,
 1.1513498447411863,
 -0.39274329806861613,
 -0.09102394832417199,
 -1.155915770951622,
 0.9738678743032779,
 -1.5108797118274384,
 0.44142196298955294,
 0.22844359846406298,
 0.7431413127339971,
 -0.6234698596378969,
 1.683795756054911,
 -0.0022829631052178364,
 -0.4459878891999886,
 1.3288318151790945,
 -0.2685059187620803,
 0.17519900733269048,
 -1.155915770951622,
 -1.670613485221556,
 -0.4992324803313611,
 0.6189039334274613,
 0.26393999255164463,
 -0.9784338005137135,
 -0.3572469039810345,
 0.7076449186464154,
 -1.6883616822653469,
 0.35268097777059876,
 -0.712210844856851,
 1.4175728003980486,
 -0.09102394832417199,
 -0.5347288744189428,
 0.9738678743032779,
 -1.4221387266084844,
 -0.2685059187620803,
 0.08645802211373632,
 -1.244656756170576,
 -0.3572469039810345,

Discretización de igual amplitud para el atributo Weight

In [None]:
from math import floor

# Equal-width discretization of the Weight column into k bins numbered 0..k-1.
df_weight = df.select("Weight")
rdd = df_weight.rdd.map(lambda row: row["Weight"])

# Built-in RDD actions replace the hand-written comparison reduces.
min_value = rdd.min()
max_value = rdd.max()

print(min_value)
print(max_value)

k = 5
# Width of each bin; kept as a notebook variable for inspection or later use.
amplitud = (max_value - min_value) / k

# Bin index = floor((x - min) * k / (max - min)), computed with floor division
# so that integer weights are binned exactly, with no dependence on float
# rounding at bin edges. The maximum is assigned to the last bin explicitly,
# which also means the division is never reached when max == min.
rdd_discretizado = rdd.map(
    lambda x: (
        x,
        (k - 1) if x == max_value
        else int((x - min_value) * k // (max_value - min_value)),
    )
)
rdd_discretizado.collect()

100
300


[(180, 2),
 (220, 3),
 (150, 1),
 (250, 3),
 (210, 2),
 (126, 0),
 (160, 1),
 (130, 0),
 (240, 3),
 (190, 2),
 (270, 4),
 (183, 2),
 (200, 2),
 (140, 1),
 (260, 4),
 (120, 0),
 (230, 3),
 (218, 2),
 (247, 3),
 (170, 1),
 (300, 4),
 (205, 2),
 (180, 2),
 (280, 4),
 (190, 2),
 (215, 2),
 (140, 1),
 (111, 0),
 (177, 1),
 (240, 3),
 (220, 3),
 (150, 1),
 (185, 2),
 (245, 3),
 (110, 0),
 (225, 3),
 (165, 1),
 (285, 4),
 (200, 2),
 (175, 1),
 (260, 4),
 (125, 0),
 (190, 2),
 (210, 2),
 (135, 0),
 (185, 2),
 (240, 3),
 (215, 2),
 (275, 4),
 (160, 1),
 (230, 3),
 (105, 0),
 (220, 3),
 (290, 4),
 (185, 2),
 (170, 1),
 (255, 3),
 (120, 0),
 (180, 2),
 (205, 2),
 (130, 0),
 (235, 3),
 (200, 2),
 (165, 1),
 (240, 3),
 (140, 1),
 (215, 2),
 (150, 1),
 (280, 4),
 (190, 2),
 (220, 3),
 (100, 0),
 (210, 2),
 (175, 1),
 (270, 4),
 (135, 0),
 (235, 3),
 (190, 2),
 (225, 3),
 (145, 1),
 (250, 3),
 (115, 0),
 (210, 2),
 (185, 2),
 (240, 3),
 (140, 1),
 (200, 2),
 (170, 1),
 (205, 2),
 (160, 1),
 (280, 4),

Discretización de igual frecuencia para el atributo HarvestTime

In [None]:
from math import floor

# Equal-frequency discretization of the HarvestTime column into four bins.
df_htime = df.select("HarvestTime")
rdd = df_htime.rdd.map(lambda row: row["HarvestTime"])

# Sort the values so the collected result is listed in ascending order.
rdd_ordenado = rdd.sortBy(lambda x: x)

# Cut points at the 25th, 50th and 75th percentiles, computed on the DataFrame
# with approxQuantile (third argument is the allowed relative error).
quantiles = df_htime.approxQuantile("HarvestTime", [0.25, 0.5, 0.75], 0.01)
q1, q2, q3 = quantiles


def _quartile_bin(value):
    """Return the bin index (0-3) of `value` relative to the cut points q1, q2, q3."""
    if value <= q1:
        return 0
    if value <= q2:
        return 1
    if value <= q3:
        return 2
    return 3


rdd_discretizado = rdd_ordenado.map(lambda x: (x, _quartile_bin(x)))

rdd_discretizado.collect()

[(4, 0),
 (4, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (5, 0),
 (6, 0),
 (6, 0),
 (6, 0),
 (6, 0),
 (6, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (7, 0),
 (8, 0),
 (8, 0),
 (8, 0),
 (8, 0),
 (8, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (9, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (10, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (11, 0),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (12, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (13, 1),
 (14, 1),
 (14, 1),
 (14, 1),
 (14, 1),
 (14, 1),
 (14, 1),
