# Bringing data to explotation zone for the training of a Linear Regression

## Data Loading

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import UserDefinedFunction, col, mean, log1p, UserDefinedFunction, explode, rand, when, lit
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql.types import FloatType
import os

"""
Spark session variables declaration and spark initialization
"""

path = os.getcwd()

conf = SparkConf() \
    .setAppName("PostgreSQL Writing to Formatted Table") \
    .set("spark.jars", path+"/../.."+"/driver/postgresql-42.7.3.jar")

spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()


"""
Connection details for table from formatted zone read
"""

jdbc_url = "jdbc:postgresql://localhost:5432/bda_project1_db"
driver_class = "org.postgresql.Driver"
user = "postgres"
password = "hola123"
connectionProperties = {"user": "postgres", "password": "hola123"}


df_caract = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/bda_project1_db") \
    .option("dbtable", "dog_caract_trusted") \
    .option("user", connectionProperties["user"]) \
    .option("password", connectionProperties["password"]) \
    .option("driver", "org.postgresql.Driver") \
    .load()

df_caract = df_caract.withColumnRenamed("Breed", "breed")

df_caract.show()

24/04/25 21:18:50 WARN Utils: Your hostname, marcel-compu resolves to a loopback address: 127.0.1.1; using 192.168.1.47 instead (on interface wlp3s0)
24/04/25 21:18:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/25 21:18:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


+-------+-----------+--------+------+------------------+--------------------+-------------------+--------+--------------------+-----------+--------------+--------+------------+-----------------+---------------+-------------------+------------------------------+--------------------------------+
|barking|coat_length|drooling|energy|good_with_children|good_with_other_dogs|good_with_strangers|grooming|               breed|playfulness|protectiveness|shedding|trainability|avg_height_female|avg_height_male|avg_life_expectancy|avg_weight_male_log_normalized|avg_weight_female_log_normalized|
+-------+-----------+--------+------+------------------+--------------------+-------------------+--------+--------------------+-----------+--------------+--------+------------+-----------------+---------------+-------------------+------------------------------+--------------------------------+
|      1|          1|       0|     0|                 3|                   3|                  0|       0|    Ameri

In [2]:
df_intel = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/bda_project1_db") \
    .option("dbtable", "dog_intelligence_trusted") \
    .option("user", connectionProperties["user"]) \
    .option("password", connectionProperties["password"]) \
    .option("driver", "org.postgresql.Driver") \
    .load()

df_intel.show()

+--------------------+--------------------+----+--------+
|               breed|      classification|obey|avg_reps|
+--------------------+--------------------+----+--------+
|       Border Collie|      Brightest Dogs|  95|     2.5|
|              Poodle|      Brightest Dogs|  95|     2.5|
|     German Shepherd|      Brightest Dogs|  95|     2.5|
|    Golden Retriever|      Brightest Dogs|  95|     2.5|
|   Doberman Pinscher|      Brightest Dogs|  95|     2.5|
|   Shetland Sheepdog|      Brightest Dogs|  95|     2.5|
|  Labrador Retriever|      Brightest Dogs|  95|     2.5|
|            Papillon|      Brightest Dogs|  95|     2.5|
|          Rottweiler|      Brightest Dogs|  95|     2.5|
|Australian Cattle...|      Brightest Dogs|  95|     2.5|
|Pembroke Welsh Corgi|Excellent Working...|  85|    10.0|
| Miniature Schnauzer|Excellent Working...|  85|    10.0|
|English Springer ...|Excellent Working...|  85|    10.0|
|Belgian Shepherd ...|Excellent Working...|  85|    10.0|
|          Sch

## Table Join

In [3]:
df = df_caract.join(df_intel, 
               df_caract.breed == df_intel.breed, 
               "inner").drop(df_intel.breed) # Drop allows to remove duplicates
df = df.distinct()
print(df.count())
df.show()

112
+-------+-----------+--------+------+------------------+--------------------+-------------------+--------+--------------------+-----------+--------------+--------+------------+-----------------+---------------+-------------------+------------------------------+--------------------------------+--------------------+----+--------+
|barking|coat_length|drooling|energy|good_with_children|good_with_other_dogs|good_with_strangers|grooming|               breed|playfulness|protectiveness|shedding|trainability|avg_height_female|avg_height_male|avg_life_expectancy|avg_weight_male_log_normalized|avg_weight_female_log_normalized|      classification|obey|avg_reps|
+-------+-----------+--------+------+------------------+--------------------+-------------------+--------+--------------------+-----------+--------------+--------+------------+-----------------+---------------+-------------------+------------------------------+--------------------------------+--------------------+----+--------+
|     

## Categorical Variables

In [4]:
#df = df.drop("breed")

In [5]:
"""string_indexer = StringIndexer(inputCol="breed", outputCol="breed_encoded")

pipeline = Pipeline(stages=[string_indexer])

df = pipeline.fit(df).transform(df)

df = df.drop('breed')"""

'string_indexer = StringIndexer(inputCol="breed", outputCol="breed_encoded")\n\npipeline = Pipeline(stages=[string_indexer])\n\ndf = pipeline.fit(df).transform(df)\n\ndf = df.drop(\'breed\')'

In [4]:
print("Classification variable has",len(set(df.rdd.map(lambda r: r.classification).collect())),"modalities")

[Stage 22:>                                                         (0 + 1) / 1]

Classification variable has 5 modalities


                                                                                

In [5]:
pivoted = df.groupBy("breed").pivot("classification").agg(lit(1))
pivoted = pivoted.na.fill(0)
df = df.join(pivoted, 
               df.breed == pivoted.breed, 
               "inner").drop(pivoted.breed) # Drop allows to remove duplicates
df = df.distinct()
print(df.count())
df.show()

24/04/25 21:20:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


112
+-------+-----------+--------+------+------------------+--------------------+-------------------+--------+-----------+--------------+--------+------------+-----------------+---------------+-------------------+------------------------------+--------------------------------+--------------------+----+--------+--------------------+--------------------------+--------------------------------------+--------------+----------------------+-----------------------------------+
|barking|coat_length|drooling|energy|good_with_children|good_with_other_dogs|good_with_strangers|grooming|playfulness|protectiveness|shedding|trainability|avg_height_female|avg_height_male|avg_life_expectancy|avg_weight_male_log_normalized|avg_weight_female_log_normalized|      classification|obey|avg_reps|               breed|Above Average Working Dogs|Average Working/Obedience Intelligence|Brightest Dogs|Excellent Working Dogs|Fair Working/Obedience Intelligence|
+-------+-----------+--------+------+------------------+

## Numerical Variables

In [6]:
df = df.drop('breed')
assemblers = []
scalers = []
for name, dtype in df.dtypes:
    if name == 'avg_reps':
        continue
    if dtype in ["int","bigint","double"]:
        assemblers.append(VectorAssembler(inputCols=[name], outputCol=name + "_vec"))
        scalers.append(MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled"))

pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(df)
scaledData = scalerModel.transform(df)
scaledData.show()

+-------+-----------+--------+------+------------------+--------------------+-------------------+--------+-----------+--------------+--------+------------+-----------------+---------------+-------------------+------------------------------+--------------------------------+--------------------+----+--------+--------------------------+--------------------------------------+--------------+----------------------+-----------------------------------+-----------+---------------+------------+----------+----------------------+------------------------+-----------------------+------------+---------------+------------------+------------+----------------+---------------------+-------------------+-----------------------+----------------------------------+------------------------------------+--------+------------------------------+------------------------------------------+------------------+--------------------------+---------------------------------------+--------------+------------------+---------

                                                                                

## Variable Selection

In [11]:
scaled_vars = [var for var, dtype in scaledData.dtypes if var[-6:] == "scaled"]
df_regression = scaledData.select('avg_reps',*scaled_vars)
df_regression.show()

24/04/25 11:34:43 WARN DAGScheduler: Broadcasting large task binary with size 1189.2 KiB


+--------+--------------+------------------+---------------+-------------+-------------------------+---------------------------+--------------------------+---------------+------------------+---------------------+---------------+--------------------+------------------------+----------------------+--------------------------+-------------------------------------+---------------------------------------+--------------------+---------------------------------+---------------------------------------------+---------------------+-----------------------------+------------------------------------------+
|avg_reps|barking_scaled|coat_length_scaled|drooling_scaled|energy_scaled|good_with_children_scaled|good_with_other_dogs_scaled|good_with_strangers_scaled|grooming_scaled|playfulness_scaled|protectiveness_scaled|shedding_scaled| trainability_scaled|avg_height_female_scaled|avg_height_male_scaled|avg_life_expectancy_scaled|avg_weight_male_log_normalized_scaled|avg_weight_female_log_normalized_scaled

# Linear Regression Training

In [13]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [14]:
df_regression = df_regression.withColumn('isVal', when(rand() > 0.5, 1).otherwise(0))

train = df_regression.where(col("isVal") > 0.5)

test = df_regression.where(col("isVal") < 0.5)

train = train.drop('isVal')

test = test.drop('isVal')

assembler = VectorAssembler(inputCols=scaled_vars, outputCol="features")

linear_regression = LinearRegression(labelCol="avg_reps",predictionCol="predicted_avg_reps", featuresCol="features",regParam=0.1)

pipeline = Pipeline(stages=[assembler, linear_regression])

model = pipeline.fit(train)

model

24/04/25 11:35:03 WARN DAGScheduler: Broadcasting large task binary with size 1357.1 KiB
24/04/25 11:35:05 WARN DAGScheduler: Broadcasting large task binary with size 1346.2 KiB


PipelineModel_100c784e4164

In [15]:
predictions = model.transform(test)

# Evaluate the model using MSE

evaluator = RegressionEvaluator(labelCol="avg_reps", predictionCol="predicted_avg_reps", metricName="mse")
mse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(mse))

24/04/25 11:35:10 WARN DAGScheduler: Broadcasting large task binary with size 1347.0 KiB


Root Mean Squared Error (RMSE) on test data: 0.003


In [None]:
predictions

DataFrame[avg_reps: double, classification_onehot: vector, breed_onehot: vector, barking_scaled: vector, coat_length_scaled: vector, drooling_scaled: vector, energy_scaled: vector, good_with_children_scaled: vector, good_with_other_dogs_scaled: vector, good_with_strangers_scaled: vector, grooming_scaled: vector, playfulness_scaled: vector, protectiveness_scaled: vector, shedding_scaled: vector, trainability_scaled: vector, avg_height_female_scaled: vector, avg_height_male_scaled: vector, avg_life_expectancy_scaled: vector, avg_weight_male_log_normalized_scaled: vector, avg_weight_female_log_normalized_scaled: vector, obey_scaled: vector, avg_reps_scaled: vector, breed_indexed_scaled: vector, classification_indexed_scaled: vector, features: vector, prediction: double]