## Steps to follow:
- Import some of the required libraries
- Load dataset
- Exploratory data analysis
- Data preprocessing
- Assemble feature columns to single vector
- Split the data
- Scale features
- Train the regression model
- Evaluate the model
- (Hyperparameter tuning)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, format_number
# from pyspark.ml.feature import Imputer 
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, StandardScaler
from pyspark.ml.regression import LinearRegression 

In [2]:
spark = SparkSession.builder.appName("Life Expectancy Model").config("spark.executor.memory", "4g").config("spark.driver.memory", "4g").getOrCreate()

In [3]:
spark

In [4]:
#checking if spark session is active
print(spark._sc._jsc.sc().isStopped())

False


In [5]:
df = spark.read.csv("C:/Users/VICTUS 15/OneDrive/Desktop/Life Expectancy Model Spark/Life Expectancy Data.csv", header = True, inferSchema = True)
df.show()

+-----------+----+----------+----------------+---------------+-------------+-------+----------------------+-----------+--------+-----+------------------+-----+-----------------+-----------+---------+-----------+-----------+---------------------+-------------------+-------------------------------+---------+
|    Country|Year|    Status|Life expectancy |Adult Mortality|infant deaths|Alcohol|percentage expenditure|Hepatitis B|Measles | BMI |under-five deaths |Polio|Total expenditure|Diphtheria | HIV/AIDS|        GDP| Population| thinness  1-19 years| thinness 5-9 years|Income composition of resources|Schooling|
+-----------+----+----------+----------------+---------------+-------------+-------+----------------------+-----------+--------+-----+------------------+-----+-----------------+-----------+---------+-----------+-----------+---------------------+-------------------+-------------------------------+---------+
|Afghanistan|2015|Developing|            65.0|            263|           62|

In [6]:
len(df.columns)

22

In [7]:
df.count()

2938

In [8]:
df.columns

['Country',
 'Year',
 'Status',
 'Life expectancy ',
 'Adult Mortality',
 'infant deaths',
 'Alcohol',
 'percentage expenditure',
 'Hepatitis B',
 'Measles ',
 ' BMI ',
 'under-five deaths ',
 'Polio',
 'Total expenditure',
 'Diphtheria ',
 ' HIV/AIDS',
 'GDP',
 'Population',
 ' thinness  1-19 years',
 ' thinness 5-9 years',
 'Income composition of resources',
 'Schooling']

In [9]:
df.select("Status").show()

+----------+
|    Status|
+----------+
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
|Developing|
+----------+
only showing top 20 rows



In [10]:
df.describe()

DataFrame[summary: string, Country: string, Year: string, Status: string, Life expectancy : string, Adult Mortality: string, infant deaths: string, Alcohol: string, percentage expenditure: string, Hepatitis B: string, Measles : string,  BMI : string, under-five deaths : string, Polio: string, Total expenditure: string, Diphtheria : string,  HIV/AIDS: string, GDP: string, Population: string,  thinness  1-19 years: string,  thinness 5-9 years: string, Income composition of resources: string, Schooling: string]

In [11]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life expectancy : double (nullable = true)
 |-- Adult Mortality: integer (nullable = true)
 |-- infant deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage expenditure: double (nullable = true)
 |-- Hepatitis B: integer (nullable = true)
 |-- Measles : integer (nullable = true)
 |--  BMI : double (nullable = true)
 |-- under-five deaths : integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total expenditure: double (nullable = true)
 |-- Diphtheria : integer (nullable = true)
 |--  HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |--  thinness  1-19 years: double (nullable = true)
 |--  thinness 5-9 years: double (nullable = true)
 |-- Income composition of resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [12]:
df.groupBy("Status").count().show()

+----------+-----+
|    Status|count|
+----------+-----+
| Developed|  512|
|Developing| 2426|
+----------+-----+



In [13]:
df.groupBy(' HIV/AIDS').count().show()

+---------+-----+
| HIV/AIDS|count|
+---------+-----+
|     13.4|    4|
|     14.9|    1|
|      2.4|   12|
|      8.0|    3|
|     15.7|    3|
|     25.1|    1|
|     24.7|    2|
|     49.9|    1|
|     26.4|    1|
|     46.4|    1|
|      5.4|    4|
|      7.0|    3|
|     50.3|    1|
|     15.9|    2|
|     11.5|    1|
|      3.5|    7|
|      6.1|    1|
|      9.5|    1|
|     11.6|    2|
|      7.7|    4|
+---------+-----+
only showing top 20 rows



In [14]:
df.groupBy('Country').count().show()

+--------------------+-----+
|             Country|count|
+--------------------+-----+
|       Côte d'Ivoire|   16|
|                Chad|   16|
|Micronesia (Feder...|   16|
|            Paraguay|   16|
|               Yemen|   16|
|             Senegal|   16|
|          Cabo Verde|   16|
|              Sweden|   16|
|            Kiribati|   16|
|   Republic of Korea|   16|
|              Guyana|   16|
|             Eritrea|   16|
|         Philippines|   16|
|            Djibouti|   16|
|               Tonga|   16|
|            Malaysia|   16|
|           Singapore|   16|
|                Fiji|   16|
|              Turkey|   16|
|              Malawi|   16|
+--------------------+-----+
only showing top 20 rows



In [15]:
# Strip leading and trailing spaces in column names
df = df.select([col(c).alias(c.strip()) for c in df.columns])
df.groupBy('BMI').count().show()

+----+-----+
| BMI|count|
+----+-----+
|13.4|    3|
|49.8|    2|
|56.8|    7|
|15.5|    6|
|14.9|    7|
|26.7|    7|
|64.2|    4|
|15.4|    8|
|47.5|    4|
| 2.4|    5|
|53.3|    8|
|37.1|    1|
|15.7|    6|
|55.8|   16|
|45.3|    7|
|67.0|    4|
|25.1|    2|
|56.5|   13|
|49.9|   11|
|18.3|    7|
+----+-----+
only showing top 20 rows



In [16]:
#Checking the spacing between the column names
df.columns

['Country',
 'Year',
 'Status',
 'Life expectancy',
 'Adult Mortality',
 'infant deaths',
 'Alcohol',
 'percentage expenditure',
 'Hepatitis B',
 'Measles',
 'BMI',
 'under-five deaths',
 'Polio',
 'Total expenditure',
 'Diphtheria',
 'HIV/AIDS',
 'GDP',
 'Population',
 'thinness  1-19 years',
 'thinness 5-9 years',
 'Income composition of resources',
 'Schooling']

In [17]:
df.agg({'Population':'sum'}).show()

+--------------------+
|     sum(Population)|
+--------------------+
|2.915421552444001E10|
+--------------------+



In [18]:
#########

### Data Preprocessing

In [19]:
df2 = df.na.drop()

In [20]:
df2.groupby("Year").count().show()

+----+-----+
|Year|count|
+----+-----+
|2003|   95|
|2007|  120|
|2015|    2|
|2006|  114|
|2013|  130|
|2014|  131|
|2004|  103|
|2012|  129|
|2009|  126|
|2001|   66|
|2005|  110|
|2000|   61|
|2010|  128|
|2011|  130|
|2008|  123|
|2002|   81|
+----+-----+



In [21]:
df2.count() #from 2938 to 1649

1649

In [22]:
len(df2.columns)

22

In [23]:
columns_to_index = ['Country', 'Status']
indexers = [StringIndexer(inputCol = col, outputCol = f'{col}_indexed') for col in columns_to_index]

pipeline = Pipeline(stages=indexers)
indexed_data = pipeline.fit(df2).transform(df2)


In [24]:
df2.columns

['Country',
 'Year',
 'Status',
 'Life expectancy',
 'Adult Mortality',
 'infant deaths',
 'Alcohol',
 'percentage expenditure',
 'Hepatitis B',
 'Measles',
 'BMI',
 'under-five deaths',
 'Polio',
 'Total expenditure',
 'Diphtheria',
 'HIV/AIDS',
 'GDP',
 'Population',
 'thinness  1-19 years',
 'thinness 5-9 years',
 'Income composition of resources',
 'Schooling']

In [25]:
indexed_data.select('Country_indexed', 'Status_indexed').show()

+---------------+--------------+
|Country_indexed|Status_indexed|
+---------------+--------------+
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            0.0|           0.0|
|            1.0|           0.0|
|            1.0|           0.0|
|            1.0|           0.0|
|            1.0|           0.0|
+---------------+--------------+
only showing top 20 rows



In [26]:
indexed_data.columns

['Country',
 'Year',
 'Status',
 'Life expectancy',
 'Adult Mortality',
 'infant deaths',
 'Alcohol',
 'percentage expenditure',
 'Hepatitis B',
 'Measles',
 'BMI',
 'under-five deaths',
 'Polio',
 'Total expenditure',
 'Diphtheria',
 'HIV/AIDS',
 'GDP',
 'Population',
 'thinness  1-19 years',
 'thinness 5-9 years',
 'Income composition of resources',
 'Schooling',
 'Country_indexed',
 'Status_indexed']

In [27]:
one_hot_encoder = OneHotEncoder(inputCol='Status_indexed', outputCol='Status_encoded')

encoded_data = one_hot_encoder.fit(indexed_data).transform(indexed_data)
encoded_data.select('Status', 'Status_encoded').show()

+----------+--------------+
|    Status|Status_encoded|
+----------+--------------+
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
|Developing| (1,[0],[1.0])|
+----------+--------------+
only showing top 20 rows



In [28]:
df3 = encoded_data
df3.columns

['Country',
 'Year',
 'Status',
 'Life expectancy',
 'Adult Mortality',
 'infant deaths',
 'Alcohol',
 'percentage expenditure',
 'Hepatitis B',
 'Measles',
 'BMI',
 'under-five deaths',
 'Polio',
 'Total expenditure',
 'Diphtheria',
 'HIV/AIDS',
 'GDP',
 'Population',
 'thinness  1-19 years',
 'thinness 5-9 years',
 'Income composition of resources',
 'Schooling',
 'Country_indexed',
 'Status_indexed',
 'Status_encoded']

In [29]:
assembler = VectorAssembler(inputCols=['Country_indexed','Year','Status_encoded','Adult Mortality','infant deaths','Alcohol','percentage expenditure',
 'Hepatitis B','Measles','BMI','under-five deaths','Polio','Total expenditure','Diphtheria','HIV/AIDS','GDP','Population','thinness  1-19 years','thinness 5-9 years','Income composition of resources',
 'Schooling'], outputCol='features')
output = assembler.transform(df3)

In [30]:
output.select('features','Life expectancy').show()

+--------------------+---------------+
|            features|Life expectancy|
+--------------------+---------------+
|[0.0,2015.0,1.0,2...|           65.0|
|[0.0,2014.0,1.0,2...|           59.9|
|[0.0,2013.0,1.0,2...|           59.9|
|[0.0,2012.0,1.0,2...|           59.5|
|[0.0,2011.0,1.0,2...|           59.2|
|[0.0,2010.0,1.0,2...|           58.8|
|[0.0,2009.0,1.0,2...|           58.6|
|[0.0,2008.0,1.0,2...|           58.1|
|[0.0,2007.0,1.0,2...|           57.5|
|[0.0,2006.0,1.0,2...|           57.3|
|[0.0,2005.0,1.0,2...|           57.3|
|[0.0,2004.0,1.0,2...|           57.0|
|[0.0,2003.0,1.0,2...|           56.7|
|[0.0,2002.0,1.0,3...|           56.2|
|[0.0,2001.0,1.0,3...|           55.3|
|[0.0,2000.0,1.0,3...|           54.8|
|[1.0,2015.0,1.0,7...|           77.8|
|[1.0,2014.0,1.0,8...|           77.5|
|[1.0,2013.0,1.0,8...|           77.2|
|[1.0,2012.0,1.0,8...|           76.9|
+--------------------+---------------+
only showing top 20 rows



In [31]:
df_final = output.select('features','Life expectancy')
df_final.show()

+--------------------+---------------+
|            features|Life expectancy|
+--------------------+---------------+
|[0.0,2015.0,1.0,2...|           65.0|
|[0.0,2014.0,1.0,2...|           59.9|
|[0.0,2013.0,1.0,2...|           59.9|
|[0.0,2012.0,1.0,2...|           59.5|
|[0.0,2011.0,1.0,2...|           59.2|
|[0.0,2010.0,1.0,2...|           58.8|
|[0.0,2009.0,1.0,2...|           58.6|
|[0.0,2008.0,1.0,2...|           58.1|
|[0.0,2007.0,1.0,2...|           57.5|
|[0.0,2006.0,1.0,2...|           57.3|
|[0.0,2005.0,1.0,2...|           57.3|
|[0.0,2004.0,1.0,2...|           57.0|
|[0.0,2003.0,1.0,2...|           56.7|
|[0.0,2002.0,1.0,3...|           56.2|
|[0.0,2001.0,1.0,3...|           55.3|
|[0.0,2000.0,1.0,3...|           54.8|
|[1.0,2015.0,1.0,7...|           77.8|
|[1.0,2014.0,1.0,8...|           77.5|
|[1.0,2013.0,1.0,8...|           77.2|
|[1.0,2012.0,1.0,8...|           76.9|
+--------------------+---------------+
only showing top 20 rows



In [32]:
train_data,test_data = df_final.randomSplit([0.75,0.25], seed=42)

### Performing a `StandardScaler` transformation after splitting
**NOTE:** should only be applied on the training data during the fitting process to avoid data leakage. Once the scaler is fitted to the training data, you can use it to transform both the training and test datasets.

- applying `StandardScaler` ensures that the model performs optimally by standardizing feature scales, improving convergence speed, and maintaining fairness among features in distance-based calculations or regularization techniques.

### Why fit only on training data?
- Fitting on the entire dataset (including test data) would result in data leakage, where information from the test set influences the model during training. This leads to overestimation of model performance.
- 
By fitting only on the training data, you ensure that your model generalizes well to unseen data (test set).

In [33]:
#initialise the StandarScaler
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)

#Fit scaler on the trainign data
scaler_model = scaler.fit(train_data)

#Transform both training and the test datasets
train_data_scaled = scaler_model.transform(train_data)
test_data_scaled = scaler_model.transform(test_data)

#Display scaled features
train_data_scaled.select('features','scaled_features').show()

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[0.0,2000.0,1.0,3...|[-1.5974581512860...|
|[0.0,2001.0,1.0,3...|[-1.5974581512860...|
|[0.0,2003.0,1.0,2...|[-1.5974581512860...|
|[0.0,2004.0,1.0,2...|[-1.5974581512860...|
|[0.0,2005.0,1.0,2...|[-1.5974581512860...|
|[0.0,2007.0,1.0,2...|[-1.5974581512860...|
|[0.0,2010.0,1.0,2...|[-1.5974581512860...|
|[0.0,2011.0,1.0,2...|[-1.5974581512860...|
|[0.0,2012.0,1.0,2...|[-1.5974581512860...|
|[0.0,2014.0,1.0,2...|[-1.5974581512860...|
|[0.0,2015.0,1.0,2...|[-1.5974581512860...|
|[1.0,2000.0,1.0,1...|[-1.5691100578394...|
|[1.0,2001.0,1.0,1...|[-1.5691100578394...|
|[1.0,2002.0,1.0,1...|[-1.5691100578394...|
|[1.0,2004.0,1.0,1...|[-1.5691100578394...|
|[1.0,2005.0,1.0,1...|[-1.5691100578394...|
|[1.0,2006.0,1.0,9...|[-1.5691100578394...|
|[1.0,2008.0,1.0,1...|[-1.5691100578394...|
|[1.0,2009.0,1.0,9...|[-1.5691100578394...|
|[1.0,2010.0,1.0,9...|[-1.569110

In [34]:
test_data_scaled.select('features','scaled_features').show()

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[0.0,2002.0,1.0,3...|[-1.5974581512860...|
|[0.0,2006.0,1.0,2...|[-1.5974581512860...|
|[0.0,2008.0,1.0,2...|[-1.5974581512860...|
|[0.0,2009.0,1.0,2...|[-1.5974581512860...|
|[0.0,2013.0,1.0,2...|[-1.5974581512860...|
|[1.0,2003.0,1.0,1...|[-1.5691100578394...|
|[1.0,2007.0,1.0,9...|[-1.5691100578394...|
|[1.0,2013.0,1.0,8...|[-1.5691100578394...|
|[1.0,2014.0,1.0,8...|[-1.5691100578394...|
|[2.0,2000.0,1.0,1...|[-1.5407619643927...|
|[2.0,2002.0,1.0,1...|[-1.5407619643927...|
|[2.0,2003.0,1.0,1...|[-1.5407619643927...|
|[2.0,2010.0,1.0,1...|[-1.5407619643927...|
|[2.0,2013.0,1.0,1...|[-1.5407619643927...|
|[2.0,2014.0,1.0,1...|[-1.5407619643927...|
|[3.0,2000.0,0.0,9...|[-1.5124138709461...|
|[3.0,2002.0,0.0,9...|[-1.5124138709461...|
|[3.0,2004.0,0.0,8...|[-1.5124138709461...|
|[3.0,2008.0,0.0,7...|[-1.5124138709461...|
|[4.0,2000.0,1.0,2...|[-1.484065

In [35]:
train_data_scaled.show()

+--------------------+---------------+--------------------+
|            features|Life expectancy|     scaled_features|
+--------------------+---------------+--------------------+
|[0.0,2000.0,1.0,3...|           54.8|[-1.5974581512860...|
|[0.0,2001.0,1.0,3...|           55.3|[-1.5974581512860...|
|[0.0,2003.0,1.0,2...|           56.7|[-1.5974581512860...|
|[0.0,2004.0,1.0,2...|           57.0|[-1.5974581512860...|
|[0.0,2005.0,1.0,2...|           57.3|[-1.5974581512860...|
|[0.0,2007.0,1.0,2...|           57.5|[-1.5974581512860...|
|[0.0,2010.0,1.0,2...|           58.8|[-1.5974581512860...|
|[0.0,2011.0,1.0,2...|           59.2|[-1.5974581512860...|
|[0.0,2012.0,1.0,2...|           59.5|[-1.5974581512860...|
|[0.0,2014.0,1.0,2...|           59.9|[-1.5974581512860...|
|[0.0,2015.0,1.0,2...|           65.0|[-1.5974581512860...|
|[1.0,2000.0,1.0,1...|           72.6|[-1.5691100578394...|
|[1.0,2001.0,1.0,1...|           73.6|[-1.5691100578394...|
|[1.0,2002.0,1.0,1...|           73.3|[-

### Performing the Linear Regression 

In [36]:
regressor = LinearRegression(featuresCol='scaled_features', labelCol='Life expectancy')

#Fit the model on scaled training data
lr_model = regressor.fit(train_data_scaled)

#Make predictions on scaled test data
predictions = lr_model.transform(test_data_scaled)

In [37]:
lr_model.coefficients

DenseVector([-0.3324, -0.3555, -0.3697, -1.8503, 10.4116, -0.559, 0.6515, -0.2273, -0.1549, 0.7064, -10.4682, 0.2608, 0.3138, 0.2715, -2.8269, 0.1113, -0.1855, 0.1521, -0.3498, 1.9015, 2.3669])

In [38]:
lr_model.intercept

69.63484021823848

In [39]:
lr_model.evaluate(test_data_scaled).predictions.show()

+--------------------+---------------+--------------------+------------------+
|            features|Life expectancy|     scaled_features|        prediction|
+--------------------+---------------+--------------------+------------------+
|[0.0,2002.0,1.0,3...|           56.2|[-1.5974581512860...| 64.84169092964763|
|[0.0,2006.0,1.0,2...|           57.3|[-1.5974581512860...|  61.6339167293237|
|[0.0,2008.0,1.0,2...|           58.1|[-1.5974581512860...|62.750070506916586|
|[0.0,2009.0,1.0,2...|           58.6|[-1.5974581512860...| 63.09453536722481|
|[0.0,2013.0,1.0,2...|           59.9|[-1.5974581512860...| 64.18978232534154|
|[1.0,2003.0,1.0,1...|           72.8|[-1.5691100578394...| 73.09076018489701|
|[1.0,2007.0,1.0,9...|           75.9|[-1.5691100578394...| 73.91826397962433|
|[1.0,2013.0,1.0,8...|           77.2|[-1.5691100578394...| 75.36339836911196|
|[1.0,2014.0,1.0,8...|           77.5|[-1.5691100578394...|  76.5255089329257|
|[2.0,2000.0,1.0,1...|           72.0|[-1.5407619643

- r2 indicates how well the model fits the data. An r2 value closer to 1 means that the model explains most of the variability on the data.

In [40]:
lr_model.evaluate(test_data_scaled).r2

0.835649809452287

In [41]:
#Performance metrics
lr_model.evaluate(test_data_scaled).meanAbsoluteError, lr_model.evaluate(test_data_scaled).meanSquaredError

(2.8107860132835745, 13.015398739543311)

### Evaluate the model(different approach)

In [42]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol='Life expectancy', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 3.6076860644384388


### Hyperparameter tuning with cross-validation

In [43]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [44]:
#using disk-based storage level due to memory overload, this avoids too much memory usage 
from pyspark.storagelevel import StorageLevel

train_data_scaled.persist(StorageLevel.DISK_ONLY)

DataFrame[features: vector, Life expectancy: double, scaled_features: vector]

In [45]:
# Define hyperparameter grid for linear regression
param_grid = ParamGridBuilder() \
    .addGrid(regressor.elasticNetParam, [0.0, 0.5, 1.0]) \
    .addGrid(regressor.maxIter, [10, 50, 100]) \
    .build()

# Initialize evaluator
evaluator = RegressionEvaluator(labelCol="Life expectancy", predictionCol="prediction", metricName="rmse")

# Create cross-validator
cv = CrossValidator(estimator=regressor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Fit cross-validator
cv_model = cv.fit(train_data_scaled)

# Get best model and parameters
best_model = cv_model.bestModel
best_params = best_model.extractParamMap()

In [46]:
# print(spark.conf.get("spark.executor.memory"))
# print(spark.conf.get("spark.driver.memory"))

In [47]:
print(best_params)

{Param(parent='LinearRegression_448664a9b971', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LinearRegression_448664a9b971', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LinearRegression_448664a9b971', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35, Param(parent='LinearRegression_448664a9b971', name='featuresCol', doc='features column name.'): 'scaled_features', Param(parent='LinearRegression_448664a9b971', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LinearRegression_448664a9b971', name='labelCol', doc='label column name.'): 'Life expectancy', Param(parent='LinearRegression_448664a9b971', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'): 's

In [48]:
#making it more readable
for param, value in best_params.items():
    print(f"{param.name}: {value}")

aggregationDepth: 2
elasticNetParam: 0.0
epsilon: 1.35
featuresCol: scaled_features
fitIntercept: True
labelCol: Life expectancy
loss: squaredError
maxBlockSizeInMB: 0.0
maxIter: 10
predictionCol: prediction
regParam: 0.0
solver: auto
standardization: True
tol: 1e-06


In [49]:
# Evaluate best model on test data
best_predictions = best_model.transform(test_data_scaled)

#evaluate RMSE
best_rmse = evaluator.evaluate(best_predictions)

#print out the RMSE of the best model 
print(f"Best RMSE: {best_rmse}")

Best RMSE: 3.6076860644384388


- No change in the RMSE as per the last evaluation