In [0]:
from pyspark.sql import SparkSession

In [0]:
#spark = SparkSession.builder.appName('IMMLLR2').getOrCreate()
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [0]:
# File location and type
file_location = "/FileStore/tables/LifeExpectancyData.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.27962362,65.0,1154,19.1,83,6,8.16,65,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.52358168,62.0,492,18.6,86,58,8.18,62,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.21924272,64.0,430,18.1,89,62,8.13,64,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.1842153,67.0,2787,17.6,93,67,8.52,67,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097108703,68.0,3013,17.2,97,68,7.87,68,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
Afghanistan,2010,Developing,58.8,279.0,74,0.01,79.67936736,66.0,1989,16.7,102,66,9.2,66,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2
Afghanistan,2009,Developing,58.6,281.0,77,0.01,56.76221682,63.0,2861,16.2,106,63,9.42,63,0.1,445.8932979,284331.0,18.6,18.7,0.434,8.9
Afghanistan,2008,Developing,58.1,287.0,80,0.03,25.87392536,64.0,1599,15.7,110,64,8.33,64,0.1,373.3611163,2729431.0,18.8,18.9,0.433,8.7
Afghanistan,2007,Developing,57.5,295.0,82,0.02,10.91015598,63.0,1141,15.2,113,63,6.73,63,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4
Afghanistan,2006,Developing,57.3,295.0,84,0.03,17.17151751,64.0,1990,14.7,116,58,7.43,58,0.1,272.56377,2589345.0,19.2,19.3,0.405,8.1


Data pre-processing

In [0]:
# Import the required libraries

from pyspark.sql.functions import datediff,date_format,to_date,to_timestamp

In [0]:
import pyspark.sql.functions as f

In [0]:
df.printSchema()

#data types are already accurate

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life expectancy : double (nullable = true)
 |-- Adult Mortality: integer (nullable = true)
 |-- infant deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage expenditure: double (nullable = true)
 |-- Hepatitis B: integer (nullable = true)
 |-- Measles : integer (nullable = true)
 |--  BMI : double (nullable = true)
 |-- under-five deaths : integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total expenditure: double (nullable = true)
 |-- Diphtheria : integer (nullable = true)
 |--  HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |--  thinness  1-19 years: double (nullable = true)
 |--  thinness 5-9 years: double (nullable = true)
 |-- Income composition of resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [0]:
from pyspark.sql import functions as F

df_under = df.select([F.col(col).alias(col.replace(' ', '_')) for col in df.columns])

In [0]:
df_under.describe

Out[9]: <bound method DataFrame.describe of DataFrame[Country: string, Year: int, Status: string, Life_expectancy_: double, Adult_Mortality: int, infant_deaths: int, Alcohol: double, percentage_expenditure: double, Hepatitis_B: int, Measles_: int, _BMI_: double, under-five_deaths_: int, Polio: int, Total_expenditure: double, Diphtheria_: int, _HIV/AIDS: double, GDP: double, Population: double, _thinness__1-19_years: double, _thinness_5-9_years: double, Income_composition_of_resources: double, Schooling: double]>

In [0]:
#dropping variables with auto-correlation

from pyspark.sql import functions as F

df_dropped = df_under.drop("_thinness_5-9_years","GDP","infant_deaths")

In [0]:
df_dropped.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life_expectancy_: double (nullable = true)
 |-- Adult_Mortality: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage_expenditure: double (nullable = true)
 |-- Hepatitis_B: integer (nullable = true)
 |-- Measles_: integer (nullable = true)
 |-- _BMI_: double (nullable = true)
 |-- under-five_deaths_: integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total_expenditure: double (nullable = true)
 |-- Diphtheria_: integer (nullable = true)
 |-- _HIV/AIDS: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- _thinness__1-19_years: double (nullable = true)
 |-- Income_composition_of_resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [0]:
df_dropped.count()

Out[12]: 2938

In [0]:
df_nona=df_dropped.dropna()
df_nona.count()

Out[13]: 1657

In [0]:
df_nona.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life_expectancy_: double (nullable = true)
 |-- Adult_Mortality: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage_expenditure: double (nullable = true)
 |-- Hepatitis_B: integer (nullable = true)
 |-- Measles_: integer (nullable = true)
 |-- _BMI_: double (nullable = true)
 |-- under-five_deaths_: integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total_expenditure: double (nullable = true)
 |-- Diphtheria_: integer (nullable = true)
 |-- _HIV/AIDS: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- _thinness__1-19_years: double (nullable = true)
 |-- Income_composition_of_resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [0]:
data = df_nona.withColumnRenamed("_thinness__1-19_years", "thinness_upper") \
         # .withColumnRenamed("_thinness_5-9_years", "thinness_lower") 
        

data.show(truncate=False)

+-----------+----+----------+----------------+---------------+-------+----------------------+-----------+--------+-----+------------------+-----+-----------------+-----------+---------+-----------+--------------+-------------------------------+---------+
|Country    |Year|Status    |Life_expectancy_|Adult_Mortality|Alcohol|percentage_expenditure|Hepatitis_B|Measles_|_BMI_|under-five_deaths_|Polio|Total_expenditure|Diphtheria_|_HIV/AIDS|Population |thinness_upper|Income_composition_of_resources|Schooling|
+-----------+----+----------+----------------+---------------+-------+----------------------+-----------+--------+-----+------------------+-----+-----------------+-----------+---------+-----------+--------------+-------------------------------+---------+
|Afghanistan|2015|Developing|65.0            |263            |0.01   |71.27962362           |65         |1154    |19.1 |83                |6    |8.16             |65         |0.1      |3.3736494E7|17.2          |0.479                  

In [0]:
# Create a 70-30 train test split

train_data,test_data=data.randomSplit([0.7,0.3])

Building the Linear Regression model

In [0]:
# Import the required libraries

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline

In [0]:
# Use StringIndexer to convert the categorical columns to hold numerical data

status_indexer = StringIndexer(inputCol='Status',outputCol='status_index',handleInvalid='keep')
country_indexer = StringIndexer(inputCol='Country',outputCol='country_index',handleInvalid='keep')


In [0]:
data.describe

Out[19]: <bound method DataFrame.describe of DataFrame[Country: string, Year: int, Status: string, Life_expectancy_: double, Adult_Mortality: int, Alcohol: double, percentage_expenditure: double, Hepatitis_B: int, Measles_: int, _BMI_: double, under-five_deaths_: int, Polio: int, Total_expenditure: double, Diphtheria_: int, _HIV/AIDS: double, Population: double, thinness_upper: double, Income_composition_of_resources: double, Schooling: double]>

In [0]:
# Vector assembler is used to create a vector of input features

assembler = VectorAssembler(inputCols=['status_index','country_index','Year','Adult_Mortality','Alcohol',
                                      'percentage_expenditure','Hepatitis_B','Measles_','_BMI_','under-five_deaths_','Polio',
                                      'Total_expenditure','Diphtheria_','_HIV/AIDS','Population','thinness_upper',
                                      'Income_composition_of_resources','Schooling'],
                            outputCol="features")

In [0]:
# Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data
# in the same way as that of the train data

pipe = Pipeline(stages=[status_indexer,country_indexer,assembler])

In [0]:
fitted_pipe=pipe.fit(train_data)

In [0]:
train_data=fitted_pipe.transform(train_data)
display(train_data)

Country,Year,Status,Life_expectancy_,Adult_Mortality,Alcohol,percentage_expenditure,Hepatitis_B,Measles_,_BMI_,under-five_deaths_,Polio,Total_expenditure,Diphtheria_,_HIV/AIDS,Population,thinness_upper,Income_composition_of_resources,Schooling,status_index,country_index,features
Afghanistan,2000,Developing,54.8,321,0.01,10.42496,62,6532,12.2,122,24,8.2,24,0.1,293756.0,2.3,0.338,5.5,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2000.0, 321.0, 0.01, 10.42496, 62.0, 6532.0, 12.2, 122.0, 24.0, 8.2, 24.0, 0.1, 293756.0, 2.3, 0.338, 5.5))"
Afghanistan,2001,Developing,55.3,316,0.01,10.5747282,63,8762,12.6,122,35,7.8,33,0.1,2966463.0,2.1,0.34,5.9,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2001.0, 316.0, 0.01, 10.5747282, 63.0, 8762.0, 12.6, 122.0, 35.0, 7.8, 33.0, 0.1, 2966463.0, 2.1, 0.34, 5.9))"
Afghanistan,2002,Developing,56.2,3,0.01,16.88735091,64,2486,13.0,122,36,7.76,36,0.1,21979923.0,19.9,0.341,6.2,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2002.0, 3.0, 0.01, 16.88735091, 64.0, 2486.0, 13.0, 122.0, 36.0, 7.76, 36.0, 0.1, 2.1979923E7, 19.9, 0.341, 6.2))"
Afghanistan,2003,Developing,56.7,295,0.01,11.08905273,65,798,13.4,122,41,8.82,41,0.1,2364851.0,19.7,0.373,6.5,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2003.0, 295.0, 0.01, 11.08905273, 65.0, 798.0, 13.4, 122.0, 41.0, 8.82, 41.0, 0.1, 2364851.0, 19.7, 0.373, 6.5))"
Afghanistan,2004,Developing,57.0,293,0.02,15.29606643,67,466,13.8,120,5,8.79,5,0.1,24118979.0,19.5,0.381,6.8,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2004.0, 293.0, 0.02, 15.29606643, 67.0, 466.0, 13.8, 120.0, 5.0, 8.79, 5.0, 0.1, 2.4118979E7, 19.5, 0.381, 6.8))"
Afghanistan,2007,Developing,57.5,295,0.02,10.91015598,63,1141,15.2,113,63,6.73,63,0.1,26616792.0,19.0,0.415,8.4,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2007.0, 295.0, 0.02, 10.91015598, 63.0, 1141.0, 15.2, 113.0, 63.0, 6.73, 63.0, 0.1, 2.6616792E7, 19.0, 0.415, 8.4))"
Afghanistan,2008,Developing,58.1,287,0.03,25.87392536,64,1599,15.7,110,64,8.33,64,0.1,2729431.0,18.8,0.433,8.7,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2008.0, 287.0, 0.03, 25.87392536, 64.0, 1599.0, 15.7, 110.0, 64.0, 8.33, 64.0, 0.1, 2729431.0, 18.8, 0.433, 8.7))"
Afghanistan,2009,Developing,58.6,281,0.01,56.76221682,63,2861,16.2,106,63,9.42,63,0.1,284331.0,18.6,0.434,8.9,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2009.0, 281.0, 0.01, 56.76221682, 63.0, 2861.0, 16.2, 106.0, 63.0, 9.42, 63.0, 0.1, 284331.0, 18.6, 0.434, 8.9))"
Afghanistan,2010,Developing,58.8,279,0.01,79.67936736,66,1989,16.7,102,66,9.2,66,0.1,2883167.0,18.4,0.448,9.2,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2010.0, 279.0, 0.01, 79.67936736, 66.0, 1989.0, 16.7, 102.0, 66.0, 9.2, 66.0, 0.1, 2883167.0, 18.4, 0.448, 9.2))"
Afghanistan,2011,Developing,59.2,275,0.01,7.097108703,68,3013,17.2,97,68,7.87,68,0.1,2978599.0,18.2,0.454,9.5,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2011.0, 275.0, 0.01, 7.097108703, 68.0, 3013.0, 17.2, 97.0, 68.0, 7.87, 68.0, 0.1, 2978599.0, 18.2, 0.454, 9.5))"


In [0]:
# Create an object for the Linear Regression model

lr_model = LinearRegression(labelCol='Life_expectancy_')

In [0]:
# Fit the model on the train data

fit_model = lr_model.fit(train_data.select(['features','Life_expectancy_']))

In [0]:
# Transform the test data using the model to predict the duration

test_data=fitted_pipe.transform(test_data)
display(test_data)

Country,Year,Status,Life_expectancy_,Adult_Mortality,Alcohol,percentage_expenditure,Hepatitis_B,Measles_,_BMI_,under-five_deaths_,Polio,Total_expenditure,Diphtheria_,_HIV/AIDS,Population,thinness_upper,Income_composition_of_resources,Schooling,status_index,country_index,features
Afghanistan,2005,Developing,57.3,291,0.02,1.388647732,66,1296,14.2,118,58,8.7,58,0.1,257798.0,19.3,0.396,7.9,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2005.0, 291.0, 0.02, 1.388647732, 66.0, 1296.0, 14.2, 118.0, 58.0, 8.7, 58.0, 0.1, 257798.0, 19.3, 0.396, 7.9))"
Afghanistan,2006,Developing,57.3,295,0.03,17.17151751,64,1990,14.7,116,58,7.43,58,0.1,2589345.0,19.2,0.405,8.1,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2006.0, 295.0, 0.03, 17.17151751, 64.0, 1990.0, 14.7, 116.0, 58.0, 7.43, 58.0, 0.1, 2589345.0, 19.2, 0.405, 8.1))"
Afghanistan,2015,Developing,65.0,263,0.01,71.27962362,65,1154,19.1,83,6,8.16,65,0.1,33736494.0,17.2,0.479,10.1,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2015.0, 263.0, 0.01, 71.27962362, 65.0, 1154.0, 19.1, 83.0, 6.0, 8.16, 65.0, 0.1, 3.3736494E7, 17.2, 0.479, 10.1))"
Albania,2002,Developing,73.3,15,3.73,104.5169157,96,16,46.9,1,98,6.3,98,0.1,3511.0,2.0,0.67,10.7,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2002.0, 15.0, 3.73, 104.5169157, 96.0, 16.0, 46.9, 1.0, 98.0, 6.3, 98.0, 0.1, 3511.0, 2.0, 0.67, 10.7))"
Albania,2003,Developing,72.8,18,4.29,14.71928882,97,8,47.9,1,97,6.27,97,0.1,339616.0,1.9,0.674,10.7,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2003.0, 18.0, 4.29, 14.71928882, 97.0, 8.0, 47.9, 1.0, 97.0, 6.27, 97.0, 0.1, 339616.0, 1.9, 0.674, 10.7))"
Albania,2004,Developing,73.0,17,4.54,221.8428,99,7,48.9,1,98,6.38,97,0.1,326939.0,1.8,0.681,10.9,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2004.0, 17.0, 4.54, 221.8428, 99.0, 7.0, 48.9, 1.0, 98.0, 6.38, 97.0, 0.1, 326939.0, 1.8, 0.681, 10.9))"
Albania,2005,Developing,73.5,15,5.16,26.99312143,98,6,49.9,1,97,6.12,98,0.1,311487.0,1.8,0.685,10.8,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2005.0, 15.0, 5.16, 26.99312143, 98.0, 6.0, 49.9, 1.0, 97.0, 6.12, 98.0, 0.1, 311487.0, 1.8, 0.685, 10.8))"
Albania,2007,Developing,75.9,9,5.58,32.24655228,98,22,51.7,1,99,6.1,98,0.1,29717.0,1.6,0.703,11.6,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2007.0, 9.0, 5.58, 32.24655228, 98.0, 22.0, 51.7, 1.0, 99.0, 6.1, 98.0, 0.1, 29717.0, 1.6, 0.703, 11.6))"
Albania,2014,Developing,77.5,8,4.51,428.7490668,98,0,57.2,1,98,5.88,98,0.1,288914.0,1.2,0.761,14.2,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2014.0, 8.0, 4.51, 428.7490668, 98.0, 0.0, 57.2, 1.0, 98.0, 5.88, 98.0, 0.1, 288914.0, 1.2, 0.761, 14.2))"
Albania,2015,Developing,77.8,74,4.6,364.9752287,99,0,58.0,0,99,6.0,99,0.1,28873.0,1.2,0.762,14.2,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2015.0, 74.0, 4.6, 364.9752287, 99.0, 0.0, 58.0, 0.0, 99.0, 6.0, 99.0, 0.1, 28873.0, 1.2, 0.762, 14.2))"


In [0]:
# Store the results in a dataframe

results = fit_model.transform(test_data)
display(results)

Country,Year,Status,Life_expectancy_,Adult_Mortality,Alcohol,percentage_expenditure,Hepatitis_B,Measles_,_BMI_,under-five_deaths_,Polio,Total_expenditure,Diphtheria_,_HIV/AIDS,Population,thinness_upper,Income_composition_of_resources,Schooling,status_index,country_index,features,prediction
Afghanistan,2005,Developing,57.3,291,0.02,1.388647732,66,1296,14.2,118,58,8.7,58,0.1,257798.0,19.3,0.396,7.9,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2005.0, 291.0, 0.02, 1.388647732, 66.0, 1296.0, 14.2, 118.0, 58.0, 8.7, 58.0, 0.1, 257798.0, 19.3, 0.396, 7.9))",61.26062681159118
Afghanistan,2006,Developing,57.3,295,0.03,17.17151751,64,1990,14.7,116,58,7.43,58,0.1,2589345.0,19.2,0.405,8.1,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2006.0, 295.0, 0.03, 17.17151751, 64.0, 1990.0, 14.7, 116.0, 58.0, 7.43, 58.0, 0.1, 2589345.0, 19.2, 0.405, 8.1))",61.24377702052431
Afghanistan,2015,Developing,65.0,263,0.01,71.27962362,65,1154,19.1,83,6,8.16,65,0.1,33736494.0,17.2,0.479,10.1,0.0,1.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 1.0, 2015.0, 263.0, 0.01, 71.27962362, 65.0, 1154.0, 19.1, 83.0, 6.0, 8.16, 65.0, 0.1, 3.3736494E7, 17.2, 0.479, 10.1))",63.38903966486612
Albania,2002,Developing,73.3,15,3.73,104.5169157,96,16,46.9,1,98,6.3,98,0.1,3511.0,2.0,0.67,10.7,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2002.0, 15.0, 3.73, 104.5169157, 96.0, 16.0, 46.9, 1.0, 98.0, 6.3, 98.0, 0.1, 3511.0, 2.0, 0.67, 10.7))",73.01889811924988
Albania,2003,Developing,72.8,18,4.29,14.71928882,97,8,47.9,1,97,6.27,97,0.1,339616.0,1.9,0.674,10.7,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2003.0, 18.0, 4.29, 14.71928882, 97.0, 8.0, 47.9, 1.0, 97.0, 6.27, 97.0, 0.1, 339616.0, 1.9, 0.674, 10.7))",72.73466431542607
Albania,2004,Developing,73.0,17,4.54,221.8428,99,7,48.9,1,98,6.38,97,0.1,326939.0,1.8,0.681,10.9,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2004.0, 17.0, 4.54, 221.8428, 99.0, 7.0, 48.9, 1.0, 98.0, 6.38, 97.0, 0.1, 326939.0, 1.8, 0.681, 10.9))",72.97133034489383
Albania,2005,Developing,73.5,15,5.16,26.99312143,98,6,49.9,1,97,6.12,98,0.1,311487.0,1.8,0.685,10.8,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2005.0, 15.0, 5.16, 26.99312143, 98.0, 6.0, 49.9, 1.0, 97.0, 6.12, 98.0, 0.1, 311487.0, 1.8, 0.685, 10.8))",72.63350903702616
Albania,2007,Developing,75.9,9,5.58,32.24655228,98,22,51.7,1,99,6.1,98,0.1,29717.0,1.6,0.703,11.6,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2007.0, 9.0, 5.58, 32.24655228, 98.0, 22.0, 51.7, 1.0, 99.0, 6.1, 98.0, 0.1, 29717.0, 1.6, 0.703, 11.6))",73.37810368750013
Albania,2014,Developing,77.5,8,4.51,428.7490668,98,0,57.2,1,98,5.88,98,0.1,288914.0,1.2,0.761,14.2,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2014.0, 8.0, 4.51, 428.7490668, 98.0, 0.0, 57.2, 1.0, 98.0, 5.88, 98.0, 0.1, 288914.0, 1.2, 0.761, 14.2))",75.99977378644334
Albania,2015,Developing,77.8,74,4.6,364.9752287,99,0,58.0,0,99,6.0,99,0.1,28873.0,1.2,0.762,14.2,0.0,56.0,"Map(vectorType -> dense, length -> 18, values -> List(0.0, 56.0, 2015.0, 74.0, 4.6, 364.9752287, 99.0, 0.0, 58.0, 0.0, 99.0, 6.0, 99.0, 0.1, 28873.0, 1.2, 0.762, 14.2))",74.87149423295611


In [0]:
results.select(['Life_expectancy_','prediction']).show()

+----------------+------------------+
|Life_expectancy_|        prediction|
+----------------+------------------+
|            57.3| 61.26062681159118|
|            57.3|61.243777020524305|
|            65.0| 63.38903966486612|
|            73.3| 73.01889811924988|
|            72.8| 72.73466431542607|
|            73.0| 72.97133034489383|
|            73.5| 72.63350903702616|
|            75.9| 73.37810368750013|
|            77.5| 75.99977378644334|
|            77.8| 74.87149423295611|
|            73.8| 70.72173391492038|
|            74.1| 72.37501060770927|
|            75.1| 74.81139784656051|
|            75.3| 74.93344322569294|
|            49.1|56.352257684968265|
|            49.6| 57.50110728805453|
|            74.1| 76.55842944901761|
|            74.1|  76.3864144009244|
|            74.7| 78.43800692989734|
|            75.2| 76.11169118067608|
+----------------+------------------+
only showing top 20 rows



Evaluating the model

In [0]:
test_results = fit_model.evaluate(test_data)

In [0]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -3.960626811591183|
|-3.9437770205243083|
|  1.610960335133882|
| 0.2811018807501142|
|0.06533568457392391|
|0.02866965510617092|
| 0.8664909629738418|
| 2.5218963124998766|
| 1.5002262135566582|
| 2.9285057670438874|
|  3.078266085079619|
| 1.7249893922907233|
|0.28860215343948425|
| 0.3665567743070568|
| -7.252257684968264|
| -7.901107288054526|
| -2.458429449017615|
|   -2.2864144009244|
| -3.738006929897338|
|-0.9116911806760726|
+-------------------+
only showing top 20 rows



In [0]:
test_results.rootMeanSquaredError

Out[31]: 3.656351388863434

Explain the RMSE: The low RMSE shows that the model is a good predictor

In [0]:
test_results.r2

Out[32]: 0.8205629037068413

Explain the R2: The r-squared value implies that the model explains 83% of the variance which means this is a reliable model

Coefficient Analysis

In [0]:
#ordering up the input columns to the coefficient values below, we can see that income_composition_of_resources, status_index, and schooling 
#have the greatest impact on the model; more time should be spent creating a view of the features next to their coefficient values for faster analysis
#'status_index','country_index','Year','Adult_Mortality','Alcohol', 'percentage_expenditure','Hepatitis_B','Measles_','_BMI_','under-five_deaths_','Polio','Total_expenditure','Diphtheria_','_HIV/AIDS','Population','thinness_upper','Income_composition_of_resources','Schooling'

fit_model.coefficients

Out[33]: DenseVector([1.2621, -0.0103, -0.1327, -0.0157, -0.2026, 0.0004, -0.0026, 0.0, 0.0378, -0.0033, 0.0087, 0.1208, 0.0212, -0.4763, 0.0, -0.0212, 10.9561, 0.8939])