In [25]:
#install pySpark
!pip install pySpark




In [2]:
#import libraries and load dataset
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName("test_pyspark").getOrCreate()

In [None]:
df= spark.read.csv("/content/sample_data/california_housing_train.csv",header=True,inferSchema= True)

In [None]:
df

DataFrame[longitude: double, latitude: double, housing_median_age: double, total_rooms: double, total_bedrooms: double, population: double, households: double, median_income: double, median_house_value: double]

In [4]:
df_train= spark.read.csv("/content/Cost_of_Living_Index_2022.csv",header=True,inferSchema= True)

In [5]:
df_train.show()

+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+
|Rank|             Country|Cost of Living Index|Rent Index|Cost of Living Plus Rent Index|Groceries Index|Restaurant Price Index|Local Purchasing Power Index|
+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+
|   1|         Afghanistan|               20.37|      2.72|                         12.09|          14.92|                 12.41|                       23.04|
|   2|             Albania|                35.5|      8.47|                         22.83|          29.32|                 25.82|                       30.19|
|   3|             Algeria|               26.87|      4.59|                         16.43|          28.82|                 14.48|                       24.63|
|   4|           Argentina|               34.6

In [6]:
#Checking attributes with their data type
df_train.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Cost of Living Index: double (nullable = true)
 |-- Rent Index: double (nullable = true)
 |-- Cost of Living Plus Rent Index: double (nullable = true)
 |-- Groceries Index: double (nullable = true)
 |-- Restaurant Price Index: double (nullable = true)
 |-- Local Purchasing Power Index: double (nullable = true)



In [7]:
#assembling the columns
from pyspark.ml.feature import VectorAssembler
featureassembler= VectorAssembler(inputCols=['Rent Index', 'Cost of Living Plus Rent Index', 'Groceries Index', 'Restaurant Price Index', 'Local Purchasing Power Index'], outputCol='Independant Features')

In [8]:
output=featureassembler.transform(df_train)

In [9]:
output.show()

+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+--------------------+
|Rank|             Country|Cost of Living Index|Rent Index|Cost of Living Plus Rent Index|Groceries Index|Restaurant Price Index|Local Purchasing Power Index|Independant Features|
+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+--------------------+
|   1|         Afghanistan|               20.37|      2.72|                         12.09|          14.92|                 12.41|                       23.04|[2.72,12.09,14.92...|
|   2|             Albania|                35.5|      8.47|                         22.83|          29.32|                 25.82|                       30.19|[8.47,22.83,29.32...|
|   3|             Algeria|               26.87|      4.59|                         16.43|          

In [10]:
output.columns

['Rank',
 'Country',
 'Cost of Living Index',
 'Rent Index',
 'Cost of Living Plus Rent Index',
 'Groceries Index',
 'Restaurant Price Index',
 'Local Purchasing Power Index',
 'Independant Features']

Create the final output with the dataset target variable

In [11]:
#The select() is a transformation function that is used to select the columns from the dataset
fin_output=output.select("Independant Features", "Cost of Living Index")

In [12]:
fin_output.show()

+--------------------+--------------------+
|Independant Features|Cost of Living Index|
+--------------------+--------------------+
|[2.72,12.09,14.92...|               20.37|
|[8.47,22.83,29.32...|                35.5|
|[4.59,16.43,28.82...|               26.87|
|[7.71,22.04,28.17...|               34.69|
|[11.61,23.45,27.5...|               33.89|
|[36.84,58.57,77.4...|               77.75|
|[27.13,50.46,65.8...|               71.04|
|[7.86,19.48,26.57...|               29.73|
|[35.34,61.19,70.5...|                84.0|
|[29.22,42.79,44.5...|               54.77|
|[4.42,19.67,30.41...|               33.13|
|[21.99,59.38,87.8...|               92.37|
|[9.81,21.01,27.24...|               30.89|
|[25.79,50.67,63.3...|               72.61|
|[11.64,32.71,48.7...|                51.3|
|[98.58,123.8,148....|              146.04|
|[10.18,23.24,31.2...|               34.77|
|[6.82,22.39,31.14...|               36.12|
|[10.21,26.12,35.1...|               40.17|
|[8.27,21.54,28.16...|          

In [13]:
#Appling Linear regression on the dataset
from pyspark.ml.regression import LinearRegression

In [14]:
train_x, test_x = fin_output.randomSplit([0.8, 0.2])

In [21]:
reg=LinearRegression(featuresCol = 'Independant Features', labelCol= 'Cost of Living Index')

In [16]:
reg= reg.fit(train_x)

In [17]:
reg.coefficients

DenseVector([-0.8826, 1.883, -0.0002, -0.0002, 0.0])

In [18]:
reg.intercept

-0.0007573925783394953

In [19]:
pred=reg.evaluate(test_x)

In [20]:
pred.predictions.show()

+--------------------+--------------------+------------------+
|Independant Features|Cost of Living Index|        prediction|
+--------------------+--------------------+------------------+
|[8.27,21.54,28.16...|               33.24| 33.24987339052216|
|[8.47,22.83,29.32...|                35.5|35.502116278348936|
|[8.73,22.11,31.27...|               33.92|  33.9159544116544|
|[9.07,24.64,34.02...|               38.38|38.378930579389376|
|[9.26,20.37,26.44...|               30.18|30.172988237161853|
|[9.44,19.66,25.35...|               28.68|28.677552445112475|
|[9.59,24.18,37.84...|               37.06|37.054252815575644|
|[10.04,20.75,25.4...|                30.2| 30.20080047596861|
|[10.28,26.31,34.7...|               40.46|  40.4557406526874|
|[11.22,22.54,30.7...|               32.53| 32.52909263783733|
|[11.64,32.71,48.7...|                51.3|51.302327181930195|
|[12.36,23.23,30.0...|               32.81|32.821570814314406|
|[12.55,24.66,34.8...|               35.35|35.344643954

In [None]:
pred.meanAbsoluteError, pred.rootMeanSquaredError

(0.006005548006720167, 0.007391279461881669)