<a href="https://colab.research.google.com/github/NushratRia/Big-Data/blob/main/Lab2_Linear_Regression_on_Cost_of_Living_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PySpark

Collecting PySpark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: PySpark
  Building wheel for PySpark (setup.py) ... [?25l[?25hdone
  Created wheel for PySpark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=bd40a3f82698e5384502e4d7259b638a3927a6852b3a94cbd4be605b50f7f93c
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built PySpark
Installing collected packages: PySpark
Successfully installed PySpark-3.4.1


In [None]:
import pyspark

Creating the first Pyspark session

In [None]:
#The entry point into all functionality in PySpark is the SparkSession class.
#To create a basic SparkSession, just use SparkSession.builder:

from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('LivingExpenses').getOrCreate()

In [None]:
df=spark.read.csv("/content/Cost_of_Living_Index_2022.csv", header=True, inferSchema=True)

Read the I/P file and check the columns

In [None]:
#If we don't use the header metadata, the headers are _c0, _c1 and _c2 which is not correct.
#Therefore we can fix that using header=True.

#Similarly, without inferschema, all columns would be treated as default datatype(String)
#Therefore to fix this, we put inferSchema=true , PySpark will automatically go through the csv file and infer the schema of each column
df_train= spark.read.csv("/content/Cost_of_Living_Index_2022.csv", header=True, inferSchema=True)
df_train.show()

+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+
|Rank|             Country|Cost of Living Index|Rent Index|Cost of Living Plus Rent Index|Groceries Index|Restaurant Price Index|Local Purchasing Power Index|
+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+
|   1|         Afghanistan|               20.37|      2.72|                         12.09|          14.92|                 12.41|                       23.04|
|   2|             Albania|                35.5|      8.47|                         22.83|          29.32|                 25.82|                       30.19|
|   3|             Algeria|               26.87|      4.59|                         16.43|          28.82|                 14.48|                       24.63|
|   4|           Argentina|               34.6

In [None]:
df_train.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Cost of Living Index: double (nullable = true)
 |-- Rent Index: double (nullable = true)
 |-- Cost of Living Plus Rent Index: double (nullable = true)
 |-- Groceries Index: double (nullable = true)
 |-- Restaurant Price Index: double (nullable = true)
 |-- Local Purchasing Power Index: double (nullable = true)



In [None]:
df_train.columns

['Rank',
 'Country',
 'Cost of Living Index',
 'Rent Index',
 'Cost of Living Plus Rent Index',
 'Groceries Index',
 'Restaurant Price Index',
 'Local Purchasing Power Index']

**Invoking VectorAssembler for grouping the required features**

In [None]:
#VectorAssembler is a transformer that combines a given list of columns into a single vector column.
#It is useful for combining raw features and features generated by different feature transformers
#into a single feature vector, in order to train ML models like logistic regression and decision trees.


#VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type.
#In each row, the values of the input columns will be concatenated into a vector in the specified order.


from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['Rent Index','Cost of Living Plus Rent Index','Groceries Index','Restaurant Price Index','Local Purchasing Power Index'], outputCol='Independant Features')

In [None]:
#transform each element of the input array + Independant features column
output=featureassembler.transform(df_train)

In [None]:
output.show()

+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+--------------------+
|Rank|             Country|Cost of Living Index|Rent Index|Cost of Living Plus Rent Index|Groceries Index|Restaurant Price Index|Local Purchasing Power Index|Independant Features|
+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+--------------------+
|   1|         Afghanistan|               20.37|      2.72|                         12.09|          14.92|                 12.41|                       23.04|[2.72,12.09,14.92...|
|   2|             Albania|                35.5|      8.47|                         22.83|          29.32|                 25.82|                       30.19|[8.47,22.83,29.32...|
|   3|             Algeria|               26.87|      4.59|                         16.43|          

In [None]:
output.columns

['Rank',
 'Country',
 'Cost of Living Index',
 'Rent Index',
 'Cost of Living Plus Rent Index',
 'Groceries Index',
 'Restaurant Price Index',
 'Local Purchasing Power Index',
 'Independant Features']

**Create the final output with the desired target variable**

In [None]:
# The .select() is a transformation function that is used to select the columns from DataFrame and Dataset
fin_output= output.select("Independant Features", "Cost of Living Index")

In [None]:
fin_output.show()

+--------------------+--------------------+
|Independant Features|Cost of Living Index|
+--------------------+--------------------+
|[2.72,12.09,14.92...|               20.37|
|[8.47,22.83,29.32...|                35.5|
|[4.59,16.43,28.82...|               26.87|
|[7.71,22.04,28.17...|               34.69|
|[11.61,23.45,27.5...|               33.89|
|[36.84,58.57,77.4...|               77.75|
|[27.13,50.46,65.8...|               71.04|
|[7.86,19.48,26.57...|               29.73|
|[35.34,61.19,70.5...|                84.0|
|[29.22,42.79,44.5...|               54.77|
|[4.42,19.67,30.41...|               33.13|
|[21.99,59.38,87.8...|               92.37|
|[9.81,21.01,27.24...|               30.89|
|[25.79,50.67,63.3...|               72.61|
|[11.64,32.71,48.7...|                51.3|
|[98.58,123.8,148....|              146.04|
|[10.18,23.24,31.2...|               34.77|
|[6.82,22.39,31.14...|               36.12|
|[10.21,26.12,35.1...|               40.17|
|[8.27,21.54,28.16...|          

**Baseline Model Training using Linear Regression**

In [None]:
from pyspark.ml.regression import LinearRegression

#train_test_split
#featuresCol will be the input column and labelCol will be the target column
train_X, test_X= fin_output.randomSplit([0.8, 0.2])
reg=LinearRegression(featuresCol='Independant Features', labelCol='Cost of Living Index')
reg=reg.fit(train_X)

In [None]:
#Getting the set of coefficients and intercepsts.
reg.coefficients

DenseVector([-0.8827, 1.8829, -0.0002, -0.0001, 0.0])

In [None]:
reg.intercept

-0.0018674836437061618

**Model Evaluation**

In [None]:
pred=reg.evaluate(test_X)

In [None]:
pred.predictions.show()

+--------------------+--------------------+------------------+
|Independant Features|Cost of Living Index|        prediction|
+--------------------+--------------------+------------------+
|[3.96,16.89,26.45...|               28.29| 28.29911528768681|
|[4.42,19.67,30.41...|               33.13|33.126856720937894|
|[5.16,20.79,39.39...|               34.58| 34.58053232074577|
|[8.18,18.03,23.47...|               26.72| 26.72165568727273|
|[8.27,21.54,28.16...|               33.24|33.249751771268414|
|[8.47,22.83,29.32...|                35.5| 35.50199834558498|
|[8.68,25.13,38.13...|               39.64|  39.6452283183071|
|[8.75,28.37,39.88...|               45.68| 45.68261164683714|
|[9.26,20.37,26.44...|               30.18|30.172987803813886|
|[9.82,23.65,38.29...|               35.85| 35.85349805845656|
|[9.87,24.29,39.45...|               37.02|37.014948364690405|
|[10.95,28.55,45.6...|               44.08| 44.07956686046864|
|[12.68,35.81,44.7...|               56.22|  56.2208822

In [None]:
pred.meanAbsoluteError, pred.meanSquaredError

(0.005114915497241561, 4.395970631337705e-05)