In [3]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=fa27a66233d66a4d611df5d7da7d2e33a38807026c9a681509ab0f48e6539d05
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


### LOADING:

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
import logging

In [5]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Boston")
    .getOrCreate()
)

In [6]:
# Read the CSV: the first row holds the column names, and Spark infers each
# column's type from the values.
data = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/content/boston.csv")
)

In [13]:
# Preview the first 20 rows, with long cell values truncated.
# NOTE(review): this cell carries execution count 13 and its captured output
# has no CHAS column, although the schema printed afterwards lists CHAS. The
# output appears to have been recorded after CHAS was dropped; re-run the
# notebook top to bottom to refresh it.
data.show(n=20, truncate=True)

+-----------+----+-----------+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|       CRIM|  ZN|      INDUS|        NOX|         RM|        AGE|        DIS|RAD|TAX|         PT|          B|      LSTAT|         MV|
+-----------+----+-----------+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|    0.00632|18.0|2.309999943|0.537999988|6.574999809|65.19999695|4.090000153|  1|296|15.30000019|396.8999939|4.980000019|       24.0|
|0.027310001| 0.0|7.070000172|0.469000012|6.421000004|78.90000153|4.967100143|  2|242|17.79999924|396.8999939|9.140000343|21.60000038|
|    0.02729| 0.0|7.070000172|0.469000012|7.184999943|61.09999847|4.967100143|  2|242|17.79999924|392.8299866| 4.03000021|34.70000076|
|0.032370001| 0.0|2.180000067|0.458000004|6.998000145|45.79999924|6.062200069|  3|222|18.70000076|394.6300049|2.940000057|33.40000153|
|0.069049999| 0.0|2.180000067|0.458000004|7.146999836|5

In [7]:
# Print the inferred schema as a tree: column name, data type and nullability.
data.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PT: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MV: double (nullable = true)



### EDA:

In [8]:
# Total number of rows in the loaded dataset.
n_rows = data.count()
n_rows

506

In [10]:
from pyspark.sql.functions import col, count, isnull, when

# Count the nulls of every column in one aggregation (a single Spark job)
# instead of launching a separate filter + count job per column.
# `when(cond, 1)` yields NULL where the condition is false, and `count`
# ignores NULLs, so each aggregate is the number of null rows in that column.
# NOTE: `isnull` does not flag NaN values in floating-point columns.
null_counts = data.select(
    [count(when(isnull(col(name)), 1)).alias(name) for name in data.columns]
).first()

for name in data.columns:
    print(f"Nulls in column {name}: {null_counts[name]}")

Nulls in column CRIM: 0
Nulls in column ZN: 0
Nulls in column INDUS: 0
Nulls in column CHAS: 0
Nulls in column NOX: 0
Nulls in column RM: 0
Nulls in column AGE: 0
Nulls in column DIS: 0
Nulls in column RAD: 0
Nulls in column TAX: 0
Nulls in column PT: 0
Nulls in column B: 0
Nulls in column LSTAT: 0
Nulls in column MV: 0


### Q3. CORRELATION BETWEEN DEPENDENT AND INDEPENDENT VARIABLES:

In [11]:
# Candidate predictor columns.
independent_vars = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE','DIS','RAD','TAX','PT','B','LSTAT']

# Target column.
dependent_var = 'MV'

# Correlation of each predictor with the target, via DataFrame.stat.corr.
# The loop variable is deliberately NOT named `col`: that name is bound to
# pyspark.sql.functions.col by the null-check cell, and overwriting it with a
# string makes that cell fail ("'str' object is not callable") when re-run.
for feature in independent_vars:
    correlation = data.stat.corr(feature, dependent_var)
    print(f"Correlation between {feature} and {dependent_var}: {correlation}")

Correlation between CRIM and MV: -0.3883046116575089
Correlation between ZN and MV: 0.360445344637529
Correlation between INDUS and MV: -0.48372517128143366
Correlation between CHAS and MV: 0.1752601777529185
Correlation between NOX and MV: -0.4273207763683772
Correlation between RM and MV: 0.6953599371272672
Correlation between AGE and MV: -0.3769545671428867
Correlation between DIS and MV: 0.24992873873512172
Correlation between RAD and MV: -0.38162623156691683
Correlation between TAX and MV: -0.46853593528654536
Correlation between PT and MV: -0.5077867038116086
Correlation between B and MV: 0.3334608226834165
Correlation between LSTAT and MV: -0.7376627294671615


In [12]:
# Remove the CHAS column and rebind `data` to the result.
# NOTE(review): presumably CHAS is dropped because it has the weakest
# correlation with MV in the output above (about 0.175) - confirm the intent.
# Because `data` is rebound, the correlation cell (which still lists 'CHAS')
# cannot be re-run after this cell without reloading the data.
data = data.drop("CHAS")

In [14]:
# Preview the first 20 rows again to confirm CHAS is no longer present.
data.show(n=20, truncate=True)

+-----------+----+-----------+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|       CRIM|  ZN|      INDUS|        NOX|         RM|        AGE|        DIS|RAD|TAX|         PT|          B|      LSTAT|         MV|
+-----------+----+-----------+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|    0.00632|18.0|2.309999943|0.537999988|6.574999809|65.19999695|4.090000153|  1|296|15.30000019|396.8999939|4.980000019|       24.0|
|0.027310001| 0.0|7.070000172|0.469000012|6.421000004|78.90000153|4.967100143|  2|242|17.79999924|396.8999939|9.140000343|21.60000038|
|    0.02729| 0.0|7.070000172|0.469000012|7.184999943|61.09999847|4.967100143|  2|242|17.79999924|392.8299866| 4.03000021|34.70000076|
|0.032370001| 0.0|2.180000067|0.458000004|6.998000145|45.79999924|6.062200069|  3|222|18.70000076|394.6300049|2.940000057|33.40000153|
|0.069049999| 0.0|2.180000067|0.458000004|7.146999836|5

In [15]:
# Build the modelling frame: the target MV is renamed to "label", followed by
# the twelve predictor columns in their original order.
predictor_names = [
    "CRIM", "ZN", "INDUS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PT", "B", "LSTAT",
]
data1 = data.select(
    data["MV"].alias("label"),
    *[data[name] for name in predictor_names],
)

### FEATURE ASSEMBLING:

In [17]:
# Pack the twelve predictor columns into one vector column named "features".
assembler = VectorAssembler(
    inputCols=[
        "CRIM", "ZN", "INDUS", "NOX", "RM", "AGE",
        "DIS", "RAD", "TAX", "PT", "B", "LSTAT",
    ],
    outputCol="features",
)

# Keep only the two columns the regression reads: the target and the vector.
assembled = assembler.transform(data1)
data2 = assembled.select("label", "features")

### Q4. MODEL BUILDING:

In [18]:
# Linear regression reading the "features" / "label" columns (the estimator's
# default column names, spelled out here for the reader).
lr = LinearRegression(featuresCol="features", labelCol="label")

# The model is fit on every row of data2; this cell makes no train/test split.
lrModel = lr.fit(data2)

In [19]:
# Report the fitted weights (one per assembled feature, in inputCols order)
# together with the intercept term.
coefficients = lrModel.coefficients
intercept = lrModel.intercept
print(f"Coefficients: {coefficients} Intercept: {intercept}")

Coefficients: [-0.11313907399132583,0.04705245542131341,0.040311506977772936,-17.36700309454723,3.8504914637418364,0.0027837556944060094,-1.4853738430173982,0.3283110497824966,-0.013755828222706521,-0.9909580560187735,0.009741451123718657,-0.5341576394742901] Intercept: 36.89196286592349


In [20]:
# Training summary attached to the fitted model; later cells read from it.
trainingSummary = lrModel.summary

# NOTE(review): the recorded values (0 iterations, objective history [0.0])
# presumably mean a closed-form solver was used rather than an iterative
# one - confirm against the LinearRegression solver documentation.
total_iterations = trainingSummary.totalIterations
objective_history = trainingSummary.objectiveHistory
print(f"numIterations: {total_iterations}")
print(f"objectiveHistory: {objective_history}")

numIterations: 0
objectiveHistory: [0.0]


In [26]:
# Display the first 20 training residuals from the model summary.
residuals = trainingSummary.residuals
residuals.show(n=20, truncate=True)

+--------------------+
|           residuals|
+--------------------+
|  -6.212370690500425|
| -3.6672336730292585|
|    3.85064222896559|
|   4.635238660816807|
|   8.096805941870254|
|   3.303628766669327|
|-0.37727258201447356|
|   7.287065344823542|
|   4.820608385585643|
| -0.2775169112028202|
| -4.2626066242455956|
| -2.9822096233181945|
|  0.6127136540275799|
|  0.7740615451797765|
| -1.1862807001587008|
|   0.545700556871104|
|  2.5566531228394354|
|  0.5332710284743669|
|   4.092257934498541|
|-0.26375695634771645|
+--------------------+
only showing top 20 rows



### Q5. PRINTING RESULTS:

In [28]:
# Root mean squared error, taken from the training summary. It is measured on
# the same rows the model was fit on.
rmse = trainingSummary.rootMeanSquaredError
print(f"RMSE: {rmse}")

RMSE: 4.725206763042685


In [27]:
# Mean squared error, taken from the training summary. It is measured on the
# same rows the model was fit on.
mse = trainingSummary.meanSquaredError
print(f"MSE: {mse}")

MSE: 22.32757895350433


In [24]:
# Coefficient of determination (R^2), taken from the training summary. It is
# measured on the same rows the model was fit on.
r_squared = trainingSummary.r2
print(f"r2: {r_squared}")

r2: 0.73551650979604
