In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 68kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 42.7MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=d153e8a716de8b006a764301dad0adbd531cf7880e1488aaad83319ce80bb902
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
import numpy as np
import os

In [3]:
os.environ["HADOOP_HOME"] = "C:/winutils"

In [4]:
# Creating spark session
spark = SparkSession.builder.appName("ICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("/content/drive/MyDrive/imports-85.csv")

In [6]:
import pandas as pd
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [8]:
from pyspark.sql.types import *
data = data.withColumnRenamed("wheel-base", "label").select("length", "width", "height","label")
data = data.withColumn("label", data["label"].cast(IntegerType()))

In [9]:
import pandas as pd
pd.DataFrame(data.take(5), columns=data.columns).transpose()


Unnamed: 0,0,1,2,3,4
length,168.8,168.8,171.2,176.6,176.6
width,64.1,64.1,65.5,66.2,66.4
height,48.8,48.8,52.4,54.3,54.3
label,88.0,88.0,94.0,99.0,99.0


In [10]:
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[:2], outputCol="features")
data = assembler.transform(data)

In [11]:
data = data.select("label", "features")


# Linear Regression

In [12]:
from pyspark.ml.regression import LinearRegression
model1 = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [14]:
# Fit the model
model = model1.fit(data)

In [15]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))


Coefficients: [0.3337891635819007,0.5150505011624908]
Intercept: 6.2559533571945725


In [16]:
# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 6
objectiveHistory: [0.5, 0.39634946019346834, 0.1536353728360829, 0.15116447772451408, 0.14653853943833373, 0.14653853141273573, 0.14653853141271792]
+--------------------+
|           residuals|
+--------------------+
|  -7.614301294335078|
|  -7.614301294335078|
| -3.1364659885591237|
|-0.29946282271511393|
| -0.4024729229476236|
| -0.5846202873387085|
| -2.3517309624286753|
| -2.3517309624286753|
| -2.3517309624286753|
| -1.7091113364223958|
|  2.3548500461959776|
|  2.3548500461959776|
|  2.3548500461959776|
|  2.3548500461959776|
| -0.7989838019444448|
| -0.7989838019444448|
| -2.9162222883000624|
|   1.470500884750379|
|   3.588850441301048|
|   2.949104166452699|
+--------------------+
only showing top 20 rows

RMSE: 2.837581
r2: 0.780117


# Logistic Regression

In [17]:
# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("/content/drive/MyDrive/imports-85.csv")

In [19]:
import pandas as pd
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [20]:
from pyspark.sql.functions import col, when

data = data.withColumn("label", when(col("num-of-doors") == "four", 1).otherwise(0)).select("length", "width", "height","label")

In [21]:
import pandas as pd
pd.DataFrame(data.take(5), columns=data.columns).transpose()

Unnamed: 0,0,1,2,3,4
length,168.8,168.8,171.2,176.6,176.6
width,64.1,64.1,65.5,66.2,66.4
height,48.8,48.8,52.4,54.3,54.3
label,0.0,0.0,0.0,1.0,1.0


In [22]:
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[:2], outputCol="features")
data = assembler.transform(data)

In [23]:
data = data.select("label", "features")

In [24]:
from pyspark.ml.classification import LogisticRegression
model1 = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [25]:
# Fit the model
model = model1.fit(data)

In [26]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

Coefficients: [0.0,0.0]
Intercept: 0.22533894187764542


In [30]:
# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary