In [1]:
# Instalação do PySpark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=354b48046d670a2cf866875349dbd5518d69bf724bab5c8d88e3f10637aaf2fc
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# Install Java (headless OpenJDK 8 JDK); -qq and the redirect to /dev/null keep the cell output quiet
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
# Download e extração do Spark
!wget https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz

--2024-01-15 13:23:26--  https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400395283 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.0-bin-hadoop3.tgz’


2024-01-15 13:23:30 (89.9 MB/s) - ‘spark-3.5.0-bin-hadoop3.tgz’ saved [400395283/400395283]



In [4]:
# Install findspark (imported and initialised in a later cell); -q keeps pip's output short
!pip install -q findspark

In [5]:
# Environment variables pointing at the Java and Spark installations
import os

# Spark release unpacked under /content; change this single value to switch versions
spark_version = "3.5.0"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Derive the install path from spark_version so the variable and the path cannot drift apart
os.environ["SPARK_HOME"] = f"/content/spark-{spark_version}-bin-hadoop3"

In [6]:
import findspark

# Initialise findspark before importing pyspark
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook: local[*] master, app name 'sparkcolab'
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('sparkcolab')
    .getOrCreate()
)

In [7]:
# Install the required libraries
# NOTE(review): gdown is not imported by any visible cell — confirm it is still needed
!pip install gdown



In [8]:
from google.colab import files

# Upload the data file; the result of the upload call is kept in `uploaded`
uploaded = files.upload()

Saving BostonHousing.csv to BostonHousing.csv


In [9]:
# List the working directory to check which files are present
!ls

BostonHousing.csv  sample_data	spark-3.5.0-bin-hadoop3  spark-3.5.0-bin-hadoop3.tgz


In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Load the CSV into a DataFrame: first row is the header, column types are inferred
dataset = spark.read.csv(
    'BostonHousing.csv',
    header=True,
    inferSchema=True,
)

In [11]:

# Print the column names and types of the loaded DataFrame
dataset.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [12]:

# Pack every feature column into a single vector column named 'Attributes'
feature_columns = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Attributes')

output = assembler.transform(dataset)

# Keep only the model input (feature vector) and the target column (medv)
finalized_data = output.select("Attributes", "medv")

finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [13]:
# Train/test split (weights 0.8 / 0.2). A fixed seed keeps the split, and
# therefore the fitted model and every metric computed from it, reproducible
# across runs instead of changing on each execution.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

# Unfitted estimator: linear regression from the 'Attributes' vector to 'medv'.
# It gets its own name so `regressor` always refers to the fitted model.
lr_estimator = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit on the training split; `regressor` is the fitted model used by later cells
regressor = lr_estimator.fit(train_data)

# Evaluate the fitted model on the test split; `pred` is the resulting summary
pred = regressor.evaluate(test_data)

# Show the test rows together with the model's prediction column
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.01432,100.0,1....|31.6| 33.78124294652607|
|[0.01439,60.0,2.9...|29.1|31.918429849752464|
|[0.02055,85.0,0.7...|24.7|25.633133305640627|
|[0.02187,60.0,2.9...|31.1| 32.28132197103747|
|[0.02543,55.0,3.7...|23.9|28.000551197304702|
|[0.03041,0.0,5.19...|18.5| 19.42382482566856|
|[0.03502,80.0,4.9...|28.5| 34.77091989953546|
|[0.03871,52.5,5.3...|23.2| 27.45259417517564|
|[0.03932,0.0,3.41...|22.0|27.116669245879137|
|[0.03961,0.0,5.19...|21.1| 20.71938495795158|
|[0.04113,25.0,4.8...|28.0|28.423600410962315|
|[0.04294,28.0,15....|20.6|27.398026660653716|
|[0.04666,80.0,1.5...|30.3|   32.227402393492|
|[0.04741,0.0,11.9...|11.9|22.885682947863362|
|[0.04819,80.0,3.6...|21.9|24.706187561830518|
|[0.04932,33.0,2.1...|28.2| 32.98340246712081|
|[0.04981,21.0,5.6...|23.4| 24.02635645256758|
|[0.0536,21.0,5.64...|25.0|27.275308791470422|
|[0.05497,0.0

In [14]:

# Coefficients of the fitted linear regression model
coeff = regressor.coefficients

# Intercept of the fitted model
intr = regressor.intercept

# %a renders the coefficient vector via ascii(); %f renders the intercept as a fixed-point float
print ("O coeficiente do modelo é : %a" %coeff)
print ("A interseção do modelo é : %f" %intr)

O coeficiente do modelo é : DenseVector([-0.1104, 0.0693, 0.0169, 3.2332, -17.8931, 2.9141, -0.0021, -1.6372, 0.3204, -0.013, -0.8035, 0.0082, -0.5568])
A interseção do modelo é : 40.899367


#### Análise estatística do modelo

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so the Python builtin eval() is not shadowed
# for the rest of the notebook. Its configured metric is RMSE.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error (the evaluator's configured metric)
rmse = evaluator.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)

# Mean Square Error (metricName supplied through the param map of this call)
mse = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(pred.predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 5.051
MSE: 25.511
MAE: 3.515
r2: 0.745
