# Importação e pré-análise

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 39.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=0d81c43dc1f0c2911e32fd404f0716445f8e643449132fdb46b714765466e5f4
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this notebook under the app name 'lin_reg'.
spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [None]:
# Load the salary CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    path='Salary.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first five rows of the loaded data.
df.show(5)

+---------------+------+
|YearsExperience|Salary|
+---------------+------+
|            1.1| 39343|
|            1.3| 46205|
|            1.5| 37731|
|            2.0| 43525|
|            2.2| 39891|
+---------------+------+
only showing top 5 rows



In [None]:
# Check the column types that schema inference produced.
df.printSchema()

root
 |-- YearsExperience: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [None]:
# Look at the first record as a Row object.
df.head()

Row(YearsExperience=1.1, Salary=39343)

# Configurando o DataFrame

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names available as model inputs / label.
df.columns

['YearsExperience', 'Salary']

In [None]:
# Pack the single predictor column into a vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['YearsExperience'],
)
assembler

VectorAssembler_5036c3ccdb37

In [None]:
# Apply the assembler to the loaded data; the result is still a Spark DataFrame.
output = assembler.transform(df)
type(output)

pyspark.sql.dataframe.DataFrame

In [None]:
# Inspect the assembled feature vectors next to the Salary label (show() prints 20 rows by default).
output.select('features', 'Salary').show()

+--------+------+
|features|Salary|
+--------+------+
|   [1.1]| 39343|
|   [1.3]| 46205|
|   [1.5]| 37731|
|   [2.0]| 43525|
|   [2.2]| 39891|
|   [2.9]| 56642|
|   [3.0]| 60150|
|   [3.2]| 54445|
|   [3.2]| 64445|
|   [3.7]| 57189|
|   [3.9]| 63218|
|   [4.0]| 55794|
|   [4.0]| 56957|
|   [4.1]| 57081|
|   [4.5]| 61111|
|   [4.9]| 67938|
|   [5.1]| 66029|
|   [5.3]| 83088|
|   [5.9]| 81363|
|   [6.0]| 93940|
+--------+------+
only showing top 20 rows



In [None]:
# Keep only what the model needs: the feature vector and the Salary label.
final_df = output.select('features', 'Salary')
# Feature vector of the fourth of the first five rows.
final_df.head(5)[3]['features']

DenseVector([2.0])

In [None]:
# Roughly 80/20 train/test split. The fixed seed keeps the split (and therefore
# every metric computed from it) stable across Restart & Run All.
df_train, df_test = final_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Summary statistics for the training split; only Salary appears because the
# vector-typed 'features' column is not part of the describe() output.
df_train.describe().show()

+-------+-----------------+
|summary|           Salary|
+-------+-----------------+
|  count|               25|
|   mean|         92691.44|
| stddev|31802.71816747744|
|    min|            37731|
|    max|           139465|
+-------+-----------------+



In [None]:
# Summary statistics for the test split, for comparison with the training split.
df_test.describe().show()

+-------+------------------+
|summary|            Salary|
+-------+------------------+
|  count|                10|
|   mean|           62081.0|
| stddev|21613.053483691023|
|    min|             39343|
|    max|            105582|
+-------+------------------+



# Treinando o modelo

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Linear regression estimator: predicts Salary from the 'features' vector column
# ('features' is the estimator's default featuresCol, spelled out here for clarity).
lm = LinearRegression(featuresCol='features', labelCol='Salary')
lm

LinearRegression_d52134fbfbdf

In [None]:
# Fit the regression on the training split only.
model = lm.fit(df_train)

In [None]:
import pandas as pd

In [None]:
# Show the fitted coefficient as a small labelled table (one row per input feature).
pd.DataFrame(
    data={'Coefficients': model.coefficients},
    index=['YearsExperience'],
)

Unnamed: 0,Coefficients
YearsExperience,8591.039056


In [None]:
# Evaluate the fitted model on the held-out test split; `res` holds the evaluation summary.
res = model.evaluate(df_test)

In [None]:
# Per-row residuals on the test split (label minus prediction).
res.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -530.7318847327551|
|  4613.060304100451|
|-4080.6670349833294|
| -3469.913901984095|
| -8993.745146651272|
| -7830.745146651272|
|  -8565.84905223467|
| -4581.680296901846|
|  11970.17674168077|
|-2160.9404258211434|
+-------------------+





In [None]:
# Keep only the feature vectors so predictions are produced without the label column.
data_t = df_test.select('features')

In [None]:
# Score the test features; transform() adds a 'prediction' column.
predict = model.transform(data_t)

In [None]:
# Display each test feature vector with its predicted salary.
predict.show()

+--------+------------------+
|features|        prediction|
+--------+------------------+
|   [1.1]|39873.731884732755|
|   [1.3]| 41591.93969589955|
|   [2.0]| 47605.66703498333|
|   [3.2]|57914.913901984095|
|   [4.0]| 64787.74514665127|
|   [4.0]| 64787.74514665127|
|   [4.1]| 65646.84905223467|
|   [4.9]| 72519.68029690185|
|   [6.0]| 81969.82325831923|
|   [9.0]|107742.94042582114|
+--------+------------------+



# Avaliando o modelo

In [None]:
# Report the test-set error metrics and goodness of fit, one per line.
print(
    f'Mae: {res.meanAbsoluteError}',
    f'MSE: {res.meanSquaredError}',
    f'RMSE: {res.rootMeanSquaredError}',
    f'R2: {res.r2}',
    f'Adj R2: {res.r2adj}',
    sep='\n',
)

Mae: 5679.75099357416
MSE: 43478252.803474866
RMSE: 6593.804122316257
R2: 0.8965817183098088
Adj R2: 0.883654433098535
