## Dependencias

In [45]:
from pyspark import SparkContext
from pyspark.sql import SQLContext,SparkSession
from pyspark.sql import functions as F

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

import os
import subprocess
import numpy as np
import pandas as pd

import seaborn as sns

from functools import reduce
from datetime import date

import findspark

## Lectura de datos

In [46]:
# Load the training table from the local pickle file 'Xt.pkl' into pandas.
# NOTE(review): unpickling can execute arbitrary code if the file is not
# trusted — confirm 'Xt.pkl' is produced by a trusted pipeline.
tad = pd.read_pickle('Xt.pkl')

In [47]:
# Sanity check: display the Python type of the object loaded into `tad`.
type(tad)

pandas.core.frame.DataFrame

In [48]:
# Obtain the Spark session for this notebook under the application name
# "bigdatita"; getOrCreate() reuses an already-active session if there is one.
spark = (
    SparkSession.builder
    .appName("bigdatita")
    .getOrCreate()
)

In [49]:
# Convert the pandas frame `tad` into a Spark DataFrame.
df = spark.createDataFrame(data=tad)

In [50]:
# Discard the 'ESTACION' and 'ancla' columns from the Spark DataFrame.
df = df.drop(*['ESTACION', 'ancla'])

In [51]:
# Preview the first two rows of the Spark DataFrame.
df.show(n=2)

+------------------+-----------------+---------------+------------------+--------------+--------------+-----------------+---------------+------------------+
|      CO_x_desv_30|    NO2_x_desv_20|NO2_x_minimo_20|      O3_x_desv_10|O3_x_maximo_30|O3_x_minimo_20|    SO2_x_desv_20|SO2_x_minimo_20|                 y|
+------------------+-----------------+---------------+------------------+--------------+--------------+-----------------+---------------+------------------+
|0.4541449982334068|13.26256804696009|            6.0|26.755381278360197|         102.0|           0.0|7.653916824425838|            0.0|25.333333333333332|
|0.3011073712505642|9.734586620167187|            1.0|20.638525421246108|         110.0|           1.0|3.385170397962881|            0.0|41.958333333333336|
+------------------+-----------------+---------------+------------------+--------------+--------------+-----------------+---------------+------------------+
only showing top 2 rows



## Variables

In [52]:
# Feature names: every column label of `tad` that contains the substring
# '_x_', collected in sorted order so the feature ordering is deterministic.
var = sorted(label for label in tad.columns if '_x_' in str(label))

## Regresión con MLlib

### Vectorización

In [53]:
# Transformer that packs the columns listed in `var` into a single vector
# column named 'features'.
assembler = (
    VectorAssembler()
    .setInputCols(var)
    .setOutputCol('features')
)

In [54]:
# Training frame with the assembled 'features' vector column appended.
v = assembler.transform(dataset=df)

### Regresión

In [55]:
mod = LinearRegression(featuresCol='features',labelCol='y')

In [56]:
mod = mod.fit(v)

In [57]:
# Report the fitted coefficients and intercept, one per line.
print(
    "Coefficients: " + str(mod.coefficients),
    "Intercept: " + str(mod.intercept),
    sep="\n",
)

Coefficients: [-8.206280887433351,-0.9149990844478887,0.4508611135816686,0.42865940499691785,0.03284174057273983,1.3653983580954356,-0.42426494020808747,-2.373808350126092]
Intercept: 26.93698727463204


### Evaluación

In [58]:
# Load the second table from 'Xv.pkl', convert it to a Spark DataFrame and
# discard the 'ESTACION' and 'ancla' columns, as was done for the training frame.
Xv = (
    spark.createDataFrame(pd.read_pickle('Xv.pkl'))
    .drop('ESTACION', 'ancla')
)

In [59]:
# Append the assembled feature vector to the `Xv` frame.
# The assembler's output column is dropped first so the cell is idempotent:
# VectorAssembler refuses to transform a frame that already contains its output
# column, so re-running `Xv = assembler.transform(Xv)` used to fail. On the
# first run the column is absent and DataFrame.drop is a no-op.
Xv = assembler.transform(Xv.drop(assembler.getOutputCol()))
Xv.printSchema()

root
 |-- CO_x_desv_30: double (nullable = true)
 |-- NO2_x_desv_20: double (nullable = true)
 |-- NO2_x_minimo_20: double (nullable = true)
 |-- O3_x_desv_10: double (nullable = true)
 |-- O3_x_maximo_30: double (nullable = true)
 |-- O3_x_minimo_20: double (nullable = true)
 |-- SO2_x_desv_20: double (nullable = true)
 |-- SO2_x_minimo_20: double (nullable = true)
 |-- y: double (nullable = true)
 |-- features: vector (nullable = true)



In [60]:
# Evaluator computing the mean absolute error ('mae') between the 'y' label
# column and the 'prediction' column.
ev = RegressionEvaluator(
    metricName='mae',
    labelCol='y',
    predictionCol='prediction',
)

In [61]:
# Score the model on `v`, the frame it was fitted on, using the metric
# configured in `ev`.
ev.evaluate(
    mod.transform(v).select('features', 'y', 'prediction')
)

5.475177145360918

In [62]:
# Score the model on the `Xv` frame (loaded from 'Xv.pkl') using the metric
# configured in `ev`.
ev.evaluate(
    mod.transform(Xv).select('features', 'y', 'prediction')
)

5.156807582757976