In [1]:
import findspark
findspark.init('/opt/spark')

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext,SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import os
import subprocess
import pandas as pd

from functools import reduce
from datetime import date

# Lectura de datos

In [3]:
filename="hdfs://localhost:9000"+"/user/hive/warehouse/Bicicletas/muestra.csv"

In [4]:
filename

'hdfs://localhost:9000/user/hive/warehouse/Bicicletas/muestra.csv'

In [4]:
sparkSession = SparkSession.builder.appName("bigdatita").getOrCreate()

In [6]:
df = sparkSession.read.csv(filename,inferSchema=False,header=True)

In [7]:
df.printSchema()

root
 |-- trip_id: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- week: string (nullable = true)
 |-- day: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- usertype: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- starttime: string (nullable = true)
 |-- stoptime: string (nullable = true)
 |-- tripduration: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- events: string (nullable = true)
 |-- from_station_id: string (nullable = true)
 |-- from_station_name: string (nullable = true)
 |-- latitude_start: string (nullable = true)
 |-- longitude_start: string (nullable = true)
 |-- dpcapacity_start: string (nullable = true)
 |-- to_station_id: string (nullable = true)
 |-- to_station_name: string (nullable = true)
 |-- latitude_end: string (nullable = true)
 |-- longitude_end: string (nullable = true)
 |-- dpcapacity_end: string (nullable = true)



# Ingeniería de datos

In [8]:
aux=df.select("year","week","tripduration","from_station_id","to_station_id")
#aux.show(5)

In [9]:
aux=aux.withColumn("ruta" ,F.concat_ws("|","from_station_id","to_station_id") )
aux=aux.drop("from_station_id","to_station_id")

In [10]:
# Catalogo de fechas
alfa=aux.select("year","week")
alfa=alfa.drop_duplicates()
alfa=alfa.withColumn("ancla2",F.concat("year","week"))
alfa=alfa.orderBy("year","week")
alfa=alfa.drop("year","week")
alfa=alfa.withColumn("id_fh",F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
#alfa.show(5)

In [11]:
fh_i=1
fh_f=alfa.count()

In [12]:
aux=aux.withColumn("ancla",F.concat("year","week") ) 
aux=aux.drop("year","week")

In [13]:
aux=aux.join(alfa, aux.ancla == alfa.ancla2, "inner")
aux=aux.drop("ancla2")

In [14]:
aux=aux.withColumn("duracion",F.col("tripduration").cast('double') )
aux=aux.drop("tripduration")

In [15]:
aux=aux.withColumn("viaje",F.lit(1))

In [16]:
#aux.show(5)

+-------+------+-----+------------------+-----+
|   ruta| ancla|id_fh|          duracion|viaje|
+-------+------+-----+------------------+-----+
|337|175|201427|   20|18.966666666666665|    1|
|140|212|201427|   20|               5.8|    1|
|  23|94|201427|   20|             10.45|    1|
| 47|287|201427|   20| 8.983333333333333|    1|
| 210|61|201427|   20|              3.55|    1|
+-------+------+-----+------------------+-----+
only showing top 5 rows



In [17]:
vobs=12
vdes=1
step=3
anclai,anclaf=fh_i+vobs-1,fh_f-vdes # Cada ancal es una semana
anclai,anclaf

(12, 209)

In [18]:
df=aux.drop("ancla")
um=['ruta','id_fh']
df.show(5)

In [19]:
def ing(df,k,ancla):
    u=df.filter( ( df['id_fh']>= (ancla-k+1) ) & (df['id_fh']<=ancla) ).orderBy("id_fh")
    expr = [F.sum(F.col('viaje')).alias(f'x_num_tot_viajes_{k}')]
    expr.append(F.mean(F.col('duracion')).alias(f'x_duracion_prom_viaje_{k}'))
    u = u.groupBy('ruta').agg(*expr).withColumn('id_fh',F.lit(ancla))
    return u

In [20]:
def ing_tgt(df,ancla):
    u=df.filter( df['id_fh']== ancla + 1 ).orderBy("id_fh")
    expr = [F.sum(F.col('viaje')).alias('prediccion')]
    
    u = u.groupBy('ruta').agg(*expr).withColumn('id_fh',F.lit(ancla))
    return u

In [21]:
anclaf=23
step=3
u= reduce(lambda x,y:x.union(y),map(lambda ancla:reduce(lambda x,y:x.join(y,um,'outer'),
                       map(lambda k:ing(df,k,ancla),
                           range(step,vobs+step,step)) ).join(ing_tgt(df,ancla),um,how='inner'),
                                    range(anclai,anclaf+1)))

In [23]:
u.printSchema()

root
 |-- ruta: string (nullable = true)
 |-- id_fh: integer (nullable = true)
 |-- x_num_tot_viajes_3: long (nullable = true)
 |-- x_duracion_prom_viaje_3: double (nullable = true)
 |-- x_num_tot_viajes_6: long (nullable = true)
 |-- x_duracion_prom_viaje_6: double (nullable = true)
 |-- x_num_tot_viajes_9: long (nullable = true)
 |-- x_duracion_prom_viaje_9: double (nullable = true)
 |-- x_num_tot_viajes_12: long (nullable = true)
 |-- x_duracion_prom_viaje_12: double (nullable = true)
 |-- prediccion: long (nullable = true)



In [25]:
tad=u.join(alfa,'id_fh',"inner")

In [27]:
tad=tad.withColumnRenamed('ancla2','ancla')

In [28]:
tad.printSchema()

root
 |-- id_fh: integer (nullable = true)
 |-- ruta: string (nullable = true)
 |-- x_num_tot_viajes_3: long (nullable = true)
 |-- x_duracion_prom_viaje_3: double (nullable = true)
 |-- x_num_tot_viajes_6: long (nullable = true)
 |-- x_duracion_prom_viaje_6: double (nullable = true)
 |-- x_num_tot_viajes_9: long (nullable = true)
 |-- x_duracion_prom_viaje_9: double (nullable = true)
 |-- x_num_tot_viajes_12: long (nullable = true)
 |-- x_duracion_prom_viaje_12: double (nullable = true)
 |-- prediccion: long (nullable = true)
 |-- ancla: string (nullable = true)



In [29]:
tad=tad.select('ruta','ancla','x_num_tot_viajes_3','x_num_tot_viajes_6','x_num_tot_viajes_9',
           'x_num_tot_viajes_12','x_duracion_prom_viaje_3','x_duracion_prom_viaje_6',
           'x_duracion_prom_viaje_9','x_duracion_prom_viaje_12','prediccion')

In [31]:
tad.printSchema()

root
 |-- ruta: string (nullable = true)
 |-- ancla: string (nullable = true)
 |-- x_num_tot_viajes_3: long (nullable = true)
 |-- x_num_tot_viajes_6: long (nullable = true)
 |-- x_num_tot_viajes_9: long (nullable = true)
 |-- x_num_tot_viajes_12: long (nullable = true)
 |-- x_duracion_prom_viaje_3: double (nullable = true)
 |-- x_duracion_prom_viaje_6: double (nullable = true)
 |-- x_duracion_prom_viaje_9: double (nullable = true)
 |-- x_duracion_prom_viaje_12: double (nullable = true)
 |-- prediccion: long (nullable = true)



In [32]:
tad.show()

+-------+------+------------------+------------------+------------------+-------------------+-----------------------+-----------------------+-----------------------+------------------------+----------+
|   ruta| ancla|x_num_tot_viajes_3|x_num_tot_viajes_6|x_num_tot_viajes_9|x_num_tot_viajes_12|x_duracion_prom_viaje_3|x_duracion_prom_viaje_6|x_duracion_prom_viaje_9|x_duracion_prom_viaje_12|prediccion|
+-------+------+------------------+------------------+------------------+-------------------+-----------------------+-----------------------+-----------------------+------------------------+----------+
|287|289|201421|                 1|                 1|                 1|                  1|      22.18333333333333|      22.18333333333333|      22.18333333333333|       22.18333333333333|         1|
| 91|195|201421|                 1|                 1|                 1|                  1|                   19.5|                   19.5|                   19.5|                    19.5|  

In [35]:
tad.toPandas().to_csv('mycsv.csv')