In [1]:
import numpy as np
import pandas as pd 

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from functools import reduce

In [2]:
df = spark.read.csv("hdfs://bigdatita-m:8020/chicago",header=True,inferSchema=True)

In [3]:
df.printSchema()

root
 |-- taxi_id: string (nullable = true)
 |-- trip_start_timestamp: string (nullable = true)
 |-- trip_end_timestamp: string (nullable = true)
 |-- trip_seconds: string (nullable = true)
 |-- trip_miles: string (nullable = true)
 |-- pickup_census_tract: string (nullable = true)
 |-- dropoff_census_tract: string (nullable = true)
 |-- pickup_community_area: string (nullable = true)
 |-- dropoff_community_area: string (nullable = true)
 |-- fare: string (nullable = true)
 |-- tips: string (nullable = true)
 |-- tolls: string (nullable = true)
 |-- extras: string (nullable = true)
 |-- trip_total: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- company: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)



In [4]:
df = df.withColumn("fh",F.to_timestamp(F.col("trip_start_timestamp")))
df = df.withColumn("week",F.weekofyear(F.col("fh")))

In [5]:
df = df.select(*["taxi_id","week","trip_total","tips"]).withColumn("n",F.lit(1))

In [6]:
cat = df.select("week").drop_duplicates().toPandas()
cat = cat.sort_values(by="week").reset_index(drop=True)
wini,wfin = cat["week"].min(), cat["week"].max()
cat = spark.createDataFrame(cat)
wini,wfin

(1, 53)

In [7]:
vobs = 8
vdes = 3
anclai,anclaf = wini+vobs-1, wfin-vdes
anclai,anclaf 

(8, 50)

In [8]:
df = df.withColumn("ratio_tips",F.col("tips")/F.col("trip_total"))

In [9]:
varc = ["trip_total","tips","n","ratio_tips"]

In [10]:
def ing_X(df,varc,k,ancla):
    aux = df.filter((df["week"]<=ancla)&(df["week"]>=(ancla-k+1)))
    expr = [y(x).alias(f"v_{z}_{x}_{k}") for x in varc for y,z in zip([F.sum,F.min,F.max,F.avg,F.stddev],
                                                                      ["suma","minimo","maximo","media","desv"]
                                                                     )]
    aux = aux.groupBy("taxi_id").agg(*expr).withColumn("ancla",F.lit(ancla))
    return aux

In [11]:
step = 2

In [12]:
%%time
X = reduce(lambda x,y:x.union(y),map(lambda ancla:reduce(lambda x,y:x.join(y,["taxi_id","ancla"],"outer"),
       map(lambda k:ing_X(df,varc,k,ancla),range(step,vobs+step,step))),range(anclai,anclaf+1)))

CPU times: user 2.19 s, sys: 833 ms, total: 3.02 s
Wall time: 15.7 s


In [13]:
def ing_y(df,vdes,ancla):
    aux = df.filter((df["week"]>ancla)&(df["week"]<=(ancla+vdes))).select("taxi_id").drop_duplicates()
    aux = aux.withColumn("target",F.lit(0)).withColumn("ancla",F.lit(ancla))
    return aux

In [14]:
y = reduce(lambda x,y:x.union(y),map(lambda ancla:ing_y(df,vdes,ancla),range(anclai,anclaf+1)))

In [15]:
tad = X.join(y,["taxi_id","ancla"],"left")

In [16]:
tad  = tad.fillna({"target":1})

In [17]:
var = [v for v in tad.columns if v[:2]=="v_"]

In [18]:
assembler = VectorAssembler(inputCols=var,outputCol="features")

In [19]:
v = assembler.transform(tad)

IllegalArgumentException: 'Data type string of column v_minimo_trip_total_2 is not supported.\nData type string of column v_maximo_trip_total_2 is not supported.\nData type string of column v_minimo_tips_2 is not supported.\nData type string of column v_maximo_tips_2 is not supported.\nData type string of column v_minimo_trip_total_4 is not supported.\nData type string of column v_maximo_trip_total_4 is not supported.\nData type string of column v_minimo_tips_4 is not supported.\nData type string of column v_maximo_tips_4 is not supported.\nData type string of column v_minimo_trip_total_6 is not supported.\nData type string of column v_maximo_trip_total_6 is not supported.\nData type string of column v_minimo_tips_6 is not supported.\nData type string of column v_maximo_tips_6 is not supported.\nData type string of column v_minimo_trip_total_8 is not supported.\nData type string of column v_maximo_trip_total_8 is not supported.\nData type string of column v_minimo_tips_8 is not supported.\nData type string of column v_maximo_tips_8 is not supported.'

In [None]:
mod = LogisticRegression(featuresCol="features",labelCol="target")

tad.toPandas().to_csv("taxi_churn.csv",index=False)

In [None]:
print("Coefficients: " + str(mod.coefficients))
print("Intercept: " + str(mod.intercept))