In [None]:
# Display the `spark` object (presumably the SparkSession injected by the kernel — confirm).
spark

In [None]:
# Data source and sample size, named so they can be tuned in one place.
TRAIN_CSV_PATH = 's3a://linear-regression-mlc/train.csv'
SAMPLE_ROWS = 1_000_000  # 1M rows; the previous comment said 10M, which did not match the code

# Read through the SparkSession entry point (`spark`) rather than the legacy
# `sqlContext` handle; the first row is the header and column types are inferred.
df = spark.read.csv(TRAIN_CSV_PATH, header=True, inferSchema=True)
# Work on a smaller sample of the file, then drop every row containing a null.
df = df.limit(SAMPLE_ROWS)
df = df.dropna()
# Keep the cleaned sample cached for the repeated passes below.
df.cache()

In [None]:
# Print the first rows of the loaded sample for a quick visual check.
df.show()

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [None]:
# Baseline feature set: the four raw coordinates plus the passenger count.
inputCols = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
]
# Pack the selected columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=inputCols)
dataset = assembler.transform(df)

In [None]:
# Baseline regression of fare_amount on the raw features.
lr = LinearRegression(labelCol='fare_amount', featuresCol='features')
model = lr.fit(dataset)
# Scored on the same rows the model was fit on, so the metrics are in-sample.
summary = model.evaluate(dataset)

In [None]:
# r2 from the most recent evaluation summary (baseline model, uncleaned data).
summary.r2

In [None]:
# Largest pickup latitude in the sample; a quick probe for out-of-range coordinates.
df.agg({'pickup_latitude':'max'}).collect()

In [None]:
# NYC lies between 73 and 75 degrees West, and between 40 and 42 degrees North.
TOP, BOTTOM, LEFT, RIGHT = 42, 40, -75, -73

# Keep only trips whose pickup AND dropoff both fall inside that box
# (bounds are inclusive on every side).
for stage in ('pickup', 'dropoff'):
    lat = df[stage + '_latitude']
    lon = df[stage + '_longitude']
    df = df.filter((lat >= BOTTOM) & (lat <= TOP) & (lon <= RIGHT) & (lon >= LEFT))

In [None]:
# Number of rows remaining after the bounding-box filter.
df.count()

In [None]:
# Smallest passenger_count still present; checks for zero or negative values.
df.agg({'passenger_count':'min'}).collect()

In [None]:
# Discard trips that report no (or a negative number of) passengers.
has_passengers = df['passenger_count'] > 0
df = df.filter(has_passengers)

In [None]:
# Smallest fare_amount still present; checks for zero or negative fares.
df.agg({'fare_amount':'min'}).collect()

In [None]:
# Discard trips whose fare is zero or negative.
positive_fare = df['fare_amount'] > 0
df = df.filter(positive_fare)

In [None]:
# Number of rows remaining after the passenger-count and fare filters.
df.count()

In [None]:
# Same five raw features as the baseline, now assembled from the filtered rows.
inputCols = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
]
assembler = VectorAssembler(outputCol='featuresClean', inputCols=inputCols)
dataset = assembler.transform(df)

In [None]:
# Refit the regression on the cleaned feature vector.
lr = LinearRegression(labelCol='fare_amount', featuresCol='featuresClean')
model = lr.fit(dataset)
# In-sample evaluation: the model is scored on the rows it was trained on.
summary = model.evaluate(dataset)

In [None]:
# r2 of the model refit on the cleaned rows.
summary.r2

In [None]:
# Print the first rows of the filtered data.
df.show()

In [None]:
# Show the raw pickup_datetime values without truncating them.
df.select('pickup_datetime').show(truncate=False)

In [None]:
# Keep only the leading 19 characters of the pickup string (the
# 'yyyy-MM-dd HH:mm:ss' part, presumably dropping a trailing zone suffix — confirm).
pickup_text = df['pickup_datetime']
df = df.withColumn('datetime', pickup_text.substr(0, 19))
df.select('datetime').show(truncate=False)

In [None]:
from pyspark.sql.functions import to_timestamp

In [None]:
# Parse the trimmed text column into a timestamp column.
parsed = to_timestamp(df['datetime'])
df = df.withColumn('timestamp', parsed)
df.select('timestamp').show(truncate=False)

In [None]:
from pyspark.sql.functions import year,month,dayofweek, hour

# Derive calendar features from the parsed 'timestamp' column, one column per part.
for part_name, extract in (('year', year), ('month', month),
                           ('dayofweek', dayofweek), ('hour', hour)):
    df = df.withColumn(part_name, extract(df['timestamp']))
df.select('year', 'month', 'dayofweek', 'hour').show(truncate=False)

In [None]:
# Raw coordinates and passenger count, extended with the calendar features.
inputCols = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
    'year', 'month', 'dayofweek', 'hour',
]
assembler = VectorAssembler(outputCol='featuresCleanWithDate', inputCols=inputCols)
dataset = assembler.transform(df)

In [None]:
# Fit with the calendar features derived from the 'timestamp' column.
lr = LinearRegression(labelCol='fare_amount', featuresCol='featuresCleanWithDate')
model = lr.fit(dataset)
# In-sample evaluation on the training rows.
summary = model.evaluate(dataset)

In [None]:
from pyspark.sql.functions import from_utc_timestamp
# Treat 'timestamp' as UTC and convert it to New York wall-clock time.
# A region-based zone ID is used instead of the abbreviation 'EST': 'EST' is a
# fixed UTC-5 offset that never applies daylight saving time, which shifts the
# derived hour (and sometimes the day) by one for summer pickups.
df = df.withColumn('NYTime', from_utc_timestamp(df['timestamp'], 'America/New_York'))
df.select('NYTime').show()

In [None]:
from pyspark.sql.functions import year,month,dayofweek, hour

# Recompute the calendar features from local New York time, overwriting the
# earlier columns of the same names.
ny_time = df['NYTime']
df = (
    df.withColumn('year', year(ny_time))
      .withColumn('month', month(ny_time))
      .withColumn('dayofweek', dayofweek(ny_time))
      .withColumn('hour', hour(ny_time))
)
df.select('year', 'month', 'dayofweek', 'hour').show()

In [None]:
# Same column list as before; the calendar columns now hold New York local values.
inputCols = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
    'year', 'month', 'dayofweek', 'hour',
]
assembler = VectorAssembler(outputCol='featuresCleanWithDate', inputCols=inputCols)
dataset = assembler.transform(df)

In [None]:
# Refit using the calendar features derived from New York local time.
lr = LinearRegression(labelCol='fare_amount', featuresCol='featuresCleanWithDate')
model = lr.fit(dataset)
# In-sample evaluation on the training rows.
summary = model.evaluate(dataset)

In [None]:
# r2 of the model that includes the NYTime-based calendar features.
summary.r2

In [None]:
from pyspark.sql.functions import abs as psabs

# Pickup (x1, y1) and dropoff (x2, y2) coordinates as column expressions.
x1, y1 = df['pickup_longitude'], df['pickup_latitude']
x2, y2 = df['dropoff_longitude'], df['dropoff_latitude']

# 'l1' is the Manhattan (L1) distance |x1 - x2| + |y1 - y2| between the two
# points, expressed in the same units as the coordinate columns.
lon_gap = psabs(x1 - x2)
lat_gap = psabs(y1 - y2)
df = df.withColumn('l1', lon_gap + lat_gap)

In [None]:
# Coordinates, passenger count and calendar features, plus the L1 trip distance.
inputCols = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
    'year', 'month', 'dayofweek', 'hour',
    'l1',
]
assembler = VectorAssembler(outputCol='featuresCleanWithDatel1', inputCols=inputCols)
dataset = assembler.transform(df)

In [None]:
# Refit with the L1 distance included in the feature vector.
lr = LinearRegression(labelCol='fare_amount', featuresCol='featuresCleanWithDatel1')
model = lr.fit(dataset)
# In-sample evaluation on the training rows.
summary = model.evaluate(dataset)

In [None]:
# r2 of the model that includes the 'l1' distance feature.
summary.r2

In [None]:
# Hold out roughly one third of the rows for testing. randomSplit normalizes the
# weights, so [0.66, 0.33] behaves as a 2:1 split. A fixed seed keeps the split,
# and therefore the reported test r2, repeatable across runs.
SPLIT_SEED = 42
train, test = df.randomSplit([0.66, 0.33], seed=SPLIT_SEED)

In [None]:
# Apply the most recently defined assembler to each split so both carry the
# feature vector column.
trainDataset, testDataset = (assembler.transform(part) for part in (train, test))

In [None]:
# Fit on the training split only, then score on the held-out test split so the
# resulting metrics are out-of-sample.
lr = LinearRegression(labelCol='fare_amount', featuresCol='featuresCleanWithDatel1')
model = lr.fit(trainDataset)
summary = model.evaluate(testDataset)

In [None]:
# r2 on the held-out test split.
summary.r2