In [1]:
#!/usr/bin/python

# Initialization: point this kernel at the local Spark installation.
import os
import sys

spark_home = "/home/talentum/spark"
pylib_dir = spark_home + "/python/lib"

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pylib_dir,
    # Use /usr/bin/python2.7 for the two interpreter entries below if you want Python 2.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Every archive is inserted at position 0, so pyspark.zip ends up ahead of
# the py4j archive on the import path.
for archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, pylib_dir + archive)

# NOTE: List every extra package you want after --packages, comma-separated.
# Other package sets that have been used here:
#   '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
#   '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
#   '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [2]:
# Entry point (Spark 2.x): build, or reuse, a Hive-enabled session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("TaxiFarePrediction")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the SparkContext behind the session
sc = spark.sparkContext

In [3]:
# Read the 2020 trips CSV; the first line is the header and column types are inferred
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Final_Project/2020_taxi_trips.csv")
)

# Preview the loaded DataFrame
df.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+---------+--------------------+
|       2| 2020-01-01 06:47:28|  2020-01-01 06:52:54|                 N|         1|          74|          75|              1|         1.47|        6.5|  0.0|    0.5|       0.0|         0.0|    

In [4]:
# Print the column names and the types inferred while reading the CSV
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [5]:
# (rows, columns) of the raw DataFrame; count() triggers a full pass over the data
shape = (df.count(), len(df.columns))
row_count, column_count = shape
print('Shape: ', shape)

Shape:  (1734051, 19)


# Dropping irrelevant columns, nulls and invalid rows

In [6]:
from pyspark.sql.functions import col

# Columns excluded from the rest of the notebook
columns_to_drop = [
    'VendorID',
    'passenger_count',
    'store_and_fwd_flag',
    'RatecodeID',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'payment_type',
    'trip_type',
    'congestion_surcharge',
]

# New DataFrame without those columns; the row count is printed as a sanity check
df_new = df.drop(*columns_to_drop)
print(df_new.count())

1734051


In [7]:
from pyspark.sql.functions import col, isnan

# Remove rows that have a null in any of the remaining columns.
df_new = df_new.na.drop()

# Remove rows whose total_amount is NaN. isnan() states the intent directly
# instead of comparing the double column with the string "NaN", which only
# worked through an implicit string-to-double cast.
df_new = df_new.filter(~isnan(col("total_amount")))

In [8]:
# Import the Spark aggregates under aliases: a plain `import min, max` would
# replace Python's built-in min()/max() for every later cell in this kernel.
from pyspark.sql.functions import min as spark_min, max as spark_max

# Smallest and largest total_amount after the null/NaN clean-up
df_new.select(
    spark_min("total_amount").alias("min_total"),
    spark_max("total_amount").alias("max_total"),
).show()

+---------+---------+
|min_total|max_total|
+---------+---------+
|   -210.3|    803.8|
+---------+---------+



In [9]:
# Rows whose total_amount is zero or negative
negative_total_amount_df = df_new.where(df_new["total_amount"] <= 0)

# Number of such rows
amount_negative = negative_total_amount_df.count()

print("Number of rows with total_amount <= 0:", amount_negative)

Number of rows with total_amount <= 0: 8639


In [10]:
# Rows whose trip_distance is zero or negative
negative_trip_distance_df = df_new.where(df_new["trip_distance"] <= 0)

# Number of such rows
distance_negative = negative_trip_distance_df.count()

print("Number of rows with trip_distance <= 0:", distance_negative)

Number of rows with trip_distance <= 0: 65297


In [11]:
# Keep only trips with a strictly positive total amount and distance
positive_total = df_new['total_amount'] > 0
positive_distance = df_new['trip_distance'] > 0
df_new = df_new.filter(positive_total & positive_distance)

In [12]:
# Drop trips that start or end in one of the excluded location IDs.
# NOTE(review): 264 and 265 are presumably placeholder/unknown zones - confirm
# against the zone lookup table used with this dataset.
excluded_location_ids = [264, 265]
pickup_allowed = ~df_new['PULocationID'].isin(excluded_location_ids)
dropoff_allowed = ~df_new['DOLocationID'].isin(excluded_location_ids)
df_new = df_new.filter(pickup_allowed & dropoff_allowed)

In [13]:
# Row count after the total_amount, trip_distance and location-ID filters
df_new.count()

1652872

# Creating time category

In [14]:
from pyspark.sql.functions import hour, dayofmonth, month, when, col

# Calendar parts of the pickup timestamp
pickup_ts = col("lpep_pickup_datetime")
df_new = (
    df_new
    .withColumn("hour", hour(pickup_ts))
    .withColumn("day", dayofmonth(pickup_ts))
    .withColumn("month", month(pickup_ts))
)

# Parts of the day as (first hour inclusive, last hour exclusive, label, code).
# Any hour not covered by these bins falls through to "Night" / 5.
day_parts = [
    (0, 6, "Early Morning", 1),
    (6, 12, "Morning", 2),
    (12, 16, "Afternoon", 3),
    (16, 20, "Evening", 4),
]

# Build the label and the numeric-code expressions side by side from the same bins
label_expr = None
code_expr = None
for start_hour, end_hour, part_label, part_code in day_parts:
    in_part = (col("hour") >= start_hour) & (col("hour") < end_hour)
    if label_expr is None:
        label_expr = when(in_part, part_label)
        code_expr = when(in_part, part_code)
    else:
        label_expr = label_expr.when(in_part, part_label)
        code_expr = code_expr.when(in_part, part_code)

# Text label for the part of the day
df_new = df_new.withColumn("time_category", label_expr.otherwise("Night"))

# Integer encoding of the same category
df_new = df_new.withColumn("time_category_encoded", code_expr.otherwise(5))



In [15]:
# Sanity check: number of rows where the encoded time category is null
df_new.filter(col("time_category_encoded").isNull()).count()

0

In [16]:
# Keep only the columns that go into the exported dataset, in this order
kept_columns = [
    'time_category_encoded',
    'month',
    'day',
    'PULocationID',
    'DOLocationID',
    'trip_distance',
    'total_amount',
]
df_new = df_new.select(*kept_columns)

In [17]:
# Sort ascending by month, then day, then encoded time category
sort_keys = ["month", "day", "time_category_encoded"]
df_new = df_new.sort(*sort_keys)

In [18]:
# Preview the first rows of the selected, sorted columns
df_new.show()

+---------------------+-----+---+------------+------------+-------------+------------+
|time_category_encoded|month|day|PULocationID|DOLocationID|trip_distance|total_amount|
+---------------------+-----+---+------------+------------+-------------+------------+
|                    1|    1|  1|          65|          40|         1.42|        9.36|
|                    1|    1|  1|          82|         129|         1.06|        9.36|
|                    1|    1|  1|          41|         238|         1.56|        9.32|
|                    1|    1|  1|          82|          82|         0.95|        8.16|
|                    1|    1|  1|          95|          95|         0.61|        7.88|
|                    1|    1|  1|          97|          33|         1.47|        11.0|
|                    1|    1|  1|          25|          25|         0.61|        8.19|
|                    1|    1|  1|          82|         157|         1.49|       10.56|
|                    1|    1|  1|          

In [19]:
# Row count after column selection and sorting
df_new.count()

1652872

In [20]:
import pandas as pd

In [21]:
# Collect the Spark DataFrame into a pandas DataFrame; toPandas() loads every
# row into the driver's memory, so this relies on the result fitting there.
preprocessed_data = df_new.toPandas()

In [22]:
# Show the first rows of the pandas DataFrame
preprocessed_data.head()

Unnamed: 0,time_category_encoded,month,day,PULocationID,DOLocationID,trip_distance,total_amount
0,1,1,1,74,74,0.46,4.8
1,1,1,1,74,69,1.88,9.8
2,1,1,1,41,41,0.76,7.8
3,1,1,1,49,181,2.24,11.8
4,1,1,1,244,244,0.6,6.3


In [23]:
# Write the pandas DataFrame to CSV without the index column
output_csv_path = "/home/talentum/preprocessed_data_2020.csv"
preprocessed_data.to_csv(output_csv_path, index=False)