In [0]:
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from functools import reduce
from pyspark.sql.functions import *
from pyspark.sql import functions as F
import os
from pyspark.sql import *
from pyspark.sql.functions import col, sum

In [0]:
%run "/Users/vaishnavi.balureddy2@cognizant.com/Uber Pick Ups/Formatting Silver Dataframes"

In [0]:
# Target schema shared by every source table once they are combined.
# Each (name, type) pair below becomes a nullable StructField, in this order.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Date", StringType()),
        ("Time", StringType()),
        ("Lat", DoubleType()),
        ("Lon", DoubleType()),
        ("Dispatching_Base_Num", StringType()),
        ("Category", StringType()),
        ("Pickup_Address", StringType()),
        ("Active_Vehicles", IntegerType()),
        ("Trips", IntegerType()),
        ("Base_Name", StringType()),
        ("Dropoff_Address", StringType()),
        ("Routing_Details", StringType()),
        ("Status", StringType()),
    )
])

In [0]:
# Combine every per-table DataFrame in `dfs` into a single DataFrame that
# follows `schema`.
# NOTE(review): `dfs` is not defined in the cells visible here; presumably it
# is the {table_name: DataFrame} mapping produced by the %run'd formatting
# notebook - confirm.

# Derive the expected column names from the schema so the two cannot drift apart.
all_columns = set(schema.fieldNames())

# Start from an empty, correctly typed DataFrame and append each table to it.
combined_df = spark.createDataFrame([], schema=schema)

for table_name, df in dfs.items():
    # Normalise the column names first (remove all whitespace and dots) and
    # actually apply them, so that a name such as "Date " is recognised as
    # "Date" instead of being reported as missing.
    cleaned_names = [
        "".join(column_name.split()).replace(".", "")
        for column_name in df.columns
    ]
    df = df.toDF(*cleaned_names)

    # Add the columns this table lacks as nulls of the schema's type.
    # The loop variable is deliberately not called `col`, which would shadow
    # pyspark.sql.functions.col for every later cell.
    for missing_name in sorted(all_columns - set(df.columns)):
        df = df.withColumn(
            missing_name, F.lit(None).cast(schema[missing_name].dataType)
        )

    try:
        combined_df = combined_df.unionByName(df)
    except Exception as e:
        # Report which table could not be merged and stop; combined_df then
        # holds only the tables merged before the failure.
        print(f"Error occurred with DataFrame: {table_name}")
        print(e)
        break

In [0]:
# Sanity check of the union: print the first five rows of the combined DataFrame.
combined_df.show(5)
# Total row count across the combined tables; as the cell's last expression
# its value is shown as the cell output.
combined_df.count()

+----------+-----+-------+--------+--------------------+-------------------+--------------+---------------+-----+---------+---------------+---------------+------+
|      Date| Time|    Lat|     Lon|Dispatching_Base_Num|           Category|Pickup_Address|Active_Vehicles|Trips|Base_Name|Dropoff_Address|Routing_Details|Status|
+----------+-----+-------+--------+--------------------+-------------------+--------------+---------------+-----+---------+---------------+---------------+------+
|2014-04-01|00:11| 40.769|-73.9549|              B02512|uber_raw_data_apr14|          null|           null| null|     null|           null|           null|  null|
|2014-04-01|00:17|40.7267|-74.0345|              B02512|uber_raw_data_apr14|          null|           null| null|     null|           null|           null|  null|
|2014-04-01|00:21|40.7316|-73.9873|              B02512|uber_raw_data_apr14|          null|           null| null|     null|           null|           null|  null|
|2014-04-01|00:28|40.7

In [0]:
# Set spark.sql.legacy.timeParserPolicy to LEGACY for this Spark session
# through a SQL SET command.
# NOTE(review): presumably needed so date/time parsing in later cells accepts
# the mixed formats found in the source tables - confirm it is still required.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

Out[94]: DataFrame[key: string, value: string]

In [0]:
# Keep a single row per distinct Category value (which row is kept is not
# controlled here), print those rows, then count how many categories exist.
unique_category_df = combined_df.dropDuplicates(["Category"])
unique_category_df.show()
unique_category_df.count()

+----------+-----------+--------+---------+--------------------+--------------------+--------------------+---------------+-----+-----------------+--------------------+--------------------+---------+
|      Date|       Time|     Lat|      Lon|Dispatching_Base_Num|            Category|      Pickup_Address|Active_Vehicles|Trips|        Base_Name|     Dropoff_Address|     Routing_Details|   Status|
+----------+-----------+--------+---------+--------------------+--------------------+--------------------+---------------+-----+-----------------+--------------------+--------------------+---------+
|01-01-2015|       null|    null|     null|              B02512|   Uber_Jan_Feb_FOIL|                null|            190| 1132|             null|                null|                null|     null|
|07-01-2014|12:00:00 AM|    null|     null|                null|other_American_B0...| 874 E 139th St M...|           null| null|             null|                null|                null|     null|
|07-0

In [0]:
# Rewrite the mixed Time strings as 24-hour "HH:mm:ss" text.
#
# The source tables carry both 24-hour values ("00:11") and 12-hour values
# with an AM/PM marker ("12:00:00 AM"). A format-less to_timestamp() is a
# plain cast, which is not expected to understand the AM/PM form, so 12-hour
# values are parsed with an explicit pattern; all other values keep the
# original cast-based parsing.
# NOTE(review): the 12-hour pattern assumes seconds are always present, as in
# the sample rows - confirm against the full data.
has_meridiem = F.upper(F.col("Time")).rlike(r"(AM|PM)\s*$")
parsed_time = (
    F.when(has_meridiem, F.to_timestamp("Time", "h:mm:ss a"))
    .otherwise(F.to_timestamp("Time"))
)

# date_format() yields the time-of-day text directly, instead of splitting an
# implicitly string-cast timestamp on the space. Selecting explicitly keeps
# every other column in place and puts the rebuilt Time column last, the same
# layout the previous drop/rename sequence produced.
Final = combined_df.select(
    *[name for name in combined_df.columns if name != "Time"],
    F.date_format(parsed_time, "HH:mm:ss").alias("Time"),
)

Final.show(5)


+----------+-------+--------+--------------------+-------------------+--------------+---------------+-----+---------+---------------+---------------+------+--------+
|      Date|    Lat|     Lon|Dispatching_Base_Num|           Category|Pickup_Address|Active_Vehicles|Trips|Base_Name|Dropoff_Address|Routing_Details|Status|    Time|
+----------+-------+--------+--------------------+-------------------+--------------+---------------+-----+---------+---------------+---------------+------+--------+
|2014-04-01| 40.769|-73.9549|              B02512|uber_raw_data_apr14|          null|           null| null|     null|           null|           null|  null|00:11:00|
|2014-04-01|40.7267|-74.0345|              B02512|uber_raw_data_apr14|          null|           null| null|     null|           null|           null|  null|00:17:00|
|2014-04-01|40.7316|-73.9873|              B02512|uber_raw_data_apr14|          null|           null| null|     null|           null|           null|  null|00:21:00|
|201

In [0]:
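# TODO: disabled draft for filling Base_Name from Dispatching_Base_Num.
# It cannot run as written: the loop unpacks (condition, new_name) pairs but
# the list holds bare conditions, the compared values are still empty strings,
# and it reads from `df` while assigning to `Final`.
# Base_Condition = [(df["Dispatching_Base_Num"] == ""),
#                   (df["Dispatching_Base_Num"] == ""),]
# for condition, new_name in Base_Condition:
#     Final = Final.withColumn("Base_Name",
#                          when(condition, new_name).otherwise(df["Base_Name"]))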