In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName('Project') \
        .config("spark.master", "yarn") \
        .config("spark.executor.instances", "4") \
        .config("spark.executor.cores", "1") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()

23/12/27 23:20:22 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


## Ζητούμενο 2

In [13]:
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType


# Read the CSVs file into a DataFrames
df1 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2010_to_2019.csv', header=True, inferSchema=True)
df2 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2020_to_Present.csv', header=True, inferSchema=True)

df = df1.union(df2)

df = df.withColumn("Date Rptd", to_date(col("Date Rptd"), 'MM/dd/yyyy hh:mm:ss a'))
df = df.withColumn("DATE OCC", to_date(col("DATE OCC"), 'MM/dd/yyyy hh:mm:ss a'))

                                                                                

In [16]:
print("Number of rows in the DataFrame:")
df.count()

Number of rows in the DataFrame:


                                                                                

2988445

# Ζητούμενο 3

## DataFrame API


In [57]:
from pyspark.sql.functions import year, month, count, row_number
from pyspark.sql import Window

date_rptd = df.select('Date Rptd')
date_rptd = date_rptd.withColumn("Year", year("Date Rptd")).withColumn("Month", month("Date Rptd")).drop("Date Rptd")


crime_total = date_rptd.groupBy("Year", "Month").agg(count("*").alias("crime_total"))

# Define a window specification to partition by the "Year" column and order by the "crime_total" column
window_spec = Window().partitionBy("Year").orderBy(col("crime_total").desc())

# Use the row_number function to assign row numbers within each group
df_sorted = crime_total.withColumn("row_number", row_number().over(window_spec))

# Filter to keep only the top three within each group
df_top_three_DF = df_sorted.filter(col("row_number") <= 3)

df_top_three_DF.show(truncate=False)



+----+-----+-----------+----------+
|Year|Month|crime_total|row_number|
+----+-----+-----------+----------+
|2010|3    |17595      |1         |
|2010|7    |17520      |2         |
|2010|5    |17338      |3         |
|2011|8    |17139      |1         |
|2011|5    |17050      |2         |
|2011|3    |16951      |3         |
|2012|8    |17696      |1         |
|2012|10   |17477      |2         |
|2012|5    |17391      |3         |
|2013|8    |17329      |1         |
|2013|7    |16714      |2         |
|2013|5    |16671      |3         |
|2014|10   |12789      |1         |
|2014|7    |12696      |2         |
|2014|9    |12498      |3         |
|2015|8    |18951      |1         |
|2015|10   |18916      |2         |
|2015|7    |18528      |3         |
|2016|8    |19779      |1         |
|2016|10   |19615      |2         |
+----+-----+-----------+----------+
only showing top 20 rows



                                                                                

In [21]:
# Save the DataFrame to a CSV file
df_top_three_DF \
  .coalesce(1) \
  .write \
  .mode('overwrite') \
  .option('header', 'true') \
  .csv('results/q1Dt.csv')
# df_top_three_DF.write.csv("results/q1Dt.csv", header=True,  mode="overwrite")

import subprocess

hdfs_path = "hdfs://okeanos-master:54310/user/user/results/q1Dt.csv"
local_path = "/home/user/Project/results/"

subprocess.run(["hadoop", "fs", "-copyToLocal", hdfs_path, local_path])

                                                                                

CompletedProcess(args=['hadoop', 'fs', '-copyToLocal', 'hdfs://okeanos-master:54310/user/user/results/q1Dt.csv', '/home/user/Project/results/'], returncode=0)

## SQL API

In [14]:
# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("crime_data")

# Write the SQL query
sql_query = """
    SELECT Year, Month, crime_total, row_number
    FROM (
        SELECT Year, Month, crime_total,
               ROW_NUMBER() OVER (PARTITION BY Year ORDER BY crime_total DESC) AS row_number
        FROM (
            SELECT YEAR(`Date Rptd`) AS Year, MONTH(`Date Rptd`) AS Month, COUNT(*) AS crime_total
            FROM crime_data
            GROUP BY Year, Month
        ) tmp
    ) tmp2
    WHERE row_number <= 3
"""

# Execute the SQL query
df_top_three_sql = spark.sql(sql_query)

# Show the result
df_top_three_sql.show(truncate=False)



+----+-----+-----------+----------+
|Year|Month|crime_total|row_number|
+----+-----+-----------+----------+
|2010|3    |17595      |1         |
|2010|7    |17520      |2         |
|2010|5    |17338      |3         |
|2011|8    |17139      |1         |
|2011|5    |17050      |2         |
|2011|3    |16951      |3         |
|2012|8    |17696      |1         |
|2012|10   |17477      |2         |
|2012|5    |17391      |3         |
|2013|8    |17329      |1         |
|2013|7    |16714      |2         |
|2013|5    |16671      |3         |
|2014|10   |12789      |1         |
|2014|7    |12696      |2         |
|2014|9    |12498      |3         |
|2015|8    |18951      |1         |
|2015|10   |18916      |2         |
|2015|7    |18528      |3         |
|2016|8    |19779      |1         |
|2016|10   |19615      |2         |
+----+-----+-----------+----------+
only showing top 20 rows



                                                                                

In [22]:
# Save the DataFrame to a CSV file
df_top_three_sql.write.csv("results/q1SQL.csv", header=True,  mode="overwrite")

import subprocess

hdfs_path = "hdfs://okeanos-master:54310/user/user/results/q1SQL.csv"
local_path = "/home/user/Project/results/"

subprocess.run(["hadoop", "fs", "-copyToLocal", hdfs_path, local_path])

                                                                                

CompletedProcess(args=['hadoop', 'fs', '-copyToLocal', 'hdfs://okeanos-master:54310/user/user/results/q1SQL.csv', '/home/user/Project/results/'], returncode=0)

In [23]:
is_same = df_top_three_DF.exceptAll(df_top_three_sql).count() == 0
if is_same:
    print("The DataFrames are identical.")
else:
    print("The DataFrames are different.")



The DataFrames are identical.


                                                                                

# Ζητούμενο 4

## DataFrame API

In [13]:
from pyspark.sql.functions import col, unix_timestamp, from_unixtime, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Read the CSVs file into a DataFrames
df1 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2010_to_2019.csv', header=True, inferSchema=False).select("TIME OCC","Premis Cd")
df2 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2020_to_Present.csv', header=True, inferSchema=False).select("TIME OCC","Premis Cd")

df = df1.union(df2)


# Convert the 'TIME OCC' column to a timestamp
df = df.withColumn(
    "TIME OCC",
    from_unixtime(unix_timestamp(col("TIME OCC"), "HHmm")).cast("timestamp")
)

df = df.withColumn(
    "TIME OCC",
    date_format(col("TIME OCC").cast("timestamp"), "HH:mm:ss")
)

df = df.withColumn(
    "Premis Cd",
    col("Premis Cd").cast("int"))

df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+--------+---------+
|TIME OCC|Premis Cd|
+--------+---------+
|13:50:00|      501|
|00:45:00|      101|
|15:15:00|      103|
|01:50:00|      101|
|21:00:00|      103|
|16:50:00|      404|
|20:05:00|      101|
|21:00:00|      710|
|02:30:00|      108|
|21:00:00|      710|
|14:45:00|      101|
|20:00:00|      101|
|02:45:00|      102|
|17:45:00|      738|
|20:30:00|      102|
|17:35:00|      103|
|12:25:00|      502|
|11:00:00|      101|
|20:00:00|      502|
|18:20:00|      102|
+--------+---------+
only showing top 20 rows



                                                                                

In [14]:
from pyspark.sql.functions import col, when, sum

filtered_df = df.filter(col("Premis Cd") == 101).select("TIME OCC")

# Define time intervals
morning_interval = ((col("TIME OCC") >= "05:00:00") & (col("TIME OCC") < "12:00:00"))
afternoon_interval = ((col("TIME OCC") >= "12:00:00") & (col("TIME OCC") < "17:00:00"))
evening_interval = ((col("TIME OCC") >= "17:00:00") & (col("TIME OCC") < "21:00:00"))
night_interval = ((col("TIME OCC") >= "21:00:00") | (col("TIME OCC") < "05:00:00"))

# Apply conditions and sum within each interval
result_df = filtered_df.groupBy().agg(
    sum(when(morning_interval, 1).otherwise(0)).alias("Morning"),
    sum(when(afternoon_interval, 1).otherwise(0)).alias("Afternoon"),
    sum(when(evening_interval, 1).otherwise(0)).alias("Evening"),
    sum(when(night_interval, 1).otherwise(0)).alias("Night")
)

# Show the result
result_df.show(truncate=False)




+-------+---------+-------+------+
|Morning|Afternoon|Evening|Night |
+-------+---------+-------+------+
|123748 |148077   |186896 |237137|
+-------+---------+-------+------+



                                                                                

In [16]:
df.count()
#690

                                                                                

2988445

In [None]:
[('Afternoon', 126476), ('Night', 205687), ('Morning', 107927), ('Evening', 165672)]

## RDD API

In [17]:
spark.stop()

In [18]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("RDD query") \
    .getOrCreate() \
    .sparkContext

23/12/27 23:23:05 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [34]:
import datetime
import csv

# Load the first CSV file into an RDD
rdd1 = spark.textFile("hdfs://okeanos-master:54310/user/project/Crime_Data_from_2010_to_2019.csv") \
    .map(lambda x: next(csv.reader([x])))
    
header1 = rdd1.first()
rdd1 = rdd1.filter(lambda row: row != header1)

# Load the second CSV file into an RDD
rdd2 = spark.textFile("hdfs://okeanos-master:54310/user/project/Crime_Data_from_2020_to_Present.csv") \
    .map(lambda x: next(csv.reader([x])))

    
header2 = rdd2.first()  
rdd2 = rdd2.filter(lambda row: row != header2)

# Merge the two RDDs
rdd = rdd1.union(rdd2)
rdd = rdd.map(lambda col: (col[3], col[14]))




filtered_rdd = rdd.filter(lambda row: (row[1] == '101') or (row[1] == 101) )

def get_interval(time_occ):
    # Convert the time_occ to a datetime object for easier comparison
    time_object = datetime.datetime.strptime(time_occ, "%H%M")

    if datetime.time(5, 0) <= time_object.time() < datetime.time(12, 0):
        return "Morning"
    elif datetime.time(12, 0) <= time_object.time() < datetime.time(17, 0):
        return "Afternoon"
    elif datetime.time(17, 0) <= time_object.time() < datetime.time(21, 0):
        return "Evening"
    elif (datetime.time(21, 0) <= time_object.time()) or  (time_object.time() < datetime.time(5, 0)):
        return "Night"

    
# Map each row to a tuple of (interval, 1)
mapped_rdd = filtered_rdd.map(lambda col: (get_interval(col[0]), 1))

# Reduce by key to sum occurrences within each interval
result_rdd = mapped_rdd.reduceByKey(lambda x, y: x + y)

print(result_rdd.collect())



[('Afternoon', 148077), ('Night', 237137), ('Morning', 123748), ('Evening', 186896)]


                                                                                

# Ζητούμενο 5

In [121]:
zipcodes = spark.read.csv('hdfs://okeanos-master:54310/user/project/income/LA_income_2015.csv', header=True, inferSchema=True)

23/12/27 01:17:34 WARN BlockManager: Putting block broadcast_182_piece0 failed due to exception java.nio.file.NoSuchFileException: /tmp/blockmgr-66f8f9b6-de16-4e6c-9872-80e42fc18e89/14.
23/12/27 01:17:34 WARN BlockManager: Block broadcast_182_piece0 was not removed normally.
23/12/27 01:17:34 ERROR TorrentBroadcast: Store broadcast broadcast_182 fail, remove all pieces of the broadcast


Py4JJavaError: An error occurred while calling o675.csv.
: java.nio.file.NoSuchFileException: /tmp/blockmgr-66f8f9b6-de16-4e6c-9872-80e42fc18e89/14
	at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:116)
	at java.base/sun.nio.fs.UnixFileSystemProvider.createDirectory(UnixFileSystemProvider.java:389)
	at java.base/java.nio.file.Files.createDirectory(Files.java:690)
	at org.apache.spark.storage.DiskBlockManager.getFile(DiskBlockManager.scala:108)
	at org.apache.spark.storage.DiskBlockManager.containsBlock(DiskBlockManager.scala:157)
	at org.apache.spark.storage.DiskStore.contains(DiskStore.scala:154)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$getCurrentBlockStatus(BlockManager.scala:885)
	at org.apache.spark.storage.BlockManager.removeBlockInternal(BlockManager.scala:2075)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1555)
	at org.apache.spark.storage.BlockManager$BlockStoreUpdater.save(BlockManager.scala:380)
	at org.apache.spark.storage.BlockManager.putBytes(BlockManager.scala:1468)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$writeBlocks$1(TorrentBroadcast.scala:171)
	at org.apache.spark.broadcast.TorrentBroadcast.$anonfun$writeBlocks$1$adapted(TorrentBroadcast.scala:165)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:165)
	at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:99)
	at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:38)
	at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:78)
	at org.apache.spark.SparkContext.broadcastInternal(SparkContext.scala:1662)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1644)
	at org.apache.spark.sql.execution.datasources.text.TextFileFormat.buildReader(TextFileFormat.scala:106)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:138)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:129)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:346)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:548)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:537)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:575)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:242)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:364)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:498)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.infer(CSVDataSource.scala:111)
	at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:64)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:62)
	at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:208)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:205)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:407)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:538)
	at jdk.internal.reflect.GeneratedMethodAccessor86.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [72]:
zipcodes.show()

23/12/27 01:16:30 WARN BlockManager: Putting block broadcast_130 failed due to exception java.nio.file.NoSuchFileException: /tmp/blockmgr-66f8f9b6-de16-4e6c-9872-80e42fc18e89/30.
23/12/27 01:16:30 WARN BlockManager: Block broadcast_130 was not removed normally.


Py4JJavaError: An error occurred while calling o347.showString.
: java.nio.file.NoSuchFileException: /tmp/blockmgr-66f8f9b6-de16-4e6c-9872-80e42fc18e89/30
	at java.base/sun.nio.fs.UnixException.translateToIOException(UnixException.java:92)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:116)
	at java.base/sun.nio.fs.UnixFileSystemProvider.createDirectory(UnixFileSystemProvider.java:389)
	at java.base/java.nio.file.Files.createDirectory(Files.java:690)
	at org.apache.spark.storage.DiskBlockManager.getFile(DiskBlockManager.scala:108)
	at org.apache.spark.storage.DiskStore.remove(DiskStore.scala:132)
	at org.apache.spark.storage.BlockManager.removeBlockInternal(BlockManager.scala:2080)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1555)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1592)
	at org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:1425)
	at org.apache.spark.storage.BlockManager.putSingle(BlockManager.scala:1928)
	at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:154)
	at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:99)
	at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:38)
	at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:78)
	at org.apache.spark.SparkContext.broadcastInternal(SparkContext.scala:1662)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1644)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:102)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:138)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:129)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:346)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:548)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:537)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:575)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:364)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:498)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at jdk.internal.reflect.GeneratedMethodAccessor75.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [22]:
geocoding = spark.read.csv('hdfs://okeanos-master:54310/user/project/revgecoding.csv', header=True, inferSchema=True)

In [37]:
temp = df.join(geocoding, (df.LAT == geocoding.LAT) & (df.LON == df.LON))

23/12/27 01:12:25 WARN Column: Constructing trivially true equals predicate, 'LON#44 = LON#44'. Perhaps you need to use aliases.


In [38]:
temp = temp.select("Vict Descent","Zipcode")

In [40]:
temp.show()

+------------+-------+
|Vict Descent|Zipcode|
+------------+-------+
|           H|  90292|
|           H|  90292|
|           H|  90292|
|           H|  90096|
|           H|  90301|
|           H|  90302|
|           H|  90062|
|           H|  90003|
|           H|  90052|
|           W|  90045|
|           W|  90293|
|           W|  90045|
|           W|  90293|
|           W|  90293|
|           W|  90045|
|           W|  90293|
|           W|  90293|
|           W|  90293|
|           W|  90293|
|           W|  90045|
+------------+-------+
only showing top 20 rows



                                                                                