In [1]:
!./start_hadoop_spark.sh

Starting HDFS...


Starting namenodes on [okeanos-master]
Starting datanodes
Starting secondary namenodes [okeanos-master]
Starting YARN...
Starting resourcemanager
Starting nodemanagers
Starting Spark...
starting org.apache.spark.deploy.history.HistoryServer, logging to /home/user/opt/spark/logs/spark-user-org.apache.spark.deploy.history.HistoryServer-1-okeanos-master.out


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName('Project') \
        .config("spark.master", "yarn") \
        .config("spark.executor.instances", "4") \
        .config("spark.executor.cores", "1") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()
spark


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/24 17:49:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/24 17:49:52 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


## Ζητούμενο 2

In [55]:
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType


# Read the CSVs file into a DataFrames
df1 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2010_to_2019.csv', header=True, inferSchema=True)
df2 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2020_to_Present.csv', header=True, inferSchema=True)

df = df1.union(df2)

df = df.withColumn("Date Rptd", to_date(col("Date Rptd"), 'MM/dd/yyyy hh:mm:ss a'))
df = df.withColumn("DATE OCC", to_date(col("DATE OCC"), 'MM/dd/yyyy hh:mm:ss a'))

                                                                                

In [None]:
df.printSchema()

In [56]:
print("Number of rows in the DataFrame:")
df.count()

Number of rows in the DataFrame:


                                                                                

2988445

## Ζητούμενο 3

In [57]:
from pyspark.sql.functions import year, month, count, row_number
from pyspark.sql import Window

date_rptd = df.select('Date Rptd')
date_rptd = date_rptd.withColumn("Year", year("Date Rptd")).withColumn("Month", month("Date Rptd")).drop("Date Rptd")


crime_total = date_rptd.groupBy("Year", "Month").agg(count("*").alias("crime_total"))

# Define a window specification to partition by the "Year" column and order by the "crime_total" column
window_spec = Window().partitionBy("Year").orderBy(col("crime_total").desc())

# Use the row_number function to assign row numbers within each group
df_sorted = crime_total.withColumn("row_number", row_number().over(window_spec))

# Filter to keep only the top three within each group
df_top_three_DF = df_sorted.filter(col("row_number") <= 3)

df_top_three_DF.show(truncate=False)



+----+-----+-----------+----------+
|Year|Month|crime_total|row_number|
+----+-----+-----------+----------+
|2010|3    |17595      |1         |
|2010|7    |17520      |2         |
|2010|5    |17338      |3         |
|2011|8    |17139      |1         |
|2011|5    |17050      |2         |
|2011|3    |16951      |3         |
|2012|8    |17696      |1         |
|2012|10   |17477      |2         |
|2012|5    |17391      |3         |
|2013|8    |17329      |1         |
|2013|7    |16714      |2         |
|2013|5    |16671      |3         |
|2014|10   |12789      |1         |
|2014|7    |12696      |2         |
|2014|9    |12498      |3         |
|2015|8    |18951      |1         |
|2015|10   |18916      |2         |
|2015|7    |18528      |3         |
|2016|8    |19779      |1         |
|2016|10   |19615      |2         |
+----+-----+-----------+----------+
only showing top 20 rows



                                                                                

In [21]:
# Save the DataFrame to a CSV file
df_top_three_DF \
  .coalesce(1) \
  .write \
  .mode('overwrite') \
  .option('header', 'true') \
  .csv('results/q1Dt.csv')
# df_top_three_DF.write.csv("results/q1Dt.csv", header=True,  mode="overwrite")

import subprocess

hdfs_path = "hdfs://okeanos-master:54310/user/user/results/q1Dt.csv"
local_path = "/home/user/Project/results/"

subprocess.run(["hadoop", "fs", "-copyToLocal", hdfs_path, local_path])

                                                                                

CompletedProcess(args=['hadoop', 'fs', '-copyToLocal', 'hdfs://okeanos-master:54310/user/user/results/q1Dt.csv', '/home/user/Project/results/'], returncode=0)

In [14]:
# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("crime_data")

# Write the SQL query
sql_query = """
    SELECT Year, Month, crime_total, row_number
    FROM (
        SELECT Year, Month, crime_total,
               ROW_NUMBER() OVER (PARTITION BY Year ORDER BY crime_total DESC) AS row_number
        FROM (
            SELECT YEAR(`Date Rptd`) AS Year, MONTH(`Date Rptd`) AS Month, COUNT(*) AS crime_total
            FROM crime_data
            GROUP BY Year, Month
        ) tmp
    ) tmp2
    WHERE row_number <= 3
"""

# Execute the SQL query
df_top_three_sql = spark.sql(sql_query)

# Show the result
df_top_three_sql.show(truncate=False)



+----+-----+-----------+----------+
|Year|Month|crime_total|row_number|
+----+-----+-----------+----------+
|2010|3    |17595      |1         |
|2010|7    |17520      |2         |
|2010|5    |17338      |3         |
|2011|8    |17139      |1         |
|2011|5    |17050      |2         |
|2011|3    |16951      |3         |
|2012|8    |17696      |1         |
|2012|10   |17477      |2         |
|2012|5    |17391      |3         |
|2013|8    |17329      |1         |
|2013|7    |16714      |2         |
|2013|5    |16671      |3         |
|2014|10   |12789      |1         |
|2014|7    |12696      |2         |
|2014|9    |12498      |3         |
|2015|8    |18951      |1         |
|2015|10   |18916      |2         |
|2015|7    |18528      |3         |
|2016|8    |19779      |1         |
|2016|10   |19615      |2         |
+----+-----+-----------+----------+
only showing top 20 rows



                                                                                

In [22]:
# Save the DataFrame to a CSV file
df_top_three_sql.write.csv("results/q1SQL.csv", header=True,  mode="overwrite")

import subprocess

hdfs_path = "hdfs://okeanos-master:54310/user/user/results/q1SQL.csv"
local_path = "/home/user/Project/results/"

subprocess.run(["hadoop", "fs", "-copyToLocal", hdfs_path, local_path])

                                                                                

CompletedProcess(args=['hadoop', 'fs', '-copyToLocal', 'hdfs://okeanos-master:54310/user/user/results/q1SQL.csv', '/home/user/Project/results/'], returncode=0)

In [23]:
is_same = df_top_three_DF.exceptAll(df_top_three_sql).count() == 0
if is_same:
    print("The DataFrames are identical.")
else:
    print("The DataFrames are different.")



The DataFrames are identical.


                                                                                

# Ζητούμενο 4

In [74]:
from pyspark.sql.functions import col, unix_timestamp, from_unixtime, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Read the CSVs file into a DataFrames
df1 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2010_to_2019.csv', header=True, inferSchema=False).select("TIME OCC","Premis Cd")
df2 = spark.read.csv('hdfs://okeanos-master:54310/user/project/Crime_Data_from_2020_to_Present.csv', header=True, inferSchema=False).select("TIME OCC","Premis Cd")

df = df1.union(df2)


# Convert the 'TIME OCC' column to a timestamp
df = df.withColumn(
    "TIME OCC",
    from_unixtime(unix_timestamp(col("TIME OCC"), "HHmm")).cast("timestamp")
)

df = df.withColumn(
    "TIME OCC",
    date_format(col("TIME OCC").cast("timestamp"), "HH:mm:ss")
)

df = df.withColumn(
    "Premis Cd",
    col("Premis Cd").cast("int"))

df.show()

+--------+---------+
|TIME OCC|Premis Cd|
+--------+---------+
|13:50:00|      501|
|00:45:00|      101|
|15:15:00|      103|
|01:50:00|      101|
|21:00:00|      103|
|16:50:00|      404|
|20:05:00|      101|
|21:00:00|      710|
|02:30:00|      108|
|21:00:00|      710|
|14:45:00|      101|
|20:00:00|      101|
|02:45:00|      102|
|17:45:00|      738|
|20:30:00|      102|
|17:35:00|      103|
|12:25:00|      502|
|11:00:00|      101|
|20:00:00|      502|
|18:20:00|      102|
+--------+---------+
only showing top 20 rows



In [79]:
from pyspark.sql.functions import col, when, sum

filtered_df = df.filter(col("Premis Cd") == 101).select("TIME OCC")

# Define time intervals
morning_interval = ((col("TIME OCC") >= "05:00:00") & (col("TIME OCC") < "12:00:00"))
afternoon_interval = ((col("TIME OCC") >= "12:00:00") & (col("TIME OCC") < "17:00:00"))
evening_interval = ((col("TIME OCC") >= "17:00:00") & (col("TIME OCC") < "21:00:00"))
night_interval = ((col("TIME OCC") >= "21:00:00") | (col("TIME OCC") < "04:00:00"))

# Apply conditions and sum within each interval
result_df = df.groupBy().agg(
    sum(when(morning_interval, 1).otherwise(0)).alias("Morning"),
    sum(when(afternoon_interval, 1).otherwise(0)).alias("Afternoon"),
    sum(when(evening_interval, 1).otherwise(0)).alias("Evening"),
    sum(when(night_interval, 1).otherwise(0)).alias("Night")
)

# Show the result
result_df.show(truncate=False)




+-------+---------+-------+------+
|Morning|Afternoon|Evening|Night |
+-------+---------+-------+------+
|659301 |813370   |684082 |785595|
+-------+---------+-------+------+



                                                                                