# 1. Data Engineering (DE) Spark

## Spark Preparation (Google Colab Only)

In [1]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; catching ImportError (instead of a
# bare `except:`) keeps KeyboardInterrupt and unrelated errors visible.
try:
    import google.colab  # noqa: F401  (imported only to probe for availability)
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [2]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
    !tar xf spark-3.3.2-bin-hadoop3.tgz
    !mv spark-3.3.2-bin-hadoop3 spark
    !pip install -q findspark
    import os
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark"

## Start a Local Cluster

In [3]:
import findspark
# Make the pyspark package importable (presumably via SPARK_HOME, which the
# Colab setup cell sets — confirm for non-Colab environments).
findspark.init()
# Spark master URL used when building the session below.
spark_url = 'local'

In [4]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, connected to `spark_url`.
spark = (
    SparkSession.builder
    .master(spark_url)
    .appName('Spark ML')
    .getOrCreate()
)

## Spark SQL Data Preparation

In [5]:
from pyspark import SparkFiles

# Data source: https://www.traffy.in.th/?page_id=27351
# The published dataset is updated every 3 hours.

# url = 'https://publicapi.traffy.in.th/dump-csv-chadchart/bangkok_traffy.csv'
# spark.sparkContext.addFile(url)

In [6]:
# Download a fixed snapshot of the dataset from Google Drive:
# https://drive.google.com/file/d/1Fd1R_ZUBlRGuYwx4AH52u1VKWEbJpV1L/view?usp=share_link
# The snapshot holds the data as of 12-May-2023 21:46.
!gdown 1Fd1R_ZUBlRGuYwx4AH52u1VKWEbJpV1L

Downloading...
From: https://drive.google.com/uc?id=1Fd1R_ZUBlRGuYwx4AH52u1VKWEbJpV1L
To: /content/bangkok_traffy.csv
100% 357M/357M [00:04<00:00, 81.0MB/s]


In [7]:
path = '/content/bangkok_traffy.csv'

# Read the CSV with a header row. Fields are double-quoted, may span several
# lines ("multiline"), and embedded quotes are escaped by doubling them.
# The original chain also set escape to "\\" first, but the later
# .option("escape", '"') overrode it, so only the effective value is kept.
df = (
    spark.read
    .option("delimiter", ",")
    .option("multiline", "true")
    .option("quote", '"')
    .option("header", "true")
    .option("escape", '"')
    .csv(path)
)
df.show()

# Alternative: read the file registered through spark.sparkContext.addFile(url)
# df = spark.read.option("delimiter", ",").option("multiline", "true").option("quote", '"').option("header", "true").option("escape", '"').csv("file://" + SparkFiles.get("bangkok_traffy.csv"))
# df.show()

+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------+----+------------+--------------------+
|  ticket_id|                type|        organization|             comment|               photo|         photo_after|            coords|             address|     subdistrict|         district|            province|           timestamp|         state|star|count_reopen|       last_activity|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------+----+------------+--------------------+
|2021-9LHDM6|                  {}|                null|            ไม่มีภาพ|https://storage.g...|                null|100.48661,13

In [8]:
# Number of rows parsed from the CSV.
df.count()

270632

In [9]:
# Inspect the column types as read (no schema was supplied to the reader).
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- photo: string (nullable = true)
 |-- photo_after: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- state: string (nullable = true)
 |-- star: string (nullable = true)
 |-- count_reopen: string (nullable = true)
 |-- last_activity: string (nullable = true)



In [10]:
# Drop columns that the rest of the notebook does not use
# (ticket identifier and the two photo columns).
cols = ['ticket_id','photo', 'photo_after']
df = df.drop(*cols)

## Convert to proper data types

In [11]:
from pyspark.sql.functions import col

# The numeric columns were read as strings; cast them to integers in place.
cols = ['star', 'count_reopen']
for column_name in cols:
    as_int = col(column_name).cast('int')
    df = df.withColumn(column_name, as_int)

In [12]:
# Both date columns were read as strings; cast them to Spark timestamps in place.
cols = ['timestamp', 'last_activity']
for column_name in cols:
    as_timestamp = col(column_name).cast('timestamp')
    df = df.withColumn(column_name, as_timestamp)

In [13]:
from pyspark.sql.functions import split, regexp_replace

# `type` looks like "{a,b}": strip the curly braces, then split on commas to
# get an array of strings.
cols = ['type']
for column_name in cols:
    without_braces = regexp_replace(col(column_name), "[{}]", "")
    df = df.withColumn(column_name, split(without_braces, ","))

In [14]:
# Comma-separated string columns become arrays of strings
# (the elements of `coords` stay strings; they are not cast to numbers here).
cols = ['organization', 'coords']
for column_name in cols:
    as_array = split(col(column_name), ",")
    df = df.withColumn(column_name, as_array)

In [15]:
# Confirm the schema after the casts and splits above.
df.printSchema()

root
 |-- type: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- organization: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- comment: string (nullable = true)
 |-- coords: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: integer (nullable = true)
 |-- count_reopen: integer (nullable = true)
 |-- last_activity: timestamp (nullable = true)



In [16]:
# Preview the converted rows.
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------+----+------------+--------------------+
|                type|        organization|             comment|              coords|             address|     subdistrict|         district|            province|           timestamp|         state|star|count_reopen|       last_activity|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------+----+------------+--------------------+
|                  []|                null|            ไม่มีภาพ|[100.48661, 13.79...|1867 จรัญสนิทวงศ์...|         บางพลัด|          บางพลัด|       กรุงเทพมหานคร|2021-09-01 10:44:...|กำลังดำเนินการ|null|        null|2022-02-22 04:59:...|
|         [ความสะอาด]|        [เขตบางซื่อ]|     

## Drop null

In [17]:
# Count null values per column.
import pyspark.sql.functions as F

# when(<is null>, c) yields a non-null literal (the column name as a string)
# only for null cells and null otherwise, so count() tallies exactly the nulls.
null_counters = [
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
]
df_agg = df.agg(*null_counters)

In [18]:
# One row: the number of nulls in each column.
df_agg.show()

+----+------------+-------+------+-------+-----------+--------+--------+---------+-----+------+------------+-------------+
|type|organization|comment|coords|address|subdistrict|district|province|timestamp|state|  star|count_reopen|last_activity|
+----+------------+-------+------+-------+-----------+--------+--------+---------+-----+------+------------+-------------+
|  97|        1002|   2648|     0|   2648|         72|      74|      25|        0|    0|171624|      117695|            0|
+----+------------+-------+------+-------+-----------+--------+--------+---------+-----+------+------------+-------------+



In [19]:
# Show any rows whose last_activity is missing.
df.where(col("last_activity").isNull()).show()

+----+------------+-------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
|type|organization|comment|coords|address|subdistrict|district|province|timestamp|state|star|count_reopen|last_activity|
+----+------------+-------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
+----+------------+-------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+



In [20]:
# Keep only rows that have a last_activity value.
df = df.dropna(subset=["last_activity"])

In [21]:
# Row count after dropping rows with a null last_activity.
df.count()

270632

## Calculate time_to_state

In [22]:
from pyspark.sql.functions import expr

# time_to_state = elapsed time between ticket creation (`timestamp`) and its
# latest activity (`last_activity`).
elapsed = col("last_activity") - col("timestamp")
df = df.withColumn("time_to_state", elapsed)
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------+----+------------+--------------------+--------------------+
|                type|        organization|             comment|              coords|             address|     subdistrict|         district|            province|           timestamp|         state|star|count_reopen|       last_activity|       time_to_state|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------+----+------------+--------------------+--------------------+
|                  []|                null|            ไม่มีภาพ|[100.48661, 13.79...|1867 จรัญสนิทวงศ์...|         บางพลัด|          บางพลัด|       กรุงเทพมหานคร|2021-09-01 10:44:...|กำลังดำเนินการ|null|        null|2022-02

## (Run every cell above this point; the cells below are optional)

## Visualization starts here

## Focus on state

In [23]:
# Number of tickets in each state.
tickets_by_state = df.groupBy('state')
df_state_1 = tickets_by_state.count()
df_state_1.show()

+--------------+------+
|         state| count|
+--------------+------+
|   รอรับเรื่อง| 18789|
|กำลังดำเนินการ| 49452|
|     เสร็จสิ้น|202391|
+--------------+------+



In [24]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_state_1.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/state_1")

In [25]:
# NOTE: these names shadow Python's builtin min/max for the rest of the notebook.
from pyspark.sql.functions import min, avg, max

# Shortest, average and longest time_to_state for each state.
state_duration_stats = [
    min('time_to_state').alias('min_time_to_state'),
    avg('time_to_state').alias('avg_time_to_state'),
    max('time_to_state').alias('max_time_to_state'),
]
df_state_2 = df.groupby('state').agg(*state_duration_stats)
df_state_2.show(n=20, truncate=False)

+--------------+-------------------------------------------+-------------------------------------------+--------------------------------------------+
|state         |min_time_to_state                          |avg_time_to_state                          |max_time_to_state                           |
+--------------+-------------------------------------------+-------------------------------------------+--------------------------------------------+
|รอรับเรื่อง   |INTERVAL '-0 00:00:00.072412' DAY TO SECOND|INTERVAL '11 08:59:13.817943' DAY TO SECOND|INTERVAL '336 11:49:58.788722' DAY TO SECOND|
|กำลังดำเนินการ|INTERVAL '0 00:00:20.096892' DAY TO SECOND |INTERVAL '85 11:27:45.573835' DAY TO SECOND|INTERVAL '490 15:06:39.707912' DAY TO SECOND|
|เสร็จสิ้น     |INTERVAL '0 00:00:14.963721' DAY TO SECOND |INTERVAL '43 15:18:58.701763' DAY TO SECOND|INTERVAL '464 10:21:28.527899' DAY TO SECOND|
+--------------+-------------------------------------------+----------------------------------------

In [26]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_state_2.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/state_2")

In [27]:
from pyspark.sql.functions import year, month

# Tickets created per calendar month, in chronological order.
created_year = year('timestamp').alias('year')
created_month = month('timestamp').alias('month')
df_state_3 = (
    df.groupBy(created_year, created_month)
    .count()
    .orderBy('year', 'month')
)
df_state_3.show()

+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2021|    9|   12|
|2021|   10|    1|
|2021|   11|    7|
|2021|   12|  162|
|2022|    1|  160|
|2022|    2|  172|
|2022|    3|  108|
|2022|    4|   24|
|2022|    5| 2215|
|2022|    6|61633|
|2022|    7|40880|
|2022|    8|27047|
|2022|    9|22327|
|2022|   10|13520|
|2022|   11|11481|
|2022|   12|11379|
|2023|    1|18591|
|2023|    2|18578|
|2023|    3|18211|
|2023|    4|17620|
+----+-----+-----+
only showing top 20 rows



In [28]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_state_3.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/state_3")

In [29]:
# Import every function this cell uses. min/avg/max were previously available
# only if the optional df_state_2 cell had been run first; without it, `avg` is
# undefined and `min`/`max` resolve to the Python builtins.
from pyspark.sql.functions import year, month, count, min, avg, max

# Per (year, month, state): row count plus min / avg / max time_to_state,
# ordered by state and then chronologically.
df_state_4 = df.groupBy(year('timestamp').alias('year'), month('timestamp').alias('month'), 'state')\
  .agg(count('time_to_state').alias('row_count'),
       min('time_to_state').alias('min_time_to_state'),
       avg('time_to_state').alias('avg_time_to_state'),
       max('time_to_state').alias('max_time_to_state'))\
  .orderBy('state','year', 'month')
df_state_4.show(100, False)

+----+-----+--------------+---------+--------------------------------------------+--------------------------------------------+--------------------------------------------+
|year|month|state         |row_count|min_time_to_state                           |avg_time_to_state                           |max_time_to_state                           |
+----+-----+--------------+---------+--------------------------------------------+--------------------------------------------+--------------------------------------------+
|2021|9    |กำลังดำเนินการ|9        |INTERVAL '146 05:28:39.035754' DAY TO SECOND|INTERVAL '155 04:47:42.703305' DAY TO SECOND|INTERVAL '173 18:15:03.269059' DAY TO SECOND|
|2021|11   |กำลังดำเนินการ|7        |INTERVAL '88 19:12:22.087276' DAY TO SECOND |INTERVAL '96 12:49:23.998591' DAY TO SECOND |INTERVAL '106 22:50:33.926335' DAY TO SECOND|
|2021|12   |กำลังดำเนินการ|154      |INTERVAL '53 15:25:27.671055' DAY TO SECOND |INTERVAL '72 09:29:42.934666' DAY TO SECOND |INTERVAL

In [30]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_state_4.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/state_4")

In [31]:
# NOTE: `sum` shadows Python's builtin sum for the rest of the notebook.
from pyspark.sql.functions import year, month, count, sum, col

# Tickets per (year, month, state).
df_count = (
    df.groupBy(year('timestamp').alias('year'), month('timestamp').alias('month'), 'state')
    .agg(count('time_to_state').alias('row_count'))
)

# Tickets per (year, month) across all states.
df_total_count = (
    df_count.groupBy('year', 'month')
    .agg(sum('row_count').alias('total_count'))
)

# Share of each state within its month.
df_ratio = (
    df_count.join(df_total_count, ['year', 'month'])
    .withColumn('state_ratio', col('row_count') / col('total_count'))
)

# Chronological view: month first, then state.
df_state_5 = df_ratio.orderBy('year', 'month', 'state')
df_state_5.show(100, False)

+----+-----+--------------+---------+-----------+--------------------+
|year|month|state         |row_count|total_count|state_ratio         |
+----+-----+--------------+---------+-----------+--------------------+
|2021|9    |กำลังดำเนินการ|9        |12         |0.75                |
|2021|9    |เสร็จสิ้น     |3        |12         |0.25                |
|2021|10   |เสร็จสิ้น     |1        |1          |1.0                 |
|2021|11   |กำลังดำเนินการ|7        |7          |1.0                 |
|2021|12   |กำลังดำเนินการ|154      |162        |0.9506172839506173  |
|2021|12   |เสร็จสิ้น     |8        |162        |0.04938271604938271 |
|2022|1    |กำลังดำเนินการ|141      |160        |0.88125             |
|2022|1    |เสร็จสิ้น     |19       |160        |0.11875             |
|2022|2    |กำลังดำเนินการ|135      |172        |0.7848837209302325  |
|2022|2    |รอรับเรื่อง   |8        |172        |0.046511627906976744|
|2022|2    |เสร็จสิ้น     |29       |172        |0.1686046511627907  |
|2022|

In [32]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_state_5.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/state_5")

In [33]:
# Same monthly ratios as df_state_5, but grouped by state first so each state
# reads as its own time series.
state_then_time = ['state', 'year', 'month']
df_state_6 = df_ratio.orderBy(*state_then_time)
df_state_6.show(100, False)

+----+-----+--------------+---------+-----------+--------------------+
|year|month|state         |row_count|total_count|state_ratio         |
+----+-----+--------------+---------+-----------+--------------------+
|2021|9    |กำลังดำเนินการ|9        |12         |0.75                |
|2021|11   |กำลังดำเนินการ|7        |7          |1.0                 |
|2021|12   |กำลังดำเนินการ|154      |162        |0.9506172839506173  |
|2022|1    |กำลังดำเนินการ|141      |160        |0.88125             |
|2022|2    |กำลังดำเนินการ|135      |172        |0.7848837209302325  |
|2022|3    |กำลังดำเนินการ|56       |108        |0.5185185185185185  |
|2022|4    |กำลังดำเนินการ|3        |24         |0.125               |
|2022|5    |กำลังดำเนินการ|300      |2215       |0.13544018058690746 |
|2022|6    |กำลังดำเนินการ|10050    |61633      |0.16306199600863175 |
|2022|7    |กำลังดำเนินการ|6382     |40880      |0.15611545988258316 |
|2022|8    |กำลังดำเนินการ|3725     |27047      |0.13772322253854402 |
|2022|

In [34]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_state_6.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/state_6")

## Focus on type

In [35]:
# Import every function this cell uses so it also runs when the optional
# "Focus on state" cells (the only place count/min/avg/max were imported) are
# skipped. NOTE: min/max shadow the Python builtins of the same name.
from pyspark.sql.functions import explode, size, count, min, avg, max

# One output row per element of the `type` array.
df_exploded = df.select(explode('type').alias('type_exploded'), 'state', 'time_to_state')

# Per (type, state): row count plus min / avg / max time_to_state, sorted by type.
df_grouped = df_exploded.groupBy('type_exploded', 'state') \
                        .agg(count('time_to_state').alias('row_count'),
                             min('time_to_state').alias('min_time_to_state'),
                             avg('time_to_state').alias('avg_time_to_state'),
                             max('time_to_state').alias('max_time_to_state')) \
                        .orderBy('type_exploded')

df_type_1 = df_grouped
df_type_1.show(100, False)

+-------------+--------------+---------+-------------------------------------------+--------------------------------------------+--------------------------------------------+
|type_exploded|state         |row_count|min_time_to_state                          |avg_time_to_state                           |max_time_to_state                           |
+-------------+--------------+---------+-------------------------------------------+--------------------------------------------+--------------------------------------------+
|             |รอรับเรื่อง   |4546     |INTERVAL '-0 00:00:00.028061' DAY TO SECOND|INTERVAL '12 00:42:24.804672' DAY TO SECOND |INTERVAL '330 07:01:20.772967' DAY TO SECOND|
|             |กำลังดำเนินการ|8613     |INTERVAL '0 00:00:56.299065' DAY TO SECOND |INTERVAL '109 21:01:12.956501' DAY TO SECOND|INTERVAL '490 15:06:39.707912' DAY TO SECOND|
|             |เสร็จสิ้น     |49232    |INTERVAL '0 00:00:14.963721' DAY TO SECOND |INTERVAL '47 18:55:45.469091' DAY TO SECO

In [36]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_type_1.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/type_1")

## Focus on organization

In [37]:
# Import every function this cell uses so it also runs when the earlier optional
# cells (the only place count/min/avg/max were imported) are skipped.
# NOTE: min/max shadow the Python builtins of the same name.
from pyspark.sql.functions import explode, size, col, count, min, avg, max

# One output row per element of the `organization` array.
df_exploded = df.select(explode('organization').alias('organization_exploded'), 'state', 'time_to_state')

# Per (organization, state): row count plus min / avg / max time_to_state,
# restricted to the 'เสร็จสิ้น' state and sorted by row count, largest first.
df_grouped = df_exploded.groupBy('organization_exploded', 'state') \
                        .agg(count('time_to_state').alias('row_count'),
                             min('time_to_state').alias('min_time_to_state'),
                             avg('time_to_state').alias('avg_time_to_state'),
                             max('time_to_state').alias('max_time_to_state')) \
                        .filter(col('state') == 'เสร็จสิ้น') \
                        .orderBy('row_count', ascending=False)

df_org_1 = df_grouped
df_org_1.show(100, False)

+-----------------------------------------------------+---------+---------+------------------------------------------+--------------------------------------------+--------------------------------------------+
|organization_exploded                                |state    |row_count|min_time_to_state                         |avg_time_to_state                           |max_time_to_state                           |
+-----------------------------------------------------+---------+---------+------------------------------------------+--------------------------------------------+--------------------------------------------+
|กลุ่มกรุงเทพใต้ (นายชาตรี วัฒนเขจร)                  |เสร็จสิ้น|32475    |INTERVAL '0 00:00:22.509931' DAY TO SECOND|INTERVAL '68 05:06:27.457143' DAY TO SECOND |INTERVAL '339 14:30:09.372622' DAY TO SECOND|
|กลุ่มกรุงเทพตะวันออก (นายณรงค์ เรืองศรี)             |เสร็จสิ้น|32028    |INTERVAL '0 00:00:30.017043' DAY TO SECOND|INTERVAL '41 13:05:49.677972' DAY TO SECOND |I

In [38]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_org_1.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/org_1")

## Focus on district

In [39]:
# Import every function this cell uses so it also runs when the earlier optional
# cells (the only place count/min/avg/max were imported) are skipped.
# NOTE: min/max shadow the Python builtins of the same name.
from pyspark.sql.functions import count, min, avg, max

# Per (district, state): row count plus min / avg / max time_to_state,
# sorted by district.
df_grouped = df.groupBy('district', 'state') \
                        .agg(count('time_to_state').alias('row_count'),
                             min('time_to_state').alias('min_time_to_state'),
                             avg('time_to_state').alias('avg_time_to_state'),
                             max('time_to_state').alias('max_time_to_state')) \
                        .orderBy('district')

df_dist_1 = df_grouped
df_dist_1.show(100, False)

+------------+--------------+---------+--------------------------------------------+--------------------------------------------+--------------------------------------------+
|district    |state         |row_count|min_time_to_state                           |avg_time_to_state                           |max_time_to_state                           |
+------------+--------------+---------+--------------------------------------------+--------------------------------------------+--------------------------------------------+
|null        |กำลังดำเนินการ|9        |INTERVAL '39 11:35:20.576196' DAY TO SECOND |INTERVAL '111 06:13:12.151308' DAY TO SECOND|INTERVAL '274 15:52:58.551807' DAY TO SECOND|
|null        |รอรับเรื่อง   |18       |INTERVAL '-0 00:00:00.012137' DAY TO SECOND |INTERVAL '0 04:12:23.572281' DAY TO SECOND  |INTERVAL '2 01:28:28.354655' DAY TO SECOND  |
|null        |เสร็จสิ้น     |47       |INTERVAL '0 00:16:04.067019' DAY TO SECOND  |INTERVAL '67 06:58:15.141717' DAY TO SECO

In [40]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_dist_1.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/dist_1")

In [41]:
# Rank districts by avg_time_to_state (fastest first).

# Import every function this cell uses so it also runs when the earlier optional
# cells (the only place count/min/avg/max were imported) are skipped.
# NOTE: min/max shadow the Python builtins of the same name.
from pyspark.sql.functions import col, count, min, avg, max

# Only the 'เสร็จสิ้น' state is ranked, and only districts with more than 50
# such rows are kept.
df_grouped = df.groupBy('district', 'state') \
                        .agg(count('time_to_state').alias('row_count'),
                             min('time_to_state').alias('min_time_to_state'),
                             avg('time_to_state').alias('avg_time_to_state'),
                             max('time_to_state').alias('max_time_to_state')) \
                        .filter((col('state') == 'เสร็จสิ้น') & (col('row_count') > 50)) \
                        .orderBy('avg_time_to_state')

df_dist_2 = df_grouped
df_dist_2.show(100, False)

+-----------------+---------+---------+------------------------------------------+--------------------------------------------+--------------------------------------------+
|district         |state    |row_count|min_time_to_state                         |avg_time_to_state                           |max_time_to_state                           |
+-----------------+---------+---------+------------------------------------------+--------------------------------------------+--------------------------------------------+
|ราษฎร์บูรณะ      |เสร็จสิ้น|2075     |INTERVAL '0 00:01:02.493189' DAY TO SECOND|INTERVAL '15 20:03:58.693903' DAY TO SECOND |INTERVAL '325 21:31:23.960883' DAY TO SECOND|
|บางคอแหลม        |เสร็จสิ้น|2466     |INTERVAL '0 00:01:36.057744' DAY TO SECOND|INTERVAL '16 15:21:00.448705' DAY TO SECOND |INTERVAL '327 22:08:25.642183' DAY TO SECOND|
|บางแค            |เสร็จสิ้น|7803     |INTERVAL '0 00:02:14.913236' DAY TO SECOND|INTERVAL '20 04:07:16.140707' DAY TO SECOND |INTERVAL

In [42]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_dist_2.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/dist_2")

## Focus on star

In [43]:
# Import every function this cell uses so it also runs when the earlier optional
# cells (the only place count/min/avg/max were imported) are skipped.
# NOTE: min/max shadow the Python builtins of the same name.
from pyspark.sql.functions import col, count, min, avg, max

# Per star rating, for the 'เสร็จสิ้น' state only: row count plus
# min / avg / max time_to_state, sorted by star.
df_grouped = df.groupBy('star', 'state') \
                        .agg(count('time_to_state').alias('row_count'),
                             min('time_to_state').alias('min_time_to_state'),
                             avg('time_to_state').alias('avg_time_to_state'),
                             max('time_to_state').alias('max_time_to_state')) \
                        .filter((col('state') == "เสร็จสิ้น")) \
                        .orderBy('star')

df_star_1 = df_grouped
df_star_1.show(100, False)

+----+---------+---------+------------------------------------------+-------------------------------------------+--------------------------------------------+
|star|state    |row_count|min_time_to_state                         |avg_time_to_state                          |max_time_to_state                           |
+----+---------+---------+------------------------------------------+-------------------------------------------+--------------------------------------------+
|null|เสร็จสิ้น|104752   |INTERVAL '0 00:00:21.569294' DAY TO SECOND|INTERVAL '45 14:10:13.91956' DAY TO SECOND |INTERVAL '464 10:21:28.527899' DAY TO SECOND|
|1   |เสร็จสิ้น|13773    |INTERVAL '0 00:00:30.758731' DAY TO SECOND|INTERVAL '50 22:52:43.699354' DAY TO SECOND|INTERVAL '433 21:32:48.546884' DAY TO SECOND|
|2   |เสร็จสิ้น|4629     |INTERVAL '0 00:01:06.009175' DAY TO SECOND|INTERVAL '51 13:26:33.994633' DAY TO SECOND|INTERVAL '324 17:53:37.248088' DAY TO SECOND|
|3   |เสร็จสิ้น|10394    |INTERVAL '0 00:00:34

In [44]:
# Export as CSV with a header row. mode("overwrite") keeps the cell re-runnable:
# the default save mode raises an error when the output directory already exists.
df_star_1.write.mode("overwrite").option("header", "true").csv(path="/content/spark_output/star_1")

## AI/ML: predicting the time to เสร็จสิ้น (completed) starts here

In [45]:
# Modelling subset: only tickets whose state is 'เสร็จสิ้น'.
is_done = col("state") == "เสร็จสิ้น"
df_ml = df.where(is_done)
df_ml.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+--------------------+---------+----+------------+--------------------+--------------------+
|                type|        organization|             comment|              coords|             address|subdistrict|district|     province|           timestamp|    state|star|count_reopen|       last_activity|       time_to_state|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+--------------------+---------+----+------------+--------------------+--------------------+
|         [ความสะอาด]|        [เขตบางซื่อ]|             ขยะเยอะ|[100.53084, 13.81...|12/14 ถนน กรุงเทพ...|       null|    null|กรุงเทพมหานคร|2021-09-03 12:51:...|เสร็จสิ้น|null|        null|2022-06-04 15:34:...|INTERVAL '274 02:...|
|[น้ำท่วม, ร้องเรียน]|[เขตประเวศ, ฝ่ายโ...|น้ำท่วมเวลาฝนตกแล...|[100

In [46]:
# Number of rows in the modelling subset.
df_ml.count()

202391

In [50]:
# Save the modelling DataFrame's rows as a pickle file (written from the
# underlying RDD).
# NOTE(review): presumably this fails if the target path already exists —
# confirm before re-running the cell.

df_ml.rdd.saveAsPickleFile(path="/content/spark_output/ml")

# 2. AI/ML

In [None]:
# https://drive.google.com/file/d/1AQZysrjmVW9qMv4CDz0nPemK4ttmIFq8/view?usp=share_link
# !gdown 1AQZysrjmVW9qMv4CDz0nPemK4ttmIFq8

In [None]:
# path = '/content/done.csv' # csv

In [6]:
# Download the pickled ML dataset from Google Drive:
# https://drive.google.com/file/d/10mR0X89jq4kyQJTGxMapjIraEDdcHwy6/view?usp=share_link
!gdown 10mR0X89jq4kyQJTGxMapjIraEDdcHwy6

Downloading...
From: https://drive.google.com/uc?id=10mR0X89jq4kyQJTGxMapjIraEDdcHwy6
To: /content/ml
100% 239M/239M [00:04<00:00, 53.3MB/s]


In [8]:
# Load the pickled rows and rebuild the DataFrame from them.
# NOTE(review): .collect() returns a Python list on the driver, so despite its
# name `pickle_rdd` is not an RDD, and every row is held in driver memory.
# SECURITY: this file was downloaded from Google Drive; unpickling data from an
# untrusted source can execute arbitrary code, so only load files you trust.
pickle_rdd = spark.sparkContext.pickleFile('/content/ml').collect()
df_ml = spark.createDataFrame(pickle_rdd)

In [10]:
# Preview the reloaded modelling data.
df_ml.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+--------------------+---------+----+------------+--------------------+--------------------+
|                type|        organization|             comment|              coords|             address|subdistrict|district|     province|           timestamp|    state|star|count_reopen|       last_activity|       time_to_state|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+-------------+--------------------+---------+----+------------+--------------------+--------------------+
|         [ความสะอาด]|        [เขตบางซื่อ]|             ขยะเยอะ|[100.53084, 13.81...|12/14 ถนน กรุงเทพ...|       null|    null|กรุงเทพมหานคร|2021-09-03 12:51:...|เสร็จสิ้น|null|        null|2022-06-04 15:34:...|INTERVAL '274 02:...|
|[น้ำท่วม, ร้องเรียน]|[เขตประเวศ, ฝ่ายโ...|น้ำท่วมเวลาฝนตกแล...|[100

In [12]:
# Check the schema inferred for the reloaded rows.
df_ml.printSchema()

root
 |-- type: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- organization: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- comment: string (nullable = true)
 |-- coords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: long (nullable = true)
 |-- count_reopen: long (nullable = true)
 |-- last_activity: timestamp (nullable = true)
 |-- time_to_state: interval day to second (nullable = true)

