In [None]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
    !tar xf spark-3.3.2-bin-hadoop3.tgz
    !mv spark-3.3.2-bin-hadoop3 spark
    !pip install -q findspark
    import os
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark"

In [43]:
import pandas as pd

import findspark
findspark.init()
spark_url = 'local'
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import split, col
from pyspark.sql import functions as F

In [26]:
spark = SparkSession.builder\
        .master(spark_url)\
        .appName('Spark Data Prep')\
        .config('spark.ui.port', '4040')\
        .getOrCreate()
sc = spark.sparkContext
path = 'bangkok_traffy.csv'
df = spark.read.option("multiline", "true") \
               .option("quote", "\"") \
               .option("escape", "\"") \
               .csv(path, header=True, inferSchema=True)


Check out data and schema

In [3]:
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- photo: string (nullable = true)
 |-- photo_after: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: integer (nullable = true)
 |-- count_reopen: integer (nullable = true)
 |-- last_activity: timestamp (nullable = true)



In [33]:
df.show(5, truncate=False)

+-----------+-------------------+--------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------+-----------+--------+-------------+--------------------------+---------+----+------------+--------------------------+
|ticket_id  |type               |organization                    |comment                                                                                                                                          

In [43]:
df.count()

786455

In [41]:
nullOrg = df.filter(df.organization.isNull())
nullOrg.show(5, truncate=False)
print(nullOrg.count())
nullOrg.groupBy("state").count().show()

+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
|ticket_id|type|organization|comment|photo|photo_after|coords|address|subdistrict|district|province|timestamp|state|star|count_reopen|last_activity|
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+

0
+-----+-----+
|state|count|
+-----+-----+
+-----+-----+



In [42]:
df.filter(df.organization.isNotNull()).groupBy("state").count().show()

+--------------+------+
|         state| count|
+--------------+------+
|   รอรับเรื่อง| 11195|
|กำลังดำเนินการ|129560|
|     เสร็จสิ้น|645700|
+--------------+------+



<h1>Data Cleaning</h1>

In [28]:
df = df.dropDuplicates()

preprocessing organization

In [27]:
df = df.dropna(subset=['organization'], how='any')

preprocessing coordinations

In [5]:
# Filter rows where 'coords' is null or just a comma
invalid_org = df.filter(
    (col('organization').isNull()) | (col('organization') == ',') | (col('organization') == '')
)

# Show the rows with invalid coordinates
invalid_org.show()

+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
|ticket_id|type|organization|comment|photo|photo_after|coords|address|subdistrict|district|province|timestamp|state|star|count_reopen|last_activity|
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+



In [29]:
# Filter rows where 'coords' is null or just a comma
invalid_coords = df.filter(
    (col('coords').isNull()) | (col('coords') == ',') | (col('coords') == '')
)

# Show the rows with invalid coordinates
invalid_coords.show()

+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
|ticket_id|type|organization|comment|photo|photo_after|coords|address|subdistrict|district|province|timestamp|state|star|count_reopen|last_activity|
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+



In [30]:
df.select("coords").show(5, truncate=False) 

# Split the 'coords' column into two new columns 'latitude' and 'longitude'
df = df.withColumn('longitude', split(df['coords'], ',').getItem(0).cast('double')) \
       .withColumn('latitude', split(df['coords'], ',').getItem(1).cast('double'))

# Show the resulting DataFrame
df.show(5, truncate=False)

+------------------+
|coords            |
+------------------+
|100.57853,13.77817|
|100.54778,13.69016|
|100.69471,13.76328|
|100.67825,13.72006|
|100.54456,13.72599|
+------------------+
only showing top 5 rows
+-----------+-------------------------+------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [32]:
df.select(F.min('latitude'), F.max('latitude')).show()
df.select(F.min('longitude'), F.max('longitude')).show()

df.filter('longitude == 1000').show()

+-------------+-------------+
|min(latitude)|max(latitude)|
+-------------+-------------+
|          0.0|     19.90576|
+-------------+-------------+

+--------------+--------------+
|min(longitude)|max(longitude)|
+--------------+--------------+
|           0.0|        1000.0|
+--------------+--------------+

+--------------+--------------+
|min(longitude)|max(longitude)|
+--------------+--------------+
|           0.0|        1000.0|
+--------------+--------------+

+---------+------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-----------+--------+--------+--------------------+--------------+----+------------+--------------------+---------+--------+
|ticket_id|        type|        organization|             comment|               photo|         photo_after|             coords|             address|subdistrict|district|province|           timestamp|         state|star|count_reopen|       last_activity

preprocessing province

In [41]:
df.groupBy("province").count().orderBy("count", ascending=False).show(10)

+--------------------+------+
|            province| count|
+--------------------+------+
|       กรุงเทพมหานคร|660317|
|จังหวัดกรุงเทพมหานคร|125182|
|          จังหวัดLac|   293|
|         สมุทรปราการ|   101|
|             นนทบุรี|    79|
|                NULL|    68|
|      จังหวัดนนทบุรี|    67|
|  จังหวัดสมุทรปราการ|    53|
|จังหวัดจังหวัด กร...|    30|
|            ปทุมธานี|    30|
+--------------------+------+
only showing top 10 rows


In [50]:
# Use the Column.like(...) method; functions.like doesn't exist
df.filter(col("province").like("%นนทบุรี%")).groupBy("province").count().show()
df.filter(col("province").like("%กรุงเทพ%")).groupBy("province").count().show()

+--------------+-----+
|      province|count|
+--------------+-----+
|       นนทบุรี|   79|
|จังหวัดนนทบุรี|   67|
+--------------+-----+

+--------------------+------+
|            province| count|
+--------------------+------+
|จังหวัดจังหวัด กร...|    30|
|       กรุงเทพมหานคร|660317|
|จังหวัดกรุงเทพมหานคร|125182|
|     จังหวัดกรุงเทพฯ|     2|
|จังหวัดจังหวัดกรุ...|     1|
+--------------------+------+

+--------------------+------+
|            province| count|
+--------------------+------+
|จังหวัดจังหวัด กร...|    30|
|       กรุงเทพมหานคร|660317|
|จังหวัดกรุงเทพมหานคร|125182|
|     จังหวัดกรุงเทพฯ|     2|
|จังหวัดจังหวัดกรุ...|     1|
+--------------------+------+



deal with out of range coordination

In [None]:
# Define the latitude and longitude bounds for Bangkok
latitude_min = 13
latitude_max = 14.5
longitude_min = 99
longitude_max = 101


df.filter(
    ~((df['latitude'] >= latitude_min) & (df['latitude'] <= latitude_max) &
    (df['longitude'] >= longitude_min) & (df['longitude'] <= longitude_max))
).show()

+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+------------------+--------------------+--------------------+--------------+----+------------+--------------------+---------+--------+
|  ticket_id|                type|        organization|             comment|               photo|         photo_after|            coords|             address|subdistrict|          district|            province|           timestamp|         state|star|count_reopen|       last_activity|longitude|latitude|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+------------------+--------------------+--------------------+--------------+----+------------+--------------------+---------+--------+
|2023-CH3YRG|         {ร้องเรียน}|สำนักงานตำรวจแห่ง...|บ้านเลขที่404  นา...|https://s

preprocessing type

In [7]:
# Remove '{' and '}' and split the 'type' column by ','
df = df.withColumn('type', F.regexp_replace(df['type'], '[\{\}]', ''))

df.show(5, truncate=False)

+-----------+-----------------+--------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------+-----------+--------+-------------+--------------------------+---------+----+------------+--------------------------+---------+---------+
|ticket_id  |type             |organization                    |comment                                                                                                                          

In [8]:
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- photo: string (nullable = true)
 |-- photo_after: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: integer (nullable = true)
 |-- count_reopen: integer (nullable = true)
 |-- last_activity: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [10]:
# Get all unique types (categories) from the 'type' column
unique_types = df.select(F.explode(F.split(F.regexp_replace(df['type'], '[\{\}]', ''), ','))).distinct().rdd.flatMap(lambda x: x).collect()
unique_types = [t.strip() for t in unique_types if t.strip() != '']
# Show the unique types
print(unique_types)

['ห้องน้ำ', 'คนจรจัด', 'การเดินทาง', 'จราจร', 'สอบถาม', 'ป้ายจราจร', 'ทางเท้า', 'ท่อระบายน้ำ', 'ถนน', 'กีดขวาง', 'ความสะอาด', 'สะพาน', 'ต้นไม้', 'ร้องเรียน', 'เสนอแนะ', 'เสียงรบกวน', 'สัตว์จรจัด', 'ความปลอดภัย', 'สายไฟ', 'แสงสว่าง', 'คลอง', 'น้ำท่วม', 'PM2.5', 'ป้าย']


In [11]:

# Create a binary column for each unique type (one-hot encoding)
for type_ in unique_types:
    df = df.withColumn(type_, F.when(F.array_contains(F.split(F.regexp_replace(df['type'], '[\{\}]', ''), ','), type_), 1).otherwise(0))

# Show the transformed DataFrame
df.show(truncate=False)

+-----------+-------------------+----------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------------------+------------------------------------------------------------------------------------+-----------+--------+-------------+--------------------------+--------------+----+------------+--------------------------+---------+---------+-------+-------+----------+-----+------+---------+-------+-----------+---+-------+---------+-----+------+---------+-------+----------