In [6]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
    !tar xf spark-3.3.2-bin-hadoop3.tgz
    !mv spark-3.3.2-bin-hadoop3 spark
    !pip install -q findspark
    import os
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark"

In [41]:
import pandas as pd

import findspark
findspark.init()
spark_url = 'local'
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import split, col
from pyspark.sql import functions as F

In [42]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .master(spark_url)
    .appName('Spark Data Prep')
    .config('spark.ui.port', '4040')
    .config("spark.sql.execution.pyspark.udf.faulthandler.enabled", "true")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)
sc = spark.sparkContext  # SparkContext behind the session
path = 'bangkok_traffy.csv'  # input CSV, resolved against the working directory

In [43]:
# Load the CSV with a header row and inferred column types. Multiline mode
# lets a quoted field span several lines; the double quote is used both as
# the quote character and as its own escape character.
df = (
    spark.read
    .option("multiline", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .csv(path, header=True, inferSchema=True)
)

# explore data and schema

In [9]:
# Print the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- photo: string (nullable = true)
 |-- photo_after: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: integer (nullable = true)
 |-- count_reopen: integer (nullable = true)
 |-- last_activity: timestamp (nullable = true)



In [10]:
# Number of rows currently in df.
df.count()

787026

In [11]:
# Preview the first 10 rows of a handful of columns, without truncating long values.
df.select('type','organization','coords','province','state','timestamp','last_activity').show(10, truncate=False)

+-------------------+------------------------------------------------------------+------------------+-------------+---------+--------------------------+--------------------------+
|type               |organization                                                |coords            |province     |state    |timestamp                 |last_activity             |
+-------------------+------------------------------------------------------------+------------------+-------------+---------+--------------------------+--------------------------+
|{ความสะอาด}        |เขตบางซื่อ                                                  |100.53084,13.81865|กรุงเทพมหานคร|เสร็จสิ้น|2021-09-03 19:51:09.453003|2022-06-04 22:34:14.609206|
|{น้ำท่วม,ร้องเรียน}|เขตประเวศ,ฝ่ายโยธา เขตประเวศ                                |100.66709,13.67891|กรุงเทพมหานคร|เสร็จสิ้น|2021-09-19 21:56:08.924992|2022-06-21 15:21:09.532782|
|{สะพาน}            |เขตสาทร                                                     |100.52649,13.72060

In [12]:
print("Explore unique state:")
# Pull the distinct 'state' values to the driver as a plain Python list;
# each collected Row has a single field, so row[0] is the state string.
uniqueState = [row[0] for row in df.select('state').distinct().collect()]
uniqueState

Explore unique state:


['รอรับเรื่อง', 'กำลังดำเนินการ', 'เสร็จสิ้น']

In [13]:
print("see the number of completed ticket")
# Count the rows whose 'state' equals the completed value.
print(df.where(F.col('state') == "เสร็จสิ้น").count())


see the number of completed ticket
645700


In [14]:
print("show all province")
# Row count per distinct 'province' value, largest group first.
dfprovince = (
    df.groupBy("province")
      .count()
      .orderBy(F.desc("count"))
)
# Use the number of groups as the row limit so that every province is printed.
dfprovince.show(dfprovince.count(), truncate=False)

show all province
+-----------------------------+------+
|province                     |count |
+-----------------------------+------+
|กรุงเทพมหานคร                |660434|
|จังหวัดกรุงเทพมหานคร         |125189|
|จังหวัดLac                   |293   |
|NULL                         |195   |
|นนทบุรี                      |194   |
|สมุทรปราการ                  |191   |
|ปทุมธานี                     |81    |
|จังหวัดนนทบุรี               |67    |
|จังหวัดสมุทรปราการ           |53    |
|จังหวัดจังหวัด กรุงเทพมหานคร |30    |
|จังหวัดปทุมธานี              |29    |
|สมุทรสาคร                    |26    |
|นครราชสีมา                   |21    |
|นครปฐม                       |19    |
|จังหวัดสมุทรสาคร             |19    |
|จังหวัดBorno                 |17    |
|ภูเก็ต                       |14    |
|จังหวัดนครปฐม                |11    |
|จังหวัดเพชรบุรี              |10    |
|จังหวัดราชบุรี               |9     |
|ชลบุรี                       |8     |
|จังหวัดBangkok               |6     |
|จังหวั

In [15]:
print("กรุงเทพ และปริมณฑล data count: ")
# Case-insensitive alternation of the province names of interest.
pattern = "(?i)กรุงเทพ|Bangkok|นนทบุรี|นครปฐม|ปทุมธานี|สมุทรปราการ|สมุทรสาคร"
# Keep rows whose province contains any of those names, then count per spelling.
(
    df.filter(col("province").isNotNull() & col("province").rlike(pattern))
      .groupBy("province")
      .count()
      .orderBy("count", ascending=False)
      .show(truncate=False)
)

กรุงเทพ และปริมณฑล data count: 
+----------------------------+------+
|province                    |count |
+----------------------------+------+
|กรุงเทพมหานคร               |660434|
|จังหวัดกรุงเทพมหานคร        |125189|
|นนทบุรี                     |194   |
|สมุทรปราการ                 |191   |
|ปทุมธานี                    |81    |
|จังหวัดนนทบุรี              |67    |
|จังหวัดสมุทรปราการ          |53    |
|จังหวัดจังหวัด กรุงเทพมหานคร|30    |
|จังหวัดปทุมธานี             |29    |
|สมุทรสาคร                   |26    |
|นครปฐม                      |19    |
|จังหวัดสมุทรสาคร            |19    |
|จังหวัดนครปฐม               |11    |
|จังหวัดBangkok              |6     |
|จังหวัดกรุงเทพฯ             |2     |
|จังหวัดจังหวัดกรุงเทพมหานคร |1     |
+----------------------------+------+



In [16]:
print("See if all coords follow this pattern of ...,...")
# Two decimal numbers (each with an optional leading minus) separated by a
# single comma, with nothing before or after.
coords_pattern = r"^-?\d+\.\d+,-?\d+\.\d+$"

# Rows whose 'coords' value does not match the pattern.
# NOTE(review): a NULL 'coords' presumably makes the negated match NULL too,
# so such rows would not be counted here - confirm.
invalid_coords = df.where(~F.col('coords').rlike(coords_pattern))

# The bare count on the last line is what the cell displays.
print("number of invalid coords format:")
invalid_coords.count()

See if all coords follow this pattern of ...,...
number of invalid coords format:


0

# Data preprocessing

In [44]:
# Remove duplicate rows (no column subset is given, so whole rows are compared).
df = df.dropDuplicates()

In [45]:
# Drop every row that is missing a value in any of the listed columns.
df = df.dropna(subset=['ticket_id', 'organization','timestamp','last_activity','state','coords', 'type'], how='any')

## preprocessing state and completion time calculation

In [46]:
# Keep only the rows whose 'state' is "เสร็จสิ้น"; all other states are discarded.
df = df.filter(col('state')=="เสร็จสิ้น")

In [47]:
# Difference between 'last_activity' and 'timestamp', taken as Unix seconds
# and converted to hours.
df = df.withColumn(
    'completion_time_hours',
    (F.unix_timestamp(F.col('last_activity')) - F.unix_timestamp(F.col('timestamp'))) / 3600,
)

In [62]:
# Show the derived duration next to the two timestamps it was computed from.
df.select('state','timestamp','last_activity','completion_time_hours').show(10, truncate=False)

+---------+--------------------------+--------------------------+---------------------+
|state    |timestamp                 |last_activity             |completion_time_hours|
+---------+--------------------------+--------------------------+---------------------+
|เสร็จสิ้น|2022-05-29 18:51:10.563403|2023-08-29 09:13:09.764898|10958.36638888889    |
|เสร็จสิ้น|2022-05-29 22:47:47.652949|2025-01-09 11:14:39.607469|22932.44777777778    |
|เสร็จสิ้น|2022-05-29 23:12:20.675929|2022-06-20 11:59:37.529268|516.7880555555555    |
|เสร็จสิ้น|2022-05-30 13:03:41.305578|2022-06-01 15:49:00.143225|50.75527777777778    |
|เสร็จสิ้น|2023-05-18 07:37:30.768261|2023-05-19 11:35:01.509191|27.95861111111111    |
|เสร็จสิ้น|2024-04-27 14:23:51.111848|2024-05-03 16:46:13.590538|146.37277777777777   |
|เสร็จสิ้น|2022-07-21 14:30:47.386095|2022-08-06 21:24:12.128442|390.8902777777778    |
|เสร็จสิ้น|2024-09-15 19:24:24.398573|2024-09-17 13:48:45.267314|42.405833333333334   |
|เสร็จสิ้น|2024-09-30 21:48:58.0

## Preprocessing coordinates

In [48]:
# 'coords' is split on the comma: the first item is stored as 'longitude'
# and the second as 'latitude', both cast to double.
df = (
    df.withColumn('longitude', F.split(F.col('coords'), ',')[0].cast('double'))
      .withColumn('latitude', F.split(F.col('coords'), ',')[1].cast('double'))
)


In [49]:
# Sanity-check the parsed coordinates: a few sample values, then the min/max
# of each axis to reveal out-of-range values.
print("show some longitude and latitude values:")
df.select(df.longitude, df.latitude).show(5, truncate=False)

df.select(F.min('latitude'), F.max('latitude')).show()
df.select(F.min('longitude'), F.max('longitude')).show()

# NOTE(review): probes for the exact value 1000, which looks like a leftover
# from an earlier investigation - confirm whether this check is still needed.
df.filter('longitude == 1000').show()

show some longitude and latitude values:
+---------+--------+
|longitude|latitude|
+---------+--------+
|100.57853|13.77817|
|100.54778|13.69016|
|100.69471|13.76328|
|100.67825|13.72006|
|100.54456|13.72599|
+---------+--------+
only showing top 5 rows
+-------------+-------------+
|min(latitude)|max(latitude)|
+-------------+-------------+
|          0.0|      18.7929|
+-------------+-------------+

+--------------+--------------+
|min(longitude)|max(longitude)|
+--------------+--------------+
|           0.0|     104.49722|
+--------------+--------------+

+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+---------------------+---------+--------+
|ticket_id|type|organization|comment|photo|photo_after|coords|address|subdistrict|district|province|timestamp|state|star|count_reopen|last_activity|completion_time_hours|longitude|latitude|
+---------+----+------------+-------+-----+-----------

## Deal with out-of-range coordinates (outliers)

In [50]:
# Bounding box, in degrees, centred mainly on Bangkok.
latitude_min = 13.4
latitude_max = 14.3
longitude_min = 99.8
longitude_max = 101

# Keep only the rows inside the box; both ends of each range are inclusive.
df = df.filter(
    F.col('latitude').between(latitude_min, latitude_max)
    & F.col('longitude').between(longitude_min, longitude_max)
)

## preprocessing province

In [51]:
# Keep only rows whose province is Bangkok or one of its surrounding provinces
# (case-insensitive match on any of the listed names).
pattern = "(?i)กรุงเทพ|Bangkok|นนทบุรี|นครปฐม|ปทุมธานี|สมุทรปราการ|สมุทรสาคร"
df = df.where(
    F.col("province").isNotNull()
    & F.col("province").rlike(pattern)
)

In [52]:
# Collapse the spelling variants of each province into one canonical name.
# The rules are tried top to bottom and the first match wins; a value that
# matches none of them is kept as it is.
df = df.withColumn(
    "province",
    F.when(col("province").rlike("(?i).*กรุงเทพ.*|.*Bangkok.*"), "กรุงเทพมหานคร")
     .when(col("province").rlike("(?i).*นนทบุรี.*"), "นนทบุรี")
     .when(col("province").rlike("(?i).*สมุทรปราการ.*"), "สมุทรปราการ")
     .when(col("province").rlike("(?i).*ปทุมธานี.*"), "ปทุมธานี")
     .when(col("province").rlike("(?i).*สมุทรสาคร.*"), "สมุทรสาคร")
     .when(col("province").rlike("(?i).*นครปฐม.*"), "นครปฐม")
     .otherwise(col("province")),
)


In [26]:
print("After province preprocessing:")
# Group by the cleaned-up province and count, largest group first.
province_counts = df.groupBy("province").count().orderBy("count", ascending=False)
# Size the display by the number of groups. Counting the whole ticket frame
# just to obtain a row limit would trigger a full, needless pass over the data.
province_counts.show(province_counts.count(), truncate=False)

After province preprocessing:
+-------------+------+
|province     |count |
+-------------+------+
|กรุงเทพมหานคร|638122|
|สมุทรปราการ  |82    |
|นนทบุรี      |64    |
|ปทุมธานี     |34    |
|สมุทรสาคร    |22    |
|นครปฐม       |9     |
+-------------+------+



## preprocessing organization

In [53]:
# Turn the comma-separated 'organization' string into an array. Whitespace
# around each comma is stripped first, then the string is split on commas.
df = df.withColumn(
    'organization_array',
    F.split(
        F.regexp_replace(col('organization'), '\\s*,\\s*', ','),
        ',',
    ),
)

In [54]:
# Drop array items that look like a person's name in parentheses,
# e.g. "... (นาง...)" / "... (นาย...)".
# The pattern looks for "(", optional spaces, one of the listed titles, and
# then everything up to the next ")".
person_pattern = r'\( *?(นาย|นาง|น\.ส\.|นางสาว)[^\)]*\)'

# Per array: trim every item, then keep only the items that are non-empty
# and do not contain the person pattern (the item is lower-cased before matching).
df = df.withColumn(
    "organization_array",
    F.filter(
        F.transform(F.col("organization_array"), lambda x: F.trim(x)),
        lambda x: (x != "") & (~F.lower(x).rlike(person_pattern))
    )
)

In [55]:
# Lookup table of organization locations.
pathOrgsLoc = 'org_with_loc_v2.csv'
dfOrg = spark.read.csv(pathOrgsLoc, header=True, inferSchema=True)
dfOrg.printSchema()

# Bring the table to the driver as a plain dict for fast lookup:
# organization_raw -> (latitude, longitude), values kept exactly as read.
org_loc = dict(
    (row['organization_raw'], (row['latitude'], row['longitude']))
    for row in dfOrg.collect()
)

root
 |-- organization_raw: string (nullable = true)
 |-- displayName: string (nullable = true)
 |-- formattedAddress: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [30]:
print("Explore unique organizations:")
# One row per array element; NULLs are dropped, names are trimmed, empty
# names are dropped, and the distinct names are collected into a Python list.
unique_orgs = [
    row['org']
    for row in (
        df.select(F.explode('organization_array').alias('org'))
          .where(F.col('org').isNotNull())
          .select(F.trim(F.col('org')).alias('org'))
          .where(F.col('org') != '')
          .distinct()
          .collect()
    )
]
unique_orgs

Explore unique organizations:


['สำนักงานก่อสร้างและบูรณะ สำนักการโยธา (สกบ.สนย.) กทม.',
 'กองระบบคลอง สำนักการระบายน้ำ',
 'ศูนย์ก่อสร้างและบูรณะถนน 2 ส่วนก่อสร้างและบูรณะ 2 สำนักงานก่อสร้างและบูรณะ (สกบ.สนย.)',
 'สอบ.กรร.สสสภ.',
 'ฝ่ายการคลัง เขตตลิ่งชัน',
 'กลุ่มงานควบคุมอาคาร 1 ส่วนควบคุมอาคาร 1 สำนักงานควบคุมอาคาร (สคอ.สนย.)',
 'ฝ่ายโยธา เขตจตุจักร',
 'ฝ่ายการศึกษา เขตบางนา',
 'สน.หัวหมาก',
 'สำนักงานเขตสายไหม',
 'สซท.สสภ.',
 'สำนักป้องกันและบรรเทาสาธารณภัย กทม.',
 'พนักงานกวาด เขตป้อมปราบศัตรูพ่าย',
 'ฝ่ายพัฒนาชุมชนฯ เขตปทุมวัน',
 'ฝ่ายการคลัง เขตจตุจักร',
 'ท่าเรือกรุงเทพ',
 'ฝ่ายรักษาความสะอาดฯ เขตจอมทอง',
 'สน.บางคอแหลม',
 'กลุ่มประสานงานและติดตาม กทม.',
 'ฝ่ายทะเบียน เขตจตุจักร',
 'โรงพยาบาลสิรินธร สำนักการแพทย์',
 'ส.ส.ธีรัจชัย พันธุมาศ (หนองจอก มีนบุรี ลาดกระบัง)',
 'พรรคก้าวไกล บางซื่อ-ดุสิต กทม.เขต7',
 'ฝ่ายสิ่งแวดล้อม แขวงจอมพล จตุจักร',
 'อำเภอปากเกร็ด',
 'ส่วนวิศวกรรมทาง 2 สำนักงานวิศวกรรมทาง (สวก.สนย.)',
 'การไฟฟ้านครหลวงเขตวัดเลียบ',
 'ศูนย์ประสานงาน จุฬาลงกรณ์มหาวิทยาลัย',
 'ฝ่ายทะเบียน เขตบางกะปิ

In [56]:
from pyspark.sql.types import ArrayType, StringType
# only keep organizations that have valid locations in the org_loc dictionary
def filter_orgs(orgs):
    """Return the organizations from ``orgs`` that have a usable location.

    An organization is kept only when ``org_loc`` holds an entry for it and
    both stored coordinates can be converted to ``float``. This rejects the
    "Not Found" placeholder as well as missing (``None``) or otherwise
    non-numeric coordinates, so the result contains exactly the
    organizations for which a distance can later be computed.

    Args:
        orgs: iterable of organization names, or ``None``.

    Returns:
        A list of the kept names in their original order; an empty list
        when ``orgs`` is ``None``.
    """
    if orgs is None:
        return []

    def _has_coords(org):
        # True when both coordinates of `org` exist and parse as numbers.
        loc = org_loc.get(org)
        if not loc:
            return False
        try:
            float(loc[0])
            float(loc[1])
        except (TypeError, ValueError):
            return False
        return True

    return [org for org in orgs if _has_coords(org)]

# Wrap the Python function as a Spark UDF that returns an array of strings.
filter_orgs_udf = F.udf(filter_orgs, returnType=ArrayType(StringType()))

# Replace each row's organization array with its filtered version.
df = df.withColumn(
    'organization_array',
    filter_orgs_udf(F.col('organization_array')),
)

## Calculate distances to organizations

In [57]:
from pyspark.sql.types import ArrayType, DoubleType
import math
from pyspark.sql.functions import udf, col

# calculate haversine distance between two points
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Args:
        lat1, lon1: latitude and longitude of the first point; anything
            ``float()`` accepts (numbers or numeric strings).
        lat2, lon2: latitude and longitude of the second point.

    Returns:
        The haversine distance in km, using an Earth radius of 6371 km.

    Raises:
        TypeError, ValueError: if an argument cannot be converted to float.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(float, [lat1, lon1, lat2, lon2])
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))




def calc_distances(orgs, lat, lon):
    """Distances in km from the point (lat, lon) to each organization's location.

    Organizations without an ``org_loc`` entry, with a "Not Found"
    coordinate, or whose distance cannot be computed are skipped.
    Equal distances are reported once, and the result follows the order in
    which the organizations appear in ``orgs``, so it is deterministic and
    lines up with the input wherever the distances are distinct.

    Args:
        orgs: iterable of organization names, or ``None``.
        lat: latitude of the reference point in degrees, or ``None``.
        lon: longitude of the reference point in degrees, or ``None``.

    Returns:
        A list of floats; an empty list when any argument is ``None``.
    """
    if orgs is None or lat is None or lon is None:
        return []
    # A dict serves as an insertion-ordered set: duplicates are dropped but
    # first-seen order is preserved.
    distances = {}
    for org in orgs:
        loc = org_loc.get(org)
        if not loc or loc[0] == "Not Found" or loc[1] == "Not Found":
            continue
        try:
            dist = haversine(lat, lon, float(loc[0]), float(loc[1]))
        except Exception:
            # Best effort: one bad location must not fail the whole row.
            continue
        distances[dist] = None
    return list(distances)

# Wrap the Python function as a Spark UDF that returns an array of doubles.
calc_distances_udf = udf(calc_distances, ArrayType(DoubleType()))

# For every row, pass the organization array and the row's latitude and
# longitude to the UDF and store the resulting distances.
df = df.withColumn(
    'organization_distances',
    calc_distances_udf(
        F.col('organization_array'),
        F.col('latitude'),
        F.col('longitude'),
    ),
)



In [33]:
# Spot-check rows whose raw organization string contains "Traffy @ ITS Lab2":
# compare the raw string, the cleaned array and the computed distances.
print("sample after preprocessing organization and calculating distances:")
df.filter(col('organization').like('%Traffy @ ITS Lab2%')).select('organization', 'organization_array', 'latitude', 'longitude', 'organization_distances').show(5, truncate=False  )

sample after preprocessing organization and calculating distances:
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+---------+------------------------------

## preprocessing type

In [58]:
# Remove rows whose type is the empty set literal '{}'.
df = df.filter(df['type'] != '{}')
# Strip the '{' and '}' characters, then split the remainder on ','.
# The pattern is a raw string: '\{' and '\}' are not valid Python escape
# sequences, so a plain literal triggers an invalid-escape warning.
df = df.withColumn('type_array', F.split(F.regexp_replace(df['type'], r'[\{\}]', ''), ','))

In [72]:
# Compare the parsed array with the original 'type' string for the first 10 rows.
df.select(df.type_array, df.type).show(10, truncate=False)

+---------------------------+-------------------------+
|type_array                 |type                     |
+---------------------------+-------------------------+
|[ท่อระบายน้ำ, ถนน, น้ำท่วม]|{ท่อระบายน้ำ,ถนน,น้ำท่วม}|
|[แสงสว่าง, สะพาน]          |{แสงสว่าง,สะพาน}         |
|[ความสะอาด]                |{ความสะอาด}              |
|[สัตว์จรจัด]               |{สัตว์จรจัด}             |
|[น้ำท่วม]                  |{น้ำท่วม}                |
|[สัตว์จรจัด]               |{สัตว์จรจัด}             |
|[ความสะอาด]                |{ความสะอาด}              |
|[กีดขวาง, ต้นไม้]          |{กีดขวาง,ต้นไม้}         |
|[ความปลอดภัย, จราจร]       |{ความปลอดภัย,จราจร}      |
|[ทางเท้า]                  |{ทางเท้า}                |
+---------------------------+-------------------------+
only showing top 10 rows


In [59]:
# Print the schema again to confirm the columns added so far.
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- photo: string (nullable = true)
 |-- photo_after: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: integer (nullable = true)
 |-- count_reopen: integer (nullable = true)
 |-- last_activity: timestamp (nullable = true)
 |-- completion_time_hours: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- organization_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- organization_distances: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- type_ar

In [60]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import explode
from pyspark.ml.feature import OneHotEncoder

# Explode 'type_array': each row becomes one row per type, and the column
# now holds a single type string instead of an array.
df_exploded = df.withColumn("type_array", explode(F.col("type_array")))

# Map each type string to a numeric index.
indexer = StringIndexer(inputCol="type_array", outputCol="type_index")
indexer_model = indexer.fit(df_exploded)
df_indexed = indexer_model.transform(df_exploded)

# Turn each index into a one-hot vector.
encoder = OneHotEncoder(inputCol="type_index", outputCol="type_onehot")
encoder_model = encoder.fit(df_indexed)
df_encoded = encoder_model.transform(df_indexed)

In [75]:
# Show the one-hot vector produced for each (ticket_id, type) row.
df_encoded.select(df_encoded.ticket_id, df_encoded.type_array, df_encoded.type_onehot).show(truncate=False)


+-----------+-----------+---------------+
|ticket_id  |type_array |type_onehot    |
+-----------+-----------+---------------+
|2022-6MXLYB|ท่อระบายน้ำ|(23,[8],[1.0]) |
|2022-6MXLYB|ถนน        |(23,[0],[1.0]) |
|2022-6MXLYB|น้ำท่วม    |(23,[7],[1.0]) |
|2022-AUCPV4|แสงสว่าง   |(23,[5],[1.0]) |
|2022-AUCPV4|สะพาน      |(23,[13],[1.0])|
|2023-LBLAKH|ความสะอาด  |(23,[2],[1.0]) |
|2024-K64MQY|สัตว์จรจัด |(23,[15],[1.0])|
|2022-7EM6TQ|น้ำท่วม    |(23,[7],[1.0]) |
|2024-4YZ6AQ|สัตว์จรจัด |(23,[15],[1.0])|
|2024-NWHXRB|ความสะอาด  |(23,[2],[1.0]) |
|2024-GNLW7W|กีดขวาง    |(23,[4],[1.0]) |
|2024-GNLW7W|ต้นไม้     |(23,[12],[1.0])|
|2023-7HM627|ความปลอดภัย|(23,[3],[1.0]) |
|2023-7HM627|จราจร      |(23,[9],[1.0]) |
|2022-ECAF8K|ทางเท้า    |(23,[1],[1.0]) |
|2023-LZ7YK6|ความปลอดภัย|(23,[3],[1.0]) |
|2024-FMWCP2|ทางเท้า    |(23,[1],[1.0]) |
|2024-FMWCP2|ต้นไม้     |(23,[12],[1.0])|
|2022-8FCEQQ|ทางเท้า    |(23,[1],[1.0]) |
|2024-MAPHUU|ความสะอาด  |(23,[2],[1.0]) |
+-----------+-----------+---------

In [61]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F

# Gather the one-hot vectors of all exploded rows back into one list per ticket_id.
df_combined = (
    df_encoded
    .groupBy("ticket_id")
    .agg(F.collect_list("type_onehot").alias("type_onehots"))
)

# Attach the list to the ticket-level frame; the left join keeps every row of df.
df = df.join(df_combined, on='ticket_id', how='left')


# Final data

In [78]:
# Show any rows whose raw 'organization' value is an empty string.
df.filter(df.organization == "").show()

+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+---------------------+---------+--------+------------------+----------------------+----------+------------+
|ticket_id|type|organization|comment|photo|photo_after|coords|address|subdistrict|district|province|timestamp|state|star|count_reopen|last_activity|completion_time_hours|longitude|latitude|organization_array|organization_distances|type_array|type_onehots|
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+---------------------+---------+--------+------------------+----------------------+----------+------------+
+---------+----+------------+-------+-----+-----------+------+-------+-----------+--------+--------+---------+-----+----+------------+-------------+---------------------+---------+--------+------------------+----------------------+-

In [62]:
# Number of rows remaining after all preprocessing steps.
df.count()

544599

In [87]:
# Print the schema of the final frame, including the derived columns.
df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- photo: string (nullable = true)
 |-- photo_after: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- star: integer (nullable = true)
 |-- count_reopen: integer (nullable = true)
 |-- last_activity: timestamp (nullable = true)
 |-- completion_time_hours: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- organization_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- organization_distances: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- type_ar

In [None]:
# Columns kept for the exported dataset.
dfProcessed = df.select(
    'ticket_id',
    'latitude',
    'longitude',
    'organization_array',
    'organization_distances',
    'type_array',
    'type_onehots',
    'completion_time_hours',
)
# 80/20 random split with a fixed seed.
train_data, test_data = dfProcessed.randomSplit([0.8, 0.2], seed=1234)

# Output folder for the JSON exports; change this to a folder on your machine.
path_to_your_folder = "C:/Users/somePath"

In [None]:
# Write the full processed dataset as JSON, coalesced to a single partition.
# The f prefix is required so that {path_to_your_folder} is replaced by the
# configured folder instead of appearing literally in the output path.
dfProcessed.coalesce(1).write.mode("overwrite").json(f'file:///{path_to_your_folder}/raw_processed.json')

In [None]:
# Write the training split as JSON, coalesced to a single partition.
# The f prefix is required so that {path_to_your_folder} is replaced by the
# configured folder instead of appearing literally in the output path.
train_data.coalesce(1).write.mode("overwrite").json(f'file:///{path_to_your_folder}/train_data.json')

In [None]:
# Write the test split as JSON, coalesced to a single partition.
# The f prefix is required so that {path_to_your_folder} is replaced by the
# configured folder instead of appearing literally in the output path.
test_data.coalesce(1).write.mode("overwrite").json(f'file:///{path_to_your_folder}/test_data.json')