In [None]:
!pip install findspark
!pip install pyspark
!apt-get install -qq openjdk-17-jdk-headless
from google.colab import drive



In [None]:
# Drop any existing Drive mount first, then mount Drive again at the same path.
mount_point = '/content/drive'
drive.flush_and_unmount()
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import os
import findspark

# findspark.init() is called before pyspark is imported, as in the original cell order.
findspark.init()

from pyspark.sql import SparkSession

# Single local session named "CarAccidents"; getOrCreate() reuses a session
# that already exists when the cell is re-run.
session_builder = SparkSession.builder.appName("CarAccidents").master("local[*]")
spark = session_builder.getOrCreate()

In [None]:
# Absolute location of the dataset under the Drive mount point (/content/drive),
# so loading does not depend on the notebook's current working directory.
DATASET_DIR = "/content/drive/MyDrive/dataset"

train = spark.read.parquet(f"{DATASET_DIR}/train_without_missing.parquet")
test = spark.read.parquet(f"{DATASET_DIR}/test_without_missing.parquet")


In [None]:
from pyspark.sql.functions import col, isnull, when
# Aliased so that Python's builtin sum() is not shadowed for the rest of the notebook.
from pyspark.sql.functions import sum as spark_sum

# For every column of the test set, count the rows whose value equals -1.
# The result is a single-row DataFrame with one count per original column.
minus_one_counts = test.select([
    spark_sum(when(col(c) == -1, 1).otherwise(0)).alias(c)
    for c in test.columns
])
minus_one_counts.show()

+------------+-----------------+------------------+--------------------+--------------------------+--------------+---------+-----------+---------------+----------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-----------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+-------------------------+---------------------------+--------------------------+-------------------+----------------------------+-------------------------+-------------+-------------+------------------+---------------+--------------+---------------------+--------------------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+---------

In [None]:
# Print the column names and data types of the test set.
test.printSchema()

root
 |-- Police_Force: string (nullable = true)
 |-- Accident_Severity: string (nullable = true)
 |-- Number_of_Vehicles: integer (nullable = true)
 |-- Number_of_Casualties: integer (nullable = true)
 |-- Local_Authority_(District): string (nullable = true)
 |-- 1st_Road_Class: string (nullable = true)
 |-- Road_Type: string (nullable = true)
 |-- Speed_limit: integer (nullable = true)
 |-- Junction_Detail: string (nullable = true)
 |-- Junction_Control: string (nullable = true)
 |-- Pedestrian_Crossing-Human_Control: string (nullable = true)
 |-- Pedestrian_Crossing-Physical_Facilities: string (nullable = true)
 |-- Light_Conditions: string (nullable = true)
 |-- Weather_Conditions: string (nullable = true)
 |-- Road_Surface_Conditions: string (nullable = true)
 |-- Special_Conditions_at_Site: string (nullable = true)
 |-- Carriageway_Hazards: string (nullable = true)
 |-- Urban_or_Rural_Area: string (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: string (nullabl

In [None]:
###############################################
# Preprocessing - Computing statistics for the numeric attributes
################################################

In [None]:
# Preview rows of the training set before any transformation is applied.
train.show()

+------------+-----------------+------------------+--------------------+--------------------------+--------------+---------+-----------+---------------+----------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-----------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+-------------------------+---------------------------+--------------------------+-------------------+----------------------------+-------------------------+-------------+-------------+------------------+---------------+--------------+---------------------+--------------------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+---------

In [None]:
# Columns handled as numeric attributes: summarised with describe() and then
# bucketed into labels further down in the notebook.
numeric_columns = (
    ["Number_of_Vehicles", "Number_of_Casualties", "Speed_limit"]
    + ["Age_of_Driver", "Age_Band_of_Driver", "Age_of_Vehicle"]
    + ["Age_of_Casualty", "Age_Band_of_Casualty"]
)

In [None]:
# Summary statistics (count, mean, stddev, ...) of the numeric columns in the training set.
train.select(numeric_columns).describe().show()


+-------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|summary|Number_of_Vehicles|Number_of_Casualties|       Speed_limit|     Age_of_Driver|Age_Band_of_Driver|    Age_of_Vehicle|   Age_of_Casualty|Age_Band_of_Casualty|
+-------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+--------------------+
|  count|            248557|              248557|            248557|            248557|            248557|            248557|            248557|              248557|
|   mean|2.3439613448826626|  2.1088563186713714|41.133321531882025| 38.51086471111254| 6.730540680809633|7.1799064198554055| 35.56881117812011|   6.243453211939314|
| stddev| 2.440986126583052|  3.2053214843487607|15.132506670584833|15.153302333584861|1.6820657144304634|3.8359131755476517|17.863890908093907|   2.137254448244892|
|   

In [None]:
###############################################
# Preprocessing - Transforming numeric attributes into nominal ones
################################################

In [None]:
from pyspark.sql.functions import when, col

# Replace the vehicle count with a label. The expression is built once and applied
# to both splits so they are bucketed the same way. Any value other than 1 or 2
# ends up in the "3_or_more_vehicles" bucket.
vehicle_count = col("Number_of_Vehicles")
vehicle_bucket = (
    when(vehicle_count == 1, "1_vehicle")
    .when(vehicle_count == 2, "2_vehicles")
    .otherwise("3_or_more_vehicles")
)

train = train.withColumn("Number_of_Vehicles", vehicle_bucket)
test = test.withColumn("Number_of_Vehicles", vehicle_bucket)


In [None]:
# Replace the casualty count with a label; one shared expression serves both splits.
# Any value other than 1 or 2 ends up in the "3+" bucket.
casualty_count = col("Number_of_Casualties")
casualty_bucket = (
    when(casualty_count == 1, "1")
    .when(casualty_count == 2, "2")
    .otherwise("3+")
)

train = train.withColumn("Number_of_Casualties", casualty_bucket)
test = test.withColumn("Number_of_Casualties", casualty_bucket)


In [None]:
# Bucket the speed limit: up to 30 -> "Low", up to 50 -> "Medium", everything else -> "High".
# The same expression is applied to the training and the test split.
speed_limit = col("Speed_limit")
speed_bucket = (
    when(speed_limit <= 30, "Low")
    .when(speed_limit <= 50, "Medium")
    .otherwise("High")
)

train = train.withColumn("Speed_limit", speed_bucket)
test = test.withColumn("Speed_limit", speed_bucket)


In [None]:
# Bucket the driver's age into three groups with cut-offs at 40 and 70;
# both splits receive the identical expression.
driver_age = col("Age_of_Driver")
driver_age_bucket = (
    when(driver_age <= 40, "<=40")
    .when(driver_age <= 70, "41–70")
    .otherwise("71+")
)

train = train.withColumn("Age_of_Driver", driver_age_bucket)
test = test.withColumn("Age_of_Driver", driver_age_bucket)

In [None]:
# Collapse the driver age-band code into three labels with cut-offs at 5 and 8;
# one expression is reused for the training and the test split.
driver_band = col("Age_Band_of_Driver")
driver_band_bucket = (
    when(driver_band <= 5, "Young")
    .when(driver_band <= 8, "Adult")
    .otherwise("Senior")
)

train = train.withColumn("Age_Band_of_Driver", driver_band_bucket)
test = test.withColumn("Age_Band_of_Driver", driver_band_bucket)

In [None]:
# Bucket the vehicle age into three groups with cut-offs at 3 and 10.
# Train and test must share the same cut-offs and labels; otherwise the test
# split would contain categories that never occur in the training data.
# A single expression applied to both splits guarantees that.
vehicle_age = col("Age_of_Vehicle")
vehicle_age_bucket = (
    when(vehicle_age <= 3, "0–3")
    .when(vehicle_age <= 10, "4–10")
    .otherwise("11+")
)

train = train.withColumn("Age_of_Vehicle", vehicle_age_bucket)
test = test.withColumn("Age_of_Vehicle", vehicle_age_bucket)

In [None]:
# Bucket the casualty's age into four groups with cut-offs at 17, 40 and 60;
# the expression is defined once and applied to both splits.
casualty_age = col("Age_of_Casualty")
casualty_age_bucket = (
    when(casualty_age <= 17, "0–17")
    .when(casualty_age <= 40, "18–40")
    .when(casualty_age <= 60, "41–60")
    .otherwise("61+")
)

train = train.withColumn("Age_of_Casualty", casualty_age_bucket)
test = test.withColumn("Age_of_Casualty", casualty_age_bucket)


In [None]:
# Collapse the casualty age-band code into three labels with cut-offs at 5 and 8;
# one expression is reused for the training and the test split.
casualty_band = col("Age_Band_of_Casualty")
casualty_band_bucket = (
    when(casualty_band <= 5, "Young")
    .when(casualty_band <= 8, "Adult")
    .otherwise("Senior")
)

train = train.withColumn("Age_Band_of_Casualty", casualty_band_bucket)
test = test.withColumn("Age_Band_of_Casualty", casualty_band_bucket)


In [None]:
# Preview the training rows after the numeric columns have been replaced by bucket labels.
train.show()

+------------+-----------------+------------------+--------------------+--------------------------+--------------+---------+-----------+---------------+----------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-----------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+-------------------------+---------------------------+--------------------------+-------------------+----------------------------+-------------------------+-------------+-------------+------------------+---------------+--------------+---------------------+--------------------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+---------

In [None]:
# Print the training schema; the bucketed columns now show up as non-nullable strings.
train.printSchema()

root
 |-- Police_Force: string (nullable = true)
 |-- Accident_Severity: string (nullable = true)
 |-- Number_of_Vehicles: string (nullable = false)
 |-- Number_of_Casualties: string (nullable = false)
 |-- Local_Authority_(District): string (nullable = true)
 |-- 1st_Road_Class: string (nullable = true)
 |-- Road_Type: string (nullable = true)
 |-- Speed_limit: string (nullable = false)
 |-- Junction_Detail: string (nullable = true)
 |-- Junction_Control: string (nullable = true)
 |-- Pedestrian_Crossing-Human_Control: string (nullable = true)
 |-- Pedestrian_Crossing-Physical_Facilities: string (nullable = true)
 |-- Light_Conditions: string (nullable = true)
 |-- Weather_Conditions: string (nullable = true)
 |-- Road_Surface_Conditions: string (nullable = true)
 |-- Special_Conditions_at_Site: string (nullable = true)
 |-- Carriageway_Hazards: string (nullable = true)
 |-- Urban_or_Rural_Area: string (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: string (nullabl

In [None]:
#####################################################
# Preprocessing - Transforming nominal attributes into numeric ones
######################################################

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Every column of the training set is treated as categorical at this point.
categorical_columns = list(train.columns)

# One StringIndexer per column, writing its result to "<column>_ind".
# handleInvalid="skip" makes the fitted indexer drop rows whose label it has not seen.
indexers = []
for name in categorical_columns:
    indexers.append(
        StringIndexer(inputCol=name, outputCol=f"{name}_ind", handleInvalid="skip")
    )

In [None]:
# Fit the indexers on the training split only and reuse the fitted model for both
# splits, so that every label is mapped to the same index in train and in test.
# Fitting a second time on the test split would produce an independent mapping.
pipeline = Pipeline(stages=indexers)
indexer_model = pipeline.fit(train)
train_indexed = indexer_model.transform(train)
test_indexed = indexer_model.transform(test)

# The indexers use handleInvalid="skip", which silently drops rows whose label was
# not present in the training split. Report such losses instead of hiding them.
skipped_test_rows = test.count() - test_indexed.count()
if skipped_test_rows:
    print(
        f"Warning: {skipped_test_rows} test rows were dropped because they contain "
        "labels that do not occur in the training set."
    )

In [None]:
# Preview the first 5 rows of the training set with the added "_ind" columns.
train_indexed.show(5)

+------------+-----------------+------------------+--------------------+--------------------------+--------------+---------+-----------+---------------+----------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-----------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+-------------------------+---------------------------+--------------------------+-------------------+----------------------------+-------------------------+-------------+-------------+------------------+---------------+--------------+---------------------+--------------------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+---------

In [None]:
# Keep only the index columns (names ending in "_ind"); the original string
# columns are dropped from both splits.
indexed_columns = []
for name in train_indexed.columns:
    if name.endswith('_ind'):
        indexed_columns.append(name)

train_indexed = train_indexed.select(*indexed_columns)
test_indexed = test_indexed.select(*indexed_columns)

# Preview both reduced splits.
for frame in (train_indexed, test_indexed):
    frame.show()


+----------------+---------------------+----------------------+------------------------+------------------------------+------------------+-------------+---------------+-------------------+--------------------+-------------------------------------+-------------------------------------------+--------------------+----------------------+---------------------------+------------------------------+-----------------------+-----------------------+-----------------------------------------------+---------------------+----------------+---------------------------+---------------------+------------------------------------+---------------------+----------------------------+-----------------------------+-------------------------------+------------------------------+-----------------------+--------------------------------+-----------------------------+-----------------+-----------------+----------------------+-------------------+------------------+-------------------------+-----------------------------

In [None]:
import os
import shutil

from google.colab import files

# Output location of each indexed split on the Colab VM.
exports = {
    "/content/train_without_ig.parquet": train_indexed,
    "/content/test_without_ig.parquet": test_indexed,
}

# Write both splits first, replacing any output left over from an earlier run.
for parquet_dir, frame in exports.items():
    frame.write.mode("overwrite").parquet(parquet_dir)

# Spark writes each parquet output as a directory of part files, whereas
# files.download takes a single file. Each directory is therefore packed into
# "<path>.zip" (with the directory itself as the top-level entry) and the
# archive is downloaded.
for parquet_dir in exports:
    archive_path = shutil.make_archive(
        parquet_dir,
        "zip",
        root_dir=os.path.dirname(parquet_dir),
        base_dir=os.path.basename(parquet_dir),
    )
    files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>