In [None]:
######################################################
# Preprocessing - computing the information-gain (IG) feature-selection criterion
######################################################

In [None]:
# Environment setup: install findspark and pyspark with pip, then a headless
# OpenJDK 17 with apt-get (-qq keeps apt output to a minimum).
!pip install findspark
!pip install pyspark
!apt-get install -qq openjdk-17-jdk-headless
from google.colab import drive

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Selecting previously unselected package openjdk-17-jre-headless:amd64.
(Reading database ... 126101 files and directories currently installed.)
Preparing to unpack .../openjdk-17-jre-headless_17.0.14+7-1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.14+7-1~22.04.1) ...
Selecting previously unselected package openjdk-17-jdk-headless:amd64.
Preparing to unpack .../openjdk-17-jdk-headless_17.0.14+7-1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jdk-headless:amd64 (17.0.14+7-1~22.04.1) ...
Setting up openjdk-17-jre-headless:amd64 (17.0.14+7-1~22.04.1) ...
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/jpackage 

In [None]:
# Flush and unmount any existing Drive mount first, then mount Google Drive
# at /content/drive.
drive.flush_and_unmount()
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
import findspark, os
findspark.init()
from pyspark.sql import SparkSession


# Build (or reuse) the SparkSession used by the rest of the notebook, with
# explicit executor/driver memory and shuffle-partition settings.
spark = (
    SparkSession.builder
    .appName("IG Calculation")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

In [None]:
# Load the train and test datasets from Parquet; the paths are relative to
# the current working directory.
train_indexed = spark.read.parquet("./drive/MyDrive/dataset/train_without_ig.parquet")
test_indexed = spark.read.parquet("./drive/MyDrive/dataset/test_without_ig.parquet")

In [None]:
# Information gain (IG) is computed in three steps: H(Y) -> H(Y|X) -> IG
# H(Y)   = -sum_y P(y) * log2 P(y)
# H(Y|X) =  sum_x P(x) * H(Y | X = x)
# IG     =  H(Y) - H(Y|X)

In [None]:
from pyspark.sql import functions as F
import math

def entropy(df, column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    The per-value counts are obtained with a single ``groupBy(column).count()``
    Spark job and the entropy ``-sum(p * log2(p))`` is then evaluated on the
    driver, where ``p`` is each value's count divided by the total row count.
    Only one ``(value, count)`` row per distinct value is collected.

    Args:
        df: DataFrame to analyse.
        column: Name of the column whose value distribution is measured.

    Returns:
        The entropy as a float. An empty DataFrame yields ``0.0`` instead of
        failing on a missing aggregate.
    """
    # One Spark job: the total row count is simply the sum of the group counts,
    # so a separate df.count() pass is unnecessary.
    counts = [row["count"] for row in df.groupBy(column).count().collect()]
    total_count = sum(counts)
    if total_count == 0:
        # No rows -> no uncertainty to measure.
        return 0.0
    # groupBy never emits a zero count, so log2 is always defined here.
    return sum(
        -(count / total_count) * math.log2(count / total_count)
        for count in counts
    )


In [None]:
def conditional_entropy(df, feature_column, target_column):
    """Return the conditional entropy H(target | feature) in bits.

    Computes ``sum_x P(x) * H(target | feature = x)`` from the joint counts of
    ``(feature value, target value)``. The counts come from one
    ``groupBy(feature_column, target_column).count()`` Spark job; the rest is
    evaluated on the driver. This replaces a filter-and-count loop that
    launched several Spark jobs per distinct feature value.

    Rows whose feature value is NULL form their own group, because ``groupBy``
    keeps NULL as a key.

    Args:
        df: DataFrame to analyse.
        feature_column: Name of the conditioning column (X).
        target_column: Name of the column whose remaining uncertainty is
            measured (Y).

    Returns:
        The conditional entropy as a float; ``0.0`` for an empty DataFrame.
    """
    # Single Spark job: one row per observed (feature value, target value) pair.
    joint_rows = df.groupBy(feature_column, target_column).count().collect()

    # Regroup on the driver: feature value -> list of per-target-value counts.
    # NOTE(review): assumes feature values are hashable (plain scalars) - confirm
    # if array/struct columns are ever passed in.
    counts_by_value = {}
    for row in joint_rows:
        counts_by_value.setdefault(row[feature_column], []).append(row["count"])

    total_count = sum(sum(target_counts) for target_counts in counts_by_value.values())
    if total_count == 0:
        return 0.0

    cond_entropy = 0.0
    for target_counts in counts_by_value.values():
        subset_count = sum(target_counts)
        # Entropy of the target within this feature value's subset.
        subset_entropy = sum(
            -(count / subset_count) * math.log2(count / subset_count)
            for count in target_counts
        )
        # Weight by P(x), the share of rows carrying this feature value.
        cond_entropy += (subset_count / total_count) * subset_entropy
    return cond_entropy

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, log2
import numpy as np

def fast_ig(df, feature, target="Accident_Severity_ind"):
    """Compute and log the information gain of ``feature`` about ``target``.

    Rows with a NULL in either column are dropped first. The gain is
    ``H(target) - H(target | feature)``, clamped so it is never negative.

    Args:
        df: DataFrame holding both columns.
        feature: Name of the candidate feature column.
        target: Name of the target column.

    Returns:
        A ``(feature, ig)`` tuple. If any exception is raised, the error is
        printed and ``(feature, 0.0)`` is returned instead.
    """
    try:
        # Keep only rows where both the feature and the target are populated.
        has_feature = F.col(feature).isNotNull()
        has_target = F.col(target).isNotNull()
        complete_rows = df.filter(has_feature & has_target)

        target_entropy = entropy(complete_rows, target)
        remaining_entropy = conditional_entropy(complete_rows, feature, target)

        # Clamp at zero so the reported gain is never negative.
        gain = max(0.0, target_entropy - remaining_entropy)

        # Progress log: one line per evaluated feature.
        print(f"{feature}: H(Y)={target_entropy:.6f}, H(Y|X)={remaining_entropy:.6f}, IG={gain:.6f}")
        return (feature, gain)
    except Exception as exc:
        print(f"error in {feature}: {str(exc)}")
        return (feature, 0.0)

In [None]:
#paralelne vypocty
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
#progress
from tqdm import tqdm

# Candidate features: every training column except the target column,
# "id" and "timestamp".
features = [
    name
    for name in train_indexed.columns
    if name not in ("Accident_Severity_ind", "id", "timestamp")
]

def calculate_all_ig(features, num_workers=8):
    """Compute the information gain of every feature and rank the results.

    Each feature is evaluated by ``fast_ig`` against the module-level
    ``train_indexed`` DataFrame, with the calls submitted from a thread pool
    and a tqdm progress bar tracking completed features.

    Args:
        features: Sized collection of feature column names to evaluate.
        num_workers: Maximum number of worker threads.

    Returns:
        A pandas DataFrame with columns ``Feature`` (str) and ``IG`` (float),
        sorted by ``IG`` in descending order.
    """
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # fast_ig already returns a (feature, ig) pair. Wrapping it in another
        # tuple would put tuples in the "IG" column and make the sort below
        # order rows by feature name instead of by gain.
        results = list(tqdm(
            executor.map(lambda f: fast_ig(train_indexed, f), features),
            total=len(features)
        ))
    return pd.DataFrame(results, columns=["Feature", "IG"]).sort_values("IG", ascending=False)

In [None]:
# Compute the IG table for all candidate features (the recorded run below
# took roughly 11.5 minutes for 50 features).
ig_results_sample = calculate_all_ig(features)

  0%|          | 0/50 [00:00<?, ?it/s]

Number_of_Casualties_ind: H(Y)=0.717679, H(Y|X)=0.702208, IG=0.015472
Number_of_Vehicles_ind: H(Y)=0.717679, H(Y|X)=0.710499, IG=0.007181
Speed_limit_ind: H(Y)=0.717679, H(Y|X)=0.702583, IG=0.015097
1st_Road_Class_ind: H(Y)=0.717679, H(Y|X)=0.715261, IG=0.002418
Road_Type_ind: H(Y)=0.717679, H(Y|X)=0.713732, IG=0.003947
Pedestrian_Crossing-Human_Control_ind: H(Y)=0.717679, H(Y|X)=0.717537, IG=0.000142
Junction_Detail_ind: H(Y)=0.717679, H(Y|X)=0.706632, IG=0.011047
Pedestrian_Crossing-Physical_Facilities_ind: H(Y)=0.717679, H(Y|X)=0.715874, IG=0.001806
Junction_Control_ind: H(Y)=0.717679, H(Y|X)=0.708258, IG=0.009421
Light_Conditions_ind: H(Y)=0.717679, H(Y|X)=0.711401, IG=0.006278
Road_Surface_Conditions_ind: H(Y)=0.717679, H(Y|X)=0.717576, IG=0.000103
Urban_or_Rural_Area_ind: H(Y)=0.717679, H(Y|X)=0.704113, IG=0.013566
Weather_Conditions_ind: H(Y)=0.717679, H(Y|X)=0.715571, IG=0.002108
Did_Police_Officer_Attend_Scene_of_Accident_ind: H(Y)=0.717679, H(Y|X)=0.704271, IG=0.013408
Carria

  2%|▏         | 1/50 [05:11<4:14:32, 311.69s/it]

Police_Force_ind: H(Y)=0.717679, H(Y|X)=0.708009, IG=0.009670
Casualty_Class_ind: H(Y)=0.717679, H(Y|X)=0.714041, IG=0.003638
Sex_of_Casualty_ind: H(Y)=0.717679, H(Y|X)=0.712290, IG=0.005390
Propulsion_Code_ind: H(Y)=0.717679, H(Y|X)=0.717315, IG=0.000364
Age_Band_of_Casualty_ind: H(Y)=0.717679, H(Y|X)=0.715831, IG=0.001848
Age_of_Casualty_ind: H(Y)=0.717679, H(Y|X)=0.715280, IG=0.002399
Casualty_Severity_ind: H(Y)=0.717679, H(Y|X)=0.319955, IG=0.397724
Car_Passenger_ind: H(Y)=0.717679, H(Y|X)=0.717023, IG=0.000656
Bus_or_Coach_Passenger_ind: H(Y)=0.717679, H(Y|X)=0.717411, IG=0.000268
Pedestrian_Movement_ind: H(Y)=0.717679, H(Y|X)=0.714259, IG=0.003420
Pedestrian_Location_ind: H(Y)=0.717679, H(Y|X)=0.714304, IG=0.003375
Casualty_Home_Area_Type_ind: H(Y)=0.717679, H(Y|X)=0.715009, IG=0.002670
Vehicle_Reference_ind: H(Y)=0.717679, H(Y|X)=0.712094, IG=0.005585
Casualty_Type_ind: H(Y)=0.717679, H(Y|X)=0.699099, IG=0.018581
Vehicle_Reference_Casualty_ind: H(Y)=0.717679, H(Y|X)=0.709768, IG

100%|██████████| 50/50 [11:28<00:00, 13.77s/it]  

Local_Authority_(District)_ind: H(Y)=0.717679, H(Y|X)=0.697575, IG=0.020104





In [None]:
# Columns kept in the reduced train/test datasets. The final entry,
# "Accident_Severity_ind", is the column fast_ig uses as its default target.
# NOTE(review): the list is hard-coded - presumably copied by hand from the IG
# ranking printed above; confirm it still matches after any re-run.
significant_features = [
    "Casualty_Severity_ind",
    "Local_Authority_(District)_ind",
    "Casualty_Type_ind",
    "Vehicle_Manoeuvre_ind",
    "Number_of_Casualties_ind",
    "Speed_limit_ind",
    "Casualty_Reference_ind",
    "Urban_or_Rural_Area_ind",
    "Did_Police_Officer_Attend_Scene_of_Accident_ind",
    "Junction_Detail_ind",
    "Vehicle_Leaving_Carriageway_ind",
    "Junction_Location_ind",
    "Vehicle_Type_ind",
    "Police_Force_ind",
    "Junction_Control_ind",
    "Vehicle_Reference_Casualty_ind",
    "1st_Point_of_Impact_ind",
    "Number_of_Vehicles_ind",
    "Light_Conditions_ind",
    "Hit_Object_off_Carriageway_ind",
    "Accident_Severity_ind"
]

In [None]:
# Project the training data down to the columns listed in significant_features.
train_filtered = train_indexed.select(significant_features)

In [None]:
# Sanity check: show which columns the training subset contains.
print(train_filtered.columns)

['Casualty_Severity_ind', 'Local_Authority_(District)_ind', 'Casualty_Type_ind', 'Vehicle_Manoeuvre_ind', 'Number_of_Casualties_ind', 'Speed_limit_ind', 'Casualty_Reference_ind', 'Urban_or_Rural_Area_ind', 'Did_Police_Officer_Attend_Scene_of_Accident_ind', 'Junction_Detail_ind', 'Vehicle_Leaving_Carriageway_ind', 'Junction_Location_ind', 'Vehicle_Type_ind', 'Police_Force_ind', 'Junction_Control_ind', 'Vehicle_Reference_Casualty_ind', '1st_Point_of_Impact_ind', 'Number_of_Vehicles_ind', 'Light_Conditions_ind', 'Hit_Object_off_Carriageway_ind', 'Accident_Severity_ind']


In [None]:
# Apply the same column selection to the test data so both sets share a schema.
test_filtered = test_indexed.select(significant_features)

In [None]:
# Sanity check: show which columns the test subset contains.
print(test_filtered.columns)

['Casualty_Severity_ind', 'Local_Authority_(District)_ind', 'Casualty_Type_ind', 'Vehicle_Manoeuvre_ind', 'Number_of_Casualties_ind', 'Speed_limit_ind', 'Casualty_Reference_ind', 'Urban_or_Rural_Area_ind', 'Did_Police_Officer_Attend_Scene_of_Accident_ind', 'Junction_Detail_ind', 'Vehicle_Leaving_Carriageway_ind', 'Junction_Location_ind', 'Vehicle_Type_ind', 'Police_Force_ind', 'Junction_Control_ind', 'Vehicle_Reference_Casualty_ind', '1st_Point_of_Impact_ind', 'Number_of_Vehicles_ind', 'Light_Conditions_ind', 'Hit_Object_off_Carriageway_ind', 'Accident_Severity_ind']


In [None]:
# Persist the reduced datasets. "overwrite" keeps this cell idempotent: with
# "append", every re-run of the notebook would add a second full copy of the
# rows to the existing Parquet output.
train_filtered.write.mode("overwrite").parquet("./drive/MyDrive/dataset/train_selected.parquet")
test_filtered.write.mode("overwrite").parquet("./drive/MyDrive/dataset/test_selected.parquet")