In [None]:
import pandas as pd

# Load types
types_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv",
    sep=",",
    encoding="utf-8"
)

# Build direct parent map
parent_map = (
    types_df
    .melt(id_vars='id', value_vars=['parent_id1', 'parent_id2', 'parent_id3'], value_name='parent_id')
    .dropna(subset=['parent_id'])
    .query('parent_id > 0')
    .groupby('id')['parent_id']
    .apply(set)
    .to_dict()
)

# Initialize ancestor map
ancestor_map = {tid: set() for tid in types_df['id']}

# Iteratively propagate ancestors
updated = True
while updated:
    updated = False
    for child, parents in parent_map.items():
        current_ancestors = ancestor_map[child].copy()
        for parent in parents:
            current_ancestors.add(parent)
            current_ancestors.update(ancestor_map[parent])  # inherit ancestors of parent
        if current_ancestors != ancestor_map[child]:
            ancestor_map[child] = current_ancestors
            updated = True  # Keep looping if anything changed

# Now ancestor_map holds: {type_id: set(all ancestors)}


In [None]:
import pandas as pd
from collections import defaultdict

# === 1. Load and Prepare Types Data ===
types_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv",
    sep=",",
    encoding="utf-8"
)

type_lookup = types_df.set_index('id')[['name', 'label']].to_dict('index')

def expand_hierarchy(row):
    ids = [row['parent_id1'], row['parent_id2'], row['parent_id3'], row['id']]
    labels = [type_lookup.get(i, {'label': None})['label'] if i > 0 else None for i in ids]
    names = [type_lookup.get(i, {'name': None})['name'] if i > 0 else None for i in ids]

    result = {}
    for level, (tid, label, name) in enumerate(zip(ids, labels, names), 1):
        result[f'level_{level}_id'] = tid if tid > 0 else None
        result[f'level_{level}_label'] = label
        result[f'level_{level}_name'] = name
    return pd.Series(result)

dimension_df = types_df.apply(expand_hierarchy, axis=1)
dimension_df = pd.concat([types_df[['id']], dimension_df, types_df[['name', 'label']]], axis=1)

# Save dimension table
dimension_df.to_csv('dim_type.csv', index=False)

# === 2. Build Type Ancestor Map ===
# Build direct parent map
parent_map = (
    types_df
    .melt(id_vars='id', value_vars=['parent_id1', 'parent_id2', 'parent_id3'], value_name='parent_id')
    .dropna(subset=['parent_id'])
    .query('parent_id > 0')
    .groupby('id')['parent_id']
    .apply(set)
    .to_dict()
)

# Initialize ancestor map
ancestor_map = {tid: set() for tid in types_df['id']}

# Iteratively propagate ancestors
updated = True
while updated:
    updated = False
    for child, parents in parent_map.items():
        current_ancestors = ancestor_map[child].copy()
        for parent in parents:
            current_ancestors.add(parent)
            current_ancestors.update(ancestor_map[parent])  # inherit ancestors of parent
        if current_ancestors != ancestor_map[child]:
            ancestor_map[child] = current_ancestors
            updated = True  # Keep looping if anything changed

# Now ancestor_map holds: {type_id: set(all ancestors)}


type_ancestors = ancestor_map

# === 3. Load Fact Data ===
fact_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//202504.csv",
    sep=",",
    encoding="utf-8"
)

# === 4. Filter Most Granular Types ===
def filter_most_granular(group):
    type_ids = set(group['types'])

    # Remove non-granular types
    to_remove = set()
    for t in type_ids:
        ancestors = type_ancestors.get(t, set())
        to_remove.update(type_ids & ancestors)  # Only care if ancestors exist in current types

    granular_types = type_ids - to_remove

    # Take common attributes once
    row = group.iloc[0]
    common_data = {
        'id': row['id'],
        'Longitude': row['x'],
        'Latitude': row['y'],
        'mp': row['mp'],
        'date': row['date'],
        'state': row['state'],
        'relevance': row['relevance'],
    }

    # Expand into rows
    records = [
        {**common_data, 'types': t}
        for t in granular_types
    ]

    return pd.DataFrame(records)


# === 5. Process and Save ===
filtered_fact_df = (
    fact_df
    .groupby('id', group_keys=False)
    .apply(filter_most_granular)
    .reset_index(drop=True)
    .sort_values(by=['id', 'types'])
    .assign(is_one=lambda df: df.groupby('id').cumcount() + 1)
    .reset_index(drop=True)
)

filtered_fact_df.to_csv('fact_clean.csv', index=False)

# Preview the cleaned fact table
print(filtered_fact_df.head())


In [None]:
import pandas as pd
from collections import defaultdict

# === 1. Load and Prepare Types Data ===
types_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv",
    sep=",",
    encoding="utf-8"
)

type_lookup = types_df.set_index('id')[['name', 'label']].to_dict('index')

def expand_hierarchy(row):
    ids = [row['parent_id1'], row['parent_id2'], row['parent_id3'], row['id']]
    labels = [type_lookup.get(i, {'label': None})['label'] if i > 0 else None for i in ids]
    names = [type_lookup.get(i, {'name': None})['name'] if i > 0 else None for i in ids]

    result = {}
    for level, (tid, label, name) in enumerate(zip(ids, labels, names), 1):
        result[f'level_{level}_id'] = tid if tid > 0 else None
        result[f'level_{level}_label'] = label
        result[f'level_{level}_name'] = name
    return pd.Series(result)

dimension_df = types_df.apply(expand_hierarchy, axis=1)
dimension_df = pd.concat([types_df[['id']], dimension_df, types_df[['name', 'label']]], axis=1)

# Save dimension table
dimension_df.to_csv('dim_type_new.csv', index=False)

# === 2. Build Type Ancestor Map ===
def build_type_ancestor_map(df: pd.DataFrame) -> dict[int, set[int]]:
    def collect_ancestors(type_id: int) -> set[int]:
        if not type_id or pd.isna(type_id):
            return set()

        row = df.loc[df['id'] == type_id]
        if row.empty:
            return set()

        direct_parents = {pid for pid in row[['parent_id1', 'parent_id2', 'parent_id3']].values.flatten() if pid > 0}

        ancestors = set(direct_parents)
        for parent in direct_parents:
            ancestors |= collect_ancestors(parent)

        return ancestors

    return {tid: collect_ancestors(tid) for tid in df['id']}

type_ancestors = build_type_ancestor_map(types_df)

# === 3. Load Fact Data ===
fact_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//202504.csv",
    sep=",",
    encoding="utf-8"
)

# === 4. Filter Most Granular Types ===
def filter_most_granular(group: pd.DataFrame) -> pd.DataFrame:
    type_ids = set(group['types'])
    to_remove = {
        t1 for t1 in type_ids
        for t2 in type_ids
        if t1 != t2 and t1 in type_ancestors.get(t2, set())
    }
    granular_types = type_ids - to_remove

    return pd.DataFrame({
        'id': [group['id'].iloc[0]] * len(granular_types),
        'types': list(granular_types),
        'Longitude': group['x'].iloc[0],
        'Latitude': group['y'].iloc[0],
        'mp': group['mp'].iloc[0],
        'date': group['date'].iloc[0],
        'state': group['state'].iloc[0],
        'relevance': group['relevance'].iloc[0]
    })

# === 5. Process and Save ===
filtered_fact_df = (
    fact_df
    .groupby('id', group_keys=False)
    .apply(filter_most_granular)
    .reset_index(drop=True)
    .sort_values(by=['id', 'types'])
    .assign(is_one=lambda df: df.groupby('id').cumcount() + 1)
    .reset_index(drop=True)
)

filtered_fact_df.to_csv('fact_clean_new.csv', index=False)

# Preview the cleaned fact table
print(filtered_fact_df.head())


In [None]:
import pandas as pd
import time
from collections import defaultdict

def load_types(types_path):
    types_df = pd.read_csv(types_path, sep=",", encoding="utf-8")
    return types_df

def build_parent_map(types_df):
    parent_map = defaultdict(set)
    for _, row in types_df.iterrows():
        tid = row['id']
        for pid in (row['parent_id1'], row['parent_id2'], row['parent_id3']):
            if pid > 0:
                parent_map[tid].add(pid)
    return parent_map

def build_ancestor_map(parent_map):
    ancestor_map = defaultdict(set)

    changed = True
    while changed:
        changed = False
        for child, parents in parent_map.items():
            current_ancestors = ancestor_map[child]
            new_ancestors = set()
            for parent in parents:
                new_ancestors.add(parent)
                new_ancestors.update(ancestor_map[parent])
            if not new_ancestors.issubset(current_ancestors):
                ancestor_map[child].update(new_ancestors)
                changed = True
    return dict(ancestor_map)

def load_fact_table(fact_path):
    fact_df = pd.read_csv(fact_path, sep=",", encoding="utf-8")
    return fact_df

def filter_most_granular(group, type_ancestors):
    type_ids = set(group['types'])

    # Remove types that are ancestors of others
    to_remove = set()
    for t in type_ids:
        ancestors = type_ancestors.get(t, set())
        to_remove.update(type_ids & ancestors)

    granular_types = type_ids - to_remove

    # Fixed attributes
    row = group.iloc[0]
    common_data = {
        'id': row['id'],
        'Longitude': row['x'],
        'Latitude': row['y'],
        'mp': row['mp'],
        'date': row['date'],
        'state': row['state'],
        'relevance': row['relevance'],
    }

    # Expand
    records = [
        {**common_data, 'types': t}
        for t in granular_types
    ]
    return pd.DataFrame(records)

# def clean_fact_table(fact_df, type_ancestors):
#     # Group and apply filtering
#     filtered_df = fact_df.groupby('id', group_keys=False).apply(
#         lambda group: filter_most_granular(group, type_ancestors)
#     ).reset_index(drop=True)
# 
#     # Assign 'is_one'
#     filtered_df['is_one'] = (
#         filtered_df.sort_values(by=['id', 'types'])
#         .groupby('id')
#         .cumcount() + 1
#     )
# 
#     return filtered_df.sort_values(by=['id', 'types']).reset_index(drop=True)

def clean_fact_table(fact_df, type_ancestors):
    # Apply filtering per group while avoiding the deprecation warning
    grouped = fact_df.groupby('id', group_keys=False)

    # Explicitly remove grouping columns from the group if not needed in function
    filtered_df = grouped.apply(
        lambda group: filter_most_granular(group.copy(), type_ancestors),
        include_group=False  # Avoid future deprecation
    ).reset_index(drop=True)

    # Assign 'is_one'
    filtered_df['is_one'] = (
        filtered_df.sort_values(by=['id', 'types'])
        .groupby('id')
        .cumcount() + 1
    )

    return filtered_df.sort_values(by=['id', 'types']).reset_index(drop=True)

def save_fact_table(filtered_df, output_path):
    filtered_df.to_csv(output_path, index=False)

def main():
    start_time = time.time()

    # Paths
    types_path = "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv"
    
    yearMonts = ["202412","202411","202410","202409","202408","202407","202406","202405","202404","202403","202402","202401"]
    for yearMonth in yearMonts:
        loop_start_time = time.time()
        fact_path = f"C://Users//jirip//Documents//Developer//python//kriminalita//source_files//{yearMonth}.csv"
        output_path = f"fact_clean_{yearMonth}.csv"
        print(f"Processing {yearMonth}...")
        # Load
        types_df = load_types(types_path)
        fact_df = load_fact_table(fact_path)

        # Build maps
        parent_map = build_parent_map(types_df)
        type_ancestors = build_ancestor_map(parent_map)

        # Clean fact table
        cleaned_fact_df = clean_fact_table(fact_df, type_ancestors)

        # Save
        save_fact_table(cleaned_fact_df, output_path)
        
        elapsed = time.time() - loop_start_time
        print(f"Processed {yearMonth} in {elapsed:.2f} seconds.")    


    
    # Print elapsed time

    elapsed = time.time() - start_time
    print(f"Completed in {elapsed:.2f} seconds.")

if __name__ == "__main__":
    main()


In [None]:
import polars as pl
import time
from collections import defaultdict

def load_types(types_path):
    return pl.read_csv(types_path)

def load_fact_table(fact_path):
    return pl.read_csv(fact_path)

def build_parent_map(types_df):
    parent_map = defaultdict(set)
    for row in types_df.iter_rows(named=True):
        tid = row['id']
        for pid in (row['parent_id1'], row['parent_id2'], row['parent_id3']):
            if pid > 0:
                parent_map[tid].add(pid)
    return parent_map

def build_ancestor_map(parent_map):
    ancestor_map = defaultdict(set)
    changed = True
    while changed:
        changed = False
        for child, parents in parent_map.items():
            current_ancestors = ancestor_map[child]
            new_ancestors = set()
            for parent in parents:
                new_ancestors.add(parent)
                new_ancestors.update(ancestor_map[parent])
            if not new_ancestors.issubset(current_ancestors):
                ancestor_map[child].update(new_ancestors)
                changed = True
    return dict(ancestor_map)

def explode_types_column(fact_df: pl.DataFrame) -> pl.DataFrame:
    return fact_df.select([
        pl.col("id"),
        pl.col("x").alias("Longitude"),
        pl.col("y").alias("Latitude"),
        "mp", "date", "state", "relevance",
        pl.col("types")
    ])

def filter_most_granular_py(group, type_ancestors):
    type_ids = set(group)
    to_remove = set()
    for t in type_ids:
        to_remove.update(type_ids & type_ancestors.get(t, set()))
    return [t not in to_remove for t in group]

def get_granular_type_mask(fact_df: pl.DataFrame, type_ancestors: dict) -> pl.Series:
    ids = fact_df["id"].to_list()
    types = fact_df["types"].to_list()

    granular_mask = []
    current_id = None
    group = []

    for i in range(len(ids)):
        if ids[i] != current_id:
            if group:
                mask = filter_most_granular_py(group, type_ancestors)
                granular_mask.extend(mask)
            group = [types[i]]
            current_id = ids[i]
        else:
            group.append(types[i])
    if group:
        mask = filter_most_granular_py(group, type_ancestors)
        granular_mask.extend(mask)

    return pl.Series("granular", granular_mask)

def clean_fact_table_polars(fact_df: pl.DataFrame, type_ancestors: dict) -> pl.DataFrame:
    exploded = explode_types_column(fact_df)
    mask = get_granular_type_mask(exploded, type_ancestors)
    filtered_df = exploded.with_columns(mask).filter(pl.col("granular")).drop("granular")

    filtered_df = (
    filtered_df.sort(["id", "types"])
    .with_columns([
        (pl.arange(0, pl.count()).over("id")).alias("is_one") + 1
    ])
)
    filtered_df = filtered_df.with_columns([
    pl.col("date").str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z").alias("parsed_date"),
    pl.col("date").str.split("T").list.get(0).alias("date_key"),
    pl.col("date").str.split("T").list.get(1).str.split(":").list.slice(0, 2).list.join(":").alias("time_key"),
])

    # Drop the original 'date' column if no longer needed
    filtered_df = filtered_df.drop("date", "parsed_date")


    
    return filtered_df

def save_fact_table(df: pl.DataFrame, output_path: str):
    df.write_csv(output_path)

def main():
    start_time = time.time()

    types_path = "C://Users//jirip//Documents//Developer//python//kriminalita//dimensions//types.csv"

    yearMonths = ['202501','202502','202503','202504','202401','202402','202403','202404','202405','202406','202407','202408','202409','202410','202411','202412','202301','202302','202303','202304','202305','202306','202307','202308','202309','202310','202311','202312','202201','202202','202203','202204','202205','202206','202207','202208','202209','202210','202211','202212','202101','202102','202103','202104','202105','202106','202107','202108','202109','202110','202111','202112','202001','202002','202003','202004','202005','202006','202007','202008','202009','202010','202011','202012','201901','201902','201903','201904','201905','201906','201907','201908','201909','201910','201911','201912','201801','201802','201803','201804','201805','201806','201807','201808','201809','201810','201811','201812','201701','201702','201703','201704','201705','201706','201707','201708','201709','201710','201711','201712','201601','201602','201603','201604','201605','201606','201607','201608','201609','201610','201611','201612','201501','201502','201503','201504','201505','201506','201507','201508','201509','201510','201511','201512','201401','201402','201403','201404','201405','201406','201407','201408','201409','201410','201411','201412','201301','201302','201303','201304','201305','201306','201307','201308','201309','201310','201311','201312','201201','201202','201203','201204','201205','201206','201207','201208','201209','201210','201211','201212']
    # yearMonths = ['202501','202502']


    for yearMonth in yearMonths:
        loop_start_time = time.time()
        fact_path = f"C://Users//jirip//Documents//Developer//python//kriminalita//source_files//{yearMonth}.csv"
        output_path = f"processed_files//fact_clean_{yearMonth}.csv"
        print(f"Processing {yearMonth}...")

        types_df = load_types(types_path)
        fact_df = load_fact_table(fact_path)
        print(f"Len of {yearMonth}... {len(fact_df)}")

        parent_map = build_parent_map(types_df)
        type_ancestors = build_ancestor_map(parent_map)

        cleaned_df = clean_fact_table_polars(fact_df, type_ancestors)
        print(f"Len of cleaned {yearMonth}... {len(cleaned_df)}")

        save_fact_table(cleaned_df, output_path)
        elapsed = time.time() - loop_start_time
        print(f"Processed {yearMonth} in {elapsed:.2f} seconds.")  

    print(f"Completed in {time.time() - start_time:.2f} seconds.")

if __name__ == "__main__":
    main()


Processing 202501...
Len of 202501... 338707


  (pl.arange(0, pl.count()).over("id")).alias("is_one") + 1


Len of cleaned 202501... 125717
Processed 202501 in 0.59 seconds.
Processing 202502...
Len of 202502... 324007
Len of cleaned 202502... 119628
Processed 202502 in 0.52 seconds.
Processing 202503...
Len of 202503... 357619
Len of cleaned 202503... 131215
Processed 202503 in 0.68 seconds.
Processing 202504...
Len of 202504... 310501
Len of cleaned 202504... 111895
Processed 202504 in 0.48 seconds.
Processing 202401...
Len of 202401... 336512
Len of cleaned 202401... 126193
Processed 202401 in 0.55 seconds.
Processing 202402...
Len of 202402... 358757
Len of cleaned 202402... 132984
Processed 202402 in 0.64 seconds.
Processing 202403...
Len of 202403... 372085
Len of cleaned 202403... 137538
Processed 202403 in 0.64 seconds.
Processing 202404...
Len of 202404... 373364
Len of cleaned 202404... 138152
Processed 202404 in 0.63 seconds.
Processing 202405...
Len of 202405... 372166
Len of cleaned 202405... 138293
Processed 202405 in 0.66 seconds.
Processing 202406...
Len of 202406... 332026
L

In [1]:
import polars as pl
import os

# Define the folder path
folder_path = "C://Users//jirip//Documents//Developer//python//kriminalita//processed_files"

# List all CSV files in the folder
csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# Read and append all files
appended_df = pl.concat([pl.read_csv(file) for file in csv_files])

# Preview the appended DataFrame
print(appended_df.head())
print(len(appended_df))

shape: (5, 10)
┌──────┬───────────┬───────────┬───────┬───┬───────┬────────┬────────────┬───────────┐
│ id   ┆ Longitude ┆ Latitude  ┆ mp    ┆ … ┆ types ┆ is_one ┆ date_only  ┆ time_only │
│ ---  ┆ ---       ┆ ---       ┆ ---   ┆   ┆ ---   ┆ ---    ┆ ---        ┆ ---       │
│ i64  ┆ f64       ┆ f64       ┆ bool  ┆   ┆ i64   ┆ i64    ┆ str        ┆ str       │
╞══════╪═══════════╪═══════════╪═══════╪═══╪═══════╪════════╪════════════╪═══════════╡
│ 739  ┆ 17.316089 ┆ 49.30221  ┆ false ┆ … ┆ 101   ┆ 1      ┆ 2012-01-19 ┆ 13:14     │
│ 739  ┆ 17.316089 ┆ 49.30221  ┆ false ┆ … ┆ 102   ┆ 2      ┆ 2012-01-19 ┆ 13:14     │
│ 1075 ┆ 14.585108 ┆ 50.226063 ┆ false ┆ … ┆ 120   ┆ 1      ┆ 2012-01-26 ┆ 16:00     │
│ 2822 ┆ 17.251485 ┆ 49.572761 ┆ false ┆ … ┆ 77    ┆ 1      ┆ 2012-01-27 ┆ 10:03     │
│ 3016 ┆ 15.796849 ┆ 49.957659 ┆ false ┆ … ┆ 54    ┆ 1      ┆ 2012-01-12 ┆ 09:20     │
└──────┴───────────┴───────────┴───────┴───┴───────┴────────┴────────────┴───────────┘
17764377


In [None]:
import json
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape, Point

# Load obce polygons from JSON
with open('C://Users//jirip//Documents//Developer//python//kriminalita//source_files//okresy.json') as f:
    regions_data = json.load(f)

# Prepare polygons
regions = []
for feature in regions_data['features']:  # assuming GeoJSON-like structure
    region_id = feature['id']
    polygon = shape(feature['geometry'])  # shapely geometry from geojson
    regions.append((region_id, polygon))
    
#  Load your obce polygons from JSON
with open('C://Users//jirip//Documents//Developer//python//kriminalita//source_files//obce.json') as f:
    cities_data = json.load(f)
# Prepare polygons
cities = []
for feature in cities_data['features']:  # assuming GeoJSON-like structure
    city_id = feature['id']
    polygon = shape(feature['geometry'])  # shapely geometry from geojson
    cities.append((city_id, polygon))

df = appended_df.to_pandas()

# Prepare regions GeoDataFrame
gdf_regions = gpd.GeoDataFrame({'region_id': [r[0] for r in regions]}, geometry=[r[1] for r in regions])

# Prepare points GeoDataFrame
gdf_points = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude))

# Spatial join
gdf_joined = gpd.sjoin(gdf_points, gdf_regions, how='left', predicate='within')

# If needed, bring the result back to pandas
df_result = pd.DataFrame(gdf_joined.drop(columns=['geometry', 'index_right']))
# df_result['mp'] = df_result['mp'].astype(int)
# df_result['region_id'] = df_result['region_id'].str[:-6]

# Prepare cities GeoDataFrame
gdf_cities = gpd.GeoDataFrame({'city_id': [r[0] for r in cities]}, geometry=[r[1] for r in cities])
gdf_points = gpd.GeoDataFrame(df_result, geometry=gpd.points_from_xy(df_result.Longitude, df_result.Latitude))
gdf_joined = gpd.sjoin(gdf_points, gdf_cities,how='left', predicate='within')
df_result_final = pd.DataFrame(gdf_joined.drop(columns=['geometry', 'index_right', 'Longitude', 'Latitude']))
df_result_final['mp'] = df_result_final['mp'].astype(int)
df_result_final['city_id'] = df_result_final['city_id'].fillna(df_result_final['region_id'])
df_result_final = df_result_final.drop(columns=['region_id'])
df_result_final.to_csv('mapped_files//F_Crime.csv', index=False)

In [None]:
print(len(df_result_final))

18433166


In [None]:
# Split the 'date' column into separate 'date' and 'time' columns
appended_df = appended_df.with_columns([
    pl.col("date").str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z").alias("parsed_date"),
    pl.col("date").str.split("T").list.get(0).alias("date_only"),
    pl.col("date").str.split("T").list.get(1).str.split(":00.").list.get(0).alias("time_only"),
])

# Drop the original 'date' column if no longer needed
appended_df = appended_df.drop("date", "parsed_date")

# Preview the updated DataFrame
print(appended_df.head())

# Save the appended DataFrame to a new CSV file
appended_df.write_csv("processed_files//appended_fact_clean.csv")
