In [3]:
import pandas as pd

# Load types
types_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv",
    sep=",",
    encoding="utf-8"
)

# Build direct parent map
parent_map = (
    types_df
    .melt(id_vars='id', value_vars=['parent_id1', 'parent_id2', 'parent_id3'], value_name='parent_id')
    .dropna(subset=['parent_id'])
    .query('parent_id > 0')
    .groupby('id')['parent_id']
    .apply(set)
    .to_dict()
)

# Initialize ancestor map
ancestor_map = {tid: set() for tid in types_df['id']}

# Iteratively propagate ancestors
updated = True
while updated:
    updated = False
    for child, parents in parent_map.items():
        current_ancestors = ancestor_map[child].copy()
        for parent in parents:
            current_ancestors.add(parent)
            current_ancestors.update(ancestor_map[parent])  # inherit ancestors of parent
        if current_ancestors != ancestor_map[child]:
            ancestor_map[child] = current_ancestors
            updated = True  # Keep looping if anything changed

# Now ancestor_map holds: {type_id: set(all ancestors)}


In [5]:
import pandas as pd
from collections import defaultdict

# === 1. Load and Prepare Types Data ===
types_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv",
    sep=",",
    encoding="utf-8"
)

type_lookup = types_df.set_index('id')[['name', 'label']].to_dict('index')

def expand_hierarchy(row):
    ids = [row['parent_id1'], row['parent_id2'], row['parent_id3'], row['id']]
    labels = [type_lookup.get(i, {'label': None})['label'] if i > 0 else None for i in ids]
    names = [type_lookup.get(i, {'name': None})['name'] if i > 0 else None for i in ids]

    result = {}
    for level, (tid, label, name) in enumerate(zip(ids, labels, names), 1):
        result[f'level_{level}_id'] = tid if tid > 0 else None
        result[f'level_{level}_label'] = label
        result[f'level_{level}_name'] = name
    return pd.Series(result)

dimension_df = types_df.apply(expand_hierarchy, axis=1)
dimension_df = pd.concat([types_df[['id']], dimension_df, types_df[['name', 'label']]], axis=1)

# Save dimension table
dimension_df.to_csv('dim_type.csv', index=False)

# === 2. Build Type Ancestor Map ===
# Build direct parent map
parent_map = (
    types_df
    .melt(id_vars='id', value_vars=['parent_id1', 'parent_id2', 'parent_id3'], value_name='parent_id')
    .dropna(subset=['parent_id'])
    .query('parent_id > 0')
    .groupby('id')['parent_id']
    .apply(set)
    .to_dict()
)

# Initialize ancestor map
ancestor_map = {tid: set() for tid in types_df['id']}

# Iteratively propagate ancestors
updated = True
while updated:
    updated = False
    for child, parents in parent_map.items():
        current_ancestors = ancestor_map[child].copy()
        for parent in parents:
            current_ancestors.add(parent)
            current_ancestors.update(ancestor_map[parent])  # inherit ancestors of parent
        if current_ancestors != ancestor_map[child]:
            ancestor_map[child] = current_ancestors
            updated = True  # Keep looping if anything changed

# Now ancestor_map holds: {type_id: set(all ancestors)}


type_ancestors = ancestor_map

# === 3. Load Fact Data ===
fact_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//202504.csv",
    sep=",",
    encoding="utf-8"
)

# === 4. Filter Most Granular Types ===
def filter_most_granular(group):
    type_ids = set(group['types'])

    # Remove non-granular types
    to_remove = set()
    for t in type_ids:
        ancestors = type_ancestors.get(t, set())
        to_remove.update(type_ids & ancestors)  # Only care if ancestors exist in current types

    granular_types = type_ids - to_remove

    # Take common attributes once
    row = group.iloc[0]
    common_data = {
        'id': row['id'],
        'Longitude': row['x'],
        'Latitude': row['y'],
        'mp': row['mp'],
        'date': row['date'],
        'state': row['state'],
        'relevance': row['relevance'],
    }

    # Expand into rows
    records = [
        {**common_data, 'types': t}
        for t in granular_types
    ]

    return pd.DataFrame(records)


# === 5. Process and Save ===
filtered_fact_df = (
    fact_df
    .groupby('id', group_keys=False)
    .apply(filter_most_granular)
    .reset_index(drop=True)
    .sort_values(by=['id', 'types'])
    .assign(is_one=lambda df: df.groupby('id').cumcount() + 1)
    .reset_index(drop=True)
)

filtered_fact_df.to_csv('fact_clean.csv', index=False)

# Preview the cleaned fact table
print(filtered_fact_df.head())


  .apply(filter_most_granular)


         id  Longitude   Latitude     mp                            date  \
0  26505406  14.414338  50.087987  False  2025-04-04T13:00:00.0000+02:00   
1  26767015  17.693409  49.080685  False  2025-04-11T20:14:00.0000+02:00   
2  26863079  14.195408  50.743945  False  2025-04-01T12:32:00.0000+02:00   
3  26863079  14.195408  50.743945  False  2025-04-01T12:32:00.0000+02:00   
4  26896576  14.422167  50.066555  False  2025-04-04T00:47:00.0000+02:00   

   state  relevance  types  is_one  
0      2          4    102       1  
1      1          4    111       1  
2      2          4     69       1  
3      2          4    127       2  
4      2          4      5       1  


In [None]:
import pandas as pd
from collections import defaultdict

# === 1. Load and Prepare Types Data ===
types_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv",
    sep=",",
    encoding="utf-8"
)

type_lookup = types_df.set_index('id')[['name', 'label']].to_dict('index')

def expand_hierarchy(row):
    ids = [row['parent_id1'], row['parent_id2'], row['parent_id3'], row['id']]
    labels = [type_lookup.get(i, {'label': None})['label'] if i > 0 else None for i in ids]
    names = [type_lookup.get(i, {'name': None})['name'] if i > 0 else None for i in ids]

    result = {}
    for level, (tid, label, name) in enumerate(zip(ids, labels, names), 1):
        result[f'level_{level}_id'] = tid if tid > 0 else None
        result[f'level_{level}_label'] = label
        result[f'level_{level}_name'] = name
    return pd.Series(result)

dimension_df = types_df.apply(expand_hierarchy, axis=1)
dimension_df = pd.concat([types_df[['id']], dimension_df, types_df[['name', 'label']]], axis=1)

# Save dimension table
dimension_df.to_csv('dim_type_new.csv', index=False)

# === 2. Build Type Ancestor Map ===
def build_type_ancestor_map(df: pd.DataFrame) -> dict[int, set[int]]:
    def collect_ancestors(type_id: int) -> set[int]:
        if not type_id or pd.isna(type_id):
            return set()

        row = df.loc[df['id'] == type_id]
        if row.empty:
            return set()

        direct_parents = {pid for pid in row[['parent_id1', 'parent_id2', 'parent_id3']].values.flatten() if pid > 0}

        ancestors = set(direct_parents)
        for parent in direct_parents:
            ancestors |= collect_ancestors(parent)

        return ancestors

    return {tid: collect_ancestors(tid) for tid in df['id']}

type_ancestors = build_type_ancestor_map(types_df)

# === 3. Load Fact Data ===
fact_df = pd.read_csv(
    "C://Users//jirip//Documents//Developer//python//kriminalita//202504.csv",
    sep=",",
    encoding="utf-8"
)

# === 4. Filter Most Granular Types ===
def filter_most_granular(group: pd.DataFrame) -> pd.DataFrame:
    type_ids = set(group['types'])
    to_remove = {
        t1 for t1 in type_ids
        for t2 in type_ids
        if t1 != t2 and t1 in type_ancestors.get(t2, set())
    }
    granular_types = type_ids - to_remove

    return pd.DataFrame({
        'id': [group['id'].iloc[0]] * len(granular_types),
        'types': list(granular_types),
        'Longitude': group['x'].iloc[0],
        'Latitude': group['y'].iloc[0],
        'mp': group['mp'].iloc[0],
        'date': group['date'].iloc[0],
        'state': group['state'].iloc[0],
        'relevance': group['relevance'].iloc[0]
    })

# === 5. Process and Save ===
filtered_fact_df = (
    fact_df
    .groupby('id', group_keys=False)
    .apply(filter_most_granular)
    .reset_index(drop=True)
    .sort_values(by=['id', 'types'])
    .assign(is_one=lambda df: df.groupby('id').cumcount() + 1)
    .reset_index(drop=True)
)

filtered_fact_df.to_csv('fact_clean_new.csv', index=False)

# Preview the cleaned fact table
print(filtered_fact_df.head())


  .apply(filter_most_granular)


         id  types  Longitude   Latitude     mp  \
0  26505406    102  14.414338  50.087987  False   
1  26767015    111  17.693409  49.080685  False   
2  26863079     69  14.195408  50.743945  False   
3  26863079    127  14.195408  50.743945  False   
4  26896576      5  14.422167  50.066555  False   

                             date  state  relevance  is_one  
0  2025-04-04T13:00:00.0000+02:00      2          4       1  
1  2025-04-11T20:14:00.0000+02:00      1          4       1  
2  2025-04-01T12:32:00.0000+02:00      2          4       1  
3  2025-04-01T12:32:00.0000+02:00      2          4       2  
4  2025-04-04T00:47:00.0000+02:00      2          4       1  


In [None]:
import pandas as pd
import time
from collections import defaultdict

def load_types(types_path):
    types_df = pd.read_csv(types_path, sep=",", encoding="utf-8")
    return types_df

def build_parent_map(types_df):
    parent_map = defaultdict(set)
    for _, row in types_df.iterrows():
        tid = row['id']
        for pid in (row['parent_id1'], row['parent_id2'], row['parent_id3']):
            if pid > 0:
                parent_map[tid].add(pid)
    return parent_map

def build_ancestor_map(parent_map):
    ancestor_map = defaultdict(set)

    changed = True
    while changed:
        changed = False
        for child, parents in parent_map.items():
            current_ancestors = ancestor_map[child]
            new_ancestors = set()
            for parent in parents:
                new_ancestors.add(parent)
                new_ancestors.update(ancestor_map[parent])
            if not new_ancestors.issubset(current_ancestors):
                ancestor_map[child].update(new_ancestors)
                changed = True
    return dict(ancestor_map)

def load_fact_table(fact_path):
    fact_df = pd.read_csv(fact_path, sep=",", encoding="utf-8")
    return fact_df

def filter_most_granular(group, type_ancestors):
    type_ids = set(group['types'])

    # Remove types that are ancestors of others
    to_remove = set()
    for t in type_ids:
        ancestors = type_ancestors.get(t, set())
        to_remove.update(type_ids & ancestors)

    granular_types = type_ids - to_remove

    # Fixed attributes
    row = group.iloc[0]
    common_data = {
        'id': row['id'],
        'Longitude': row['x'],
        'Latitude': row['y'],
        'mp': row['mp'],
        'date': row['date'],
        'state': row['state'],
        'relevance': row['relevance'],
    }

    # Expand
    records = [
        {**common_data, 'types': t}
        for t in granular_types
    ]
    return pd.DataFrame(records)

def clean_fact_table(fact_df, type_ancestors):
    # Group and apply filtering
    filtered_df = fact_df.groupby('id', group_keys=False).apply(
        lambda group: filter_most_granular(group, type_ancestors)
    ).reset_index(drop=True)

    # Assign 'is_one'
    filtered_df['is_one'] = (
        filtered_df.sort_values(by=['id', 'types'])
        .groupby('id')
        .cumcount() + 1
    )

    return filtered_df.sort_values(by=['id', 'types']).reset_index(drop=True)

def save_fact_table(filtered_df, output_path):
    filtered_df.to_csv(output_path, index=False)

def main():
    start_time = time.time()

    # Paths
    types_path = "C://Users//jirip//Documents//Developer//python//kriminalita//types.csv"
    
    yearMonth = ["202504","202503","202502","202501"]
    for year in yearMonth:
        loop_start_time = time.time()
        fact_path = f"C://Users//jirip//Documents//Developer//python//kriminalita//map_files//{year}.csv"
        output_path = f"fact_clean_{year}.csv"
        print(f"Processing {year}...")
        # Load
        types_df = load_types(types_path)
        fact_df = load_fact_table(fact_path)

        # Build maps
        parent_map = build_parent_map(types_df)
        type_ancestors = build_ancestor_map(parent_map)

        # Clean fact table
        cleaned_fact_df = clean_fact_table(fact_df, type_ancestors)

        # Save
        save_fact_table(cleaned_fact_df, output_path)
        
        elapsed = time.time() - loop_start_time
        print(f"Processed {year} in {elapsed:.2f} seconds.")    


    
    # Print elapsed time

    elapsed = time.time() - start_time
    print(f"Completed in {elapsed:.2f} seconds.")

if __name__ == "__main__":
    main()


Processing 202504...


  filtered_df = fact_df.groupby('id', group_keys=False).apply(


Processed 202504 in 63.49 seconds.
Processing 202503...


  filtered_df = fact_df.groupby('id', group_keys=False).apply(


Processed 202503 in 152.04 seconds.
Processing 202502...


  filtered_df = fact_df.groupby('id', group_keys=False).apply(


Processed 202502 in 241.82 seconds.
Processing 202501...


  filtered_df = fact_df.groupby('id', group_keys=False).apply(


Processed 202501 in 333.81 seconds.
Completed in 333.81 seconds.


In [4]:
# parallelize

import pandas as pd
from collections import defaultdict
import multiprocessing as mp

# === Step 1: Load Type Hierarchy and Build Ancestor Map Iteratively ===
types_df = pd.read_csv("C://Users//jirip//Documents//Developer//python//kriminalita//types.csv", sep=",", encoding="utf-8")

def build_ancestor_map(types_df):
    parent_map = defaultdict(set)
    for _, row in types_df.iterrows():
        tid = row['id']
        for pid in (row['parent_id1'], row['parent_id2'], row['parent_id3']):
            if pid > 0:
                parent_map[tid].add(pid)

    ancestor_map = defaultdict(set)
    changed = True
    while changed:
        changed = False
        for child, parents in parent_map.items():
            current = ancestor_map[child]
            new_ancestors = set()
            for p in parents:
                new_ancestors.add(p)
                new_ancestors.update(ancestor_map[p])
            if not new_ancestors.issubset(current):
                ancestor_map[child].update(new_ancestors)
                changed = True
    return dict(ancestor_map)

ancestor_map = build_ancestor_map(types_df)

# === Step 2: Load and Prepare the Fact Table ===
fact_df = pd.read_csv("C://Users//jirip//Documents//Developer//python//kriminalita//map_files//202504.csv", dtype={'id': int, 'types': int})
fact_df.rename(columns={'x': 'Longitude', 'y': 'Latitude'}, inplace=True)

# === Step 3: Group Type IDs Per 'id' ===
grouped = fact_df.groupby('id')
meta_cols = ['Longitude', 'Latitude', 'mp', 'date', 'state', 'relevance']

# Extract metadata per id
meta_df = grouped[meta_cols].first().reset_index()

# Extract types list per id
types_grouped = grouped['types'].apply(list).reset_index(name='types')

# Merge
merged_df = pd.merge(meta_df, types_grouped, on='id')


# === Step 4: Filter to Most Granular Types ===
def filter_granular(row):
    types = set(row['types'])
    to_remove = set()
    for t1 in types:
        for t2 in types:
            if t1 != t2 and t1 in ancestor_map.get(t2, set()):
                to_remove.add(t1)
    granular = types - to_remove
    return pd.DataFrame({
        'id': [row['id']] * len(granular),
        'types': list(granular),
        **{col: [row[col]] * len(granular) for col in meta_cols}
    })


# === Step 5: Parallel Processing (Optional but Recommended for Large Files) ===
def parallel_filter(df, num_processes=mp.cpu_count()):
    with mp.Pool(processes=num_processes) as pool:
        parts = np.array_split(df, num_processes)
        results = pool.map(lambda part: pd.concat([filter_granular(r) for _, r in part.iterrows()]), parts)
    return pd.concat(results)

import numpy as np
filtered_df = parallel_filter(merged_df)

# === Step 6: Mark Single/Multiple Types ===
filtered_df['is_one'] = (
    filtered_df.sort_values(['id', 'types'])
    .groupby('id').cumcount() + 1
)

# === Step 7: Save Cleaned Data ===
filtered_df.to_csv("fact_clean.csv", index=False)


  return bound(*args, **kwds)


AttributeError: Can't pickle local object 'parallel_filter.<locals>.<lambda>'