Poniższy kod został uruchomiony w Pythonie 3.13.

In [1]:
# Importowanie bibliotek
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

# Funkcja do wczytania danych


def load_csv(file_path, use_dask=False):
    """
    Read a CSV file into a DataFrame.

    With ``use_dask`` set, the file is opened lazily through Dask and every
    column is read as a string; otherwise pandas reads it eagerly and infers
    the column types.

    :param file_path: Path to the CSV file.
    :param use_dask: Read with Dask instead of pandas.
    :return: The loaded DataFrame (pandas or Dask).
    """
    print(f"Wczytywanie danych z {file_path}...")
    frame = (
        dd.read_csv(file_path, dtype=str, assume_missing=True)
        if use_dask
        else pd.read_csv(file_path)
    )
    print("Dane załadowane.")
    return frame

# Funkcja do analizy eksploracyjnej


def exploratory_data_analysis(df, use_dask=False):
    """
    Print summary statistics and draw basic diagnostic plots for a dataset.

    For a Dask DataFrame only the textual summaries are produced (head,
    describe and per-column missing-value counts). For a pandas DataFrame the
    function additionally draws a missing-value heatmap, a correlation matrix
    and histograms of up to six float64/int64 columns.

    :param df: DataFrame to analyse (pandas or Dask).
    :param use_dask: True when ``df`` is a Dask DataFrame.
    :return: None; all results are printed or plotted.
    """
    # Basic information about the data
    print("\n--- Podstawowe informacje ---")
    if use_dask:
        print(df.head())
        # Dask is lazy: compute() is required to materialise the statistics.
        print(df.describe().compute())
    else:
        # info() writes its own report and returns None, so wrapping it in
        # print() would add a stray "None" line to the output.
        df.info()
        print(df.describe())

    # Missing values per column (only columns that have any)
    print("\n--- Brakujące wartości ---")
    missing_values = df.isnull().sum()
    if use_dask:
        missing_values = missing_values.compute()
    print(missing_values[missing_values > 0])

    # Everything below plots from in-memory data, so it is pandas-only.
    if use_dask:
        return

    # Heatmap of the null mask
    plt.figure(figsize=(10, 6))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title("Brakujące wartości")
    plt.show()

    # Correlation analysis; the numeric selection is reused for the histograms.
    print("\n--- Korelacje ---")
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_columns) > 1:
        correlation = df[numeric_columns].corr()
        print(correlation)

        plt.figure(figsize=(12, 8))
        sns.heatmap(correlation, annot=True, fmt=".2f",
                    cmap="coolwarm", square=True)
        plt.title("Macierz korelacji")
        plt.show()
    else:
        print(
            "Brak wystarczającej liczby zmiennych numerycznych do obliczenia korelacji.")

    # Distribution plots for the first six numeric columns (2 x 3 grid).
    histogram_columns = numeric_columns[:6]
    if len(histogram_columns) == 0:
        # Nothing to plot; avoid creating and showing an empty figure.
        return
    plt.figure(figsize=(15, 10))
    for position, col in enumerate(histogram_columns, start=1):
        plt.subplot(2, 3, position)
        sns.histplot(df[col], kde=True, bins=30)
        plt.title(f"Rozkład {col}")
    plt.tight_layout()
    plt.show()


# Path to the input CSV file
file_path = "Crime_Data.csv"

# Load the data (use Dask for large datasets)
use_dask = True
data = load_csv(file_path, use_dask=use_dask)

# Exploratory analysis
exploratory_data_analysis(data, use_dask=use_dask)

# Export the raw data as a single Parquet file. A Dask frame has to be
# materialised with compute() first; a pandas frame has no compute(), so it is
# written as-is. This keeps the cell working for either value of use_dask.
output_path = "Crime_Data.parquet"
frame_to_save = data.compute() if use_dask else data
frame_to_save.to_parquet(output_path, index=False, engine='pyarrow')
print(f"Zapisano dane do {output_path}")

Wczytywanie danych z Crime_Data.csv...
Dane załadowane.

--- Podstawowe informacje ---
       DR_NO               Date Rptd                DATE OCC TIME OCC AREA  \
0  190326475  03/01/2020 12:00:00 AM  03/01/2020 12:00:00 AM     2130   07   
1  200106753  02/09/2020 12:00:00 AM  02/08/2020 12:00:00 AM     1800   01   
2  200320258  11/11/2020 12:00:00 AM  11/04/2020 12:00:00 AM     1700   03   
3  200907217  05/10/2023 12:00:00 AM  03/10/2020 12:00:00 AM     2037   09   
4  200412582  09/09/2020 12:00:00 AM  09/09/2020 12:00:00 AM     0630   04   

    AREA NAME Rpt Dist No Part 1-2 Crm Cd  \
0    Wilshire        0784        1    510   
1     Central        0182        1    330   
2   Southwest        0356        1    480   
3    Van Nuys        0964        1    343   
4  Hollenbeck        0413        1    510   

                                Crm Cd Desc  ... Status   Status Desc  \
0                          VEHICLE - STOLEN  ...     AA  Adult Arrest   
1                     BURGL

In [2]:
# Columns whose share of missing values exceeds this fraction are dropped.
MISSING_FRACTION_LIMIT = 0.5

# Per-column fraction of missing values, obtained in a single pass over the
# data. Calling len(data) on a Dask frame would trigger a separate full scan
# of the CSV just to obtain the row count.
missing_fraction = data.isnull().mean().compute()

# Names of the columns that are missing in more than the allowed fraction of rows
columns_to_drop = missing_fraction[missing_fraction > MISSING_FRACTION_LIMIT].index

# Drop the columns (lazy; `data` itself is left untouched)
crime_data = data.drop(columns=columns_to_drop)

print(f"Dropped columns: {columns_to_drop}")

Dropped columns: Index(['Weapon Used Cd', 'Weapon Desc', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4',
       'Cross Street'],
      dtype='object')


In [3]:
# Preview the first five rows of the frame left after dropping sparse columns.
crime_data.head(n=5)

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Vict Sex,Vict Descent,Premis Cd,Premis Desc,Status,Status Desc,Crm Cd 1,LOCATION,LAT,LON
0,190326475,03/01/2020 12:00:00 AM,03/01/2020 12:00:00 AM,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,M,O,101,STREET,AA,Adult Arrest,510,1900 S LONGWOOD AV,34.0375,-118.3506
1,200106753,02/09/2020 12:00:00 AM,02/08/2020 12:00:00 AM,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,M,O,128,BUS STOP/LAYOVER (ALSO QUERY 124),IC,Invest Cont,330,1000 S FLOWER ST,34.0444,-118.2628
2,200320258,11/11/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,X,X,502,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",IC,Invest Cont,480,1400 W 37TH ST,34.021,-118.3002
3,200907217,05/10/2023 12:00:00 AM,03/10/2020 12:00:00 AM,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,M,O,405,CLOTHING STORE,IC,Invest Cont,343,14000 RIVERSIDE DR,34.1576,-118.4387
4,200412582,09/09/2020 12:00:00 AM,09/09/2020 12:00:00 AM,630,4,Hollenbeck,413,1,510,VEHICLE - STOLEN,...,,,101,STREET,IC,Invest Cont,510,200 E AVENUE 28,34.082,-118.213


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to the [0, 1] range
columns_to_normalize = ['LAT', 'LON', 'Vict Age']

# Cast the selected columns to float so the scaler receives numeric data.
numeric_part = crime_data[columns_to_normalize].astype(float)

# Fit the scaler on the complete columns (not a sample): compute() pulls all
# rows of these three columns into memory as a pandas DataFrame.
scaler = MinMaxScaler()
scaler.fit(numeric_part.compute())


def _scale_partition(part):
    """Apply the fitted scaler to one pandas partition, keeping its index."""
    return pd.DataFrame(
        scaler.transform(part),
        columns=columns_to_normalize,
        # Preserve the partition's own index so the scaled values stay
        # attached to the rows they were computed from.
        index=part.index,
    )


# Normalize the columns partition by partition. meta declares float64 because
# that is what the scaler returns; an empty untyped DataFrame would advertise
# object columns instead.
scaled_part = numeric_part.map_partitions(
    _scale_partition,
    meta={name: 'float64' for name in columns_to_normalize},
)
crime_data[columns_to_normalize] = scaled_part

# Materialise the result. NOTE: crime_data is rebound from a Dask frame to a
# pandas frame here, so this cell cannot be re-run without first re-running
# the cell that builds the Dask crime_data.
crime_data = crime_data.compute()

print(crime_data.head())

       DR_NO               Date Rptd                DATE OCC TIME OCC AREA  \
0  190326475  03/01/2020 12:00:00 AM  03/01/2020 12:00:00 AM     2130   07   
1  200106753  02/09/2020 12:00:00 AM  02/08/2020 12:00:00 AM     1800   01   
2  200320258  11/11/2020 12:00:00 AM  11/04/2020 12:00:00 AM     1700   03   
3  200907217  05/10/2023 12:00:00 AM  03/10/2020 12:00:00 AM     2037   09   
4  200412582  09/09/2020 12:00:00 AM  09/09/2020 12:00:00 AM     0630   04   

    AREA NAME Rpt Dist No Part 1-2 Crm Cd  \
0    Wilshire        0784        1    510   
1     Central        0182        1    330   
2   Southwest        0356        1    480   
3    Van Nuys        0964        1    343   
4  Hollenbeck        0413        1    510   

                                Crm Cd Desc  ... Vict Sex  Vict Descent  \
0                          VEHICLE - STOLEN  ...        M             O   
1                     BURGLARY FROM VEHICLE  ...        M             O   
2                             BIKE 

In [5]:
# Write the cleaned frame to a Parquet file with the pyarrow engine,
# leaving the index out of the file.
cleaned_output_path = "cleaned_crime_data.parquet"
crime_data.to_parquet(
    cleaned_output_path,
    engine='pyarrow',
    index=False,
)
print(f"Zapisano oczyszczone dane do {cleaned_output_path}")

Zapisano oczyszczone dane do cleaned_crime_data.parquet
