In [2]:
import getpass
import os

from pyspark.sql import SparkSession, Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.pandas as ps

try:
    # Create (or reuse) the Spark session; the MSSQL JDBC driver jar is
    # registered through the spark.jars setting.
    spark = (
        SparkSession.builder
        .appName("PySpark SQL Server Connection")
        .config("spark.jars", "mssql-jdbc-12.6.1.jre8.jar")
        .getOrCreate()
    )

    # MSSQL connection parameters
    server_name = "mssql-server"
    port = "1433"
    database_name = "ETLKI"
    url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

    table_name = "ETLKI"

    # Credentials are taken from the environment so that no secret is stored
    # in the notebook. MSSQL_USER falls back to "sa"; when MSSQL_PASSWORD is
    # unset (or empty) the password is requested interactively.
    username = os.environ.get("MSSQL_USER", "sa")
    password = os.environ.get("MSSQL_PASSWORD") or getpass.getpass("MSSQL password: ")

    # Load the table from MSSQL over JDBC
    df = (
        spark.read
        .format("jdbc")
        .option("url", url)
        .option("dbtable", table_name)
        .option("user", username)
        .option("password", password)
        .option("encrypt", "false")
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        .load()
    )

    print("Dane zostały pomyślnie wczytane z MSSQL.")
    # Show the first few rows of the DataFrame
    df.show(5)

except Exception as e:
    print("Wystąpił błąd podczas łączenia z bazą danych:", str(e))
    # Re-raise so the cell visibly fails; otherwise later cells would only
    # break with a NameError because `df` was never assigned.
    raise





Dane zostały pomyślnie wczytane z MSSQL.
+----------+---------+----------+----------+--------+----+----------+-----------+--------+------+--------------------+--------------------+--------+--------+------------------+---------+--------------------+--------------+--------------------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+------------------+-------------------+----------+---------------------------+---------------+-------------------+----------------+--------------------+---------------+-------------+----+-----+---+----+-------+-----------+---------------+
|Unnamed: 0|    DR_NO| Date Rptd|  DATE OCC|TIME OCC|AREA| AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|      Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|gender of criminal|c

In [127]:
# Build the multi-dimensional summary (the "OLAP cube"): the number of
# incidents for every combination of the grouping columns below.
# The result is bound to its own name instead of overwriting `df`, so the raw
# DataFrame stays available to later cells and this cell can be re-run
# (re-running the original failed because the grouped columns were gone).
cube_dimensions = [
    "Year",
    "Quarter",
    "Month",
    "Day",
    "AREA NAME",
    "gender of criminal",
    "age of criminal",
]
incident_cube_df = df.groupby(*cube_dimensions).agg(count('*').alias('IncidentCount'))

# Show the first rows of the cube
incident_cube_df.show()


+----+-------+-----+---+----------+------------------+---------------+-------------+
|Year|Quarter|Month|Day| AREA NAME|gender of criminal|age of criminal|IncidentCount|
+----+-------+-----+---+----------+------------------+---------------+-------------+
|2021|      4|   12| 22|Devonshire|              Male|           0-18|            2|
|2022|      4|   10| 19|  Wilshire|              Male|           0-18|            2|
|2022|      4|   12| 02|    Harbor|              Male|          35-60|            2|
|2021|      1|   02| 13|   Topanga|              Male|          18-35|            2|
|2022|      2|   06| 02|  Foothill|            Female|          18-35|            2|
|2020|      3|   07| 03|  Wilshire|              Male|           0-18|            2|
|2020|      3|   08| 08|   Central|              Male|         60-100|            2|
|2022|      4|   12| 24|   Mission|              Male|          18-35|            2|
|2022|      1|   03| 21|   Rampart|              Male|         60

In [3]:
import pandas as pd

# Convert the PySpark DataFrame into a pandas DataFrame.
# toPandas() loads the whole result into the driver's memory, so it is only
# appropriate when the frame is small enough to fit there.
pandas_df = df.toPandas()


In [4]:
# Display the converted pandas DataFrame as this cell's output.
pandas_df

Unnamed: 0.1,Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,...,drug substances,annual earnings,value_divided,Time,Month,Day,Year,Quarter,Day of Week,Vict Age bucket
0,0,10304468,2020-01-08,01/08/2020,2230,3,Southwest,377,2,624,...,Alcohol,118908,22.3,22,01,08,2020,1,Wednesday,35-60
1,1000,200105946,2020-01-29,01/29/2020,720,1,Central,157,2,624,...,Prescription opioids,27044,7.2,7,01,29,2020,1,Wednesday,35-60
2,2000,200205282,2020-01-27,01/27/2020,810,2,Rampart,249,2,956,...,Heroin,127062,8.1,8,01,27,2020,1,Monday,35-60
3,3000,200110265,2020-04-09,03/31/2020,1930,1,Central,163,1,350,...,Cannabis (Marijuana),144944,19.3,19,03,31,2020,1,Thursday,35-60
4,4000,200404736,2020-01-20,01/19/2020,2030,4,Hollenbeck,437,2,627,...,Alcohol,101813,20.3,20,01,19,2020,1,Monday,0-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327,752000,230606360,2023-02-26,02/26/2023,1230,6,Hollywood,648,1,210,...,Benzodiazepines,116958,12.3,12,02,26,2023,1,Sunday,60-100
1328,753000,231210585,2023-04-21,04/21/2023,2150,12,77th Street,1251,1,761,...,Cannabis (Marijuana),53446,21.5,22,04,21,2023,2,Friday,0-18
1329,754000,231911228,2023-07-04,07/03/2023,1200,19,Mission,1994,1,520,...,Cannabis (Marijuana),144292,12.0,12,07,03,2023,3,Tuesday,18-35
1330,756000,230408765,2023-05-25,05/24/2023,630,4,Hollenbeck,416,2,354,...,Cannabis (Marijuana),98311,6.3,6,05,24,2023,2,Thursday,35-60


In [131]:
import plotly.express as px

# Interactive OLAP-cube-style view: a sunburst chart drilling down from year
# to month to offender age group, with segments sized by IncidentCount.
# NOTE(review): `option1` is not defined in any cell shown here - confirm it
# is created (with these four columns) before this cell runs.
fig = px.sunburst(option1, path=["Year",'Month',  'age of criminal'], values='IncidentCount')
fig.update_traces(textinfo='label+percent entry')
# The figure size belongs to the layout. Keyword arguments given to
# fig.show() are handed to the active renderer and are ignored by renderers
# that have no width/height attributes, so set 1800 x 4400 px here instead.
fig.update_layout(width=1800, height=4400)
fig.show()

In [133]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# Build a small sample data set: one row per (year, month, age group) with
# its incident count.
sample_rows = [
    (2020, 1, "0-18", 100),
    (2021, 2, "18-35", 150),
    (2022, 3, "35-60", 200),
]
column_names = ["Year", "Month", "age of criminal", "IncidentCount"]

# Column-oriented dict, in the shape pandas expects for its constructor
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*sample_rows))
}

df_data = pd.DataFrame(data)

# Initialise the Dash application
app = dash.Dash(__name__)

# Page layout: a checklist of years placed above the sunburst chart
available_years = df_data['Year'].unique()
year_checklist = dcc.Checklist(
    id='year-checkboxes',
    options=[{'label': str(year), 'value': year} for year in available_years],
    value=[available_years[0]],  # the first year is ticked by default
)
app.layout = html.Div([
    year_checklist,
    dcc.Graph(id='sunburst-graph'),
])

# Callback that redraws the sunburst chart whenever the checklist changes
@app.callback(
    Output('sunburst-graph', 'figure'),
    [Input('year-checkboxes', 'value')]
)
def update_sunburst_graph(selected_years):
    """Return a sunburst figure restricted to the ticked years.

    Args:
        selected_years: the current ``value`` of the 'year-checkboxes'
            checklist; rows of ``df_data`` whose ``Year`` is in it are kept.

    Returns:
        A Plotly sunburst figure (Year > Month > age of criminal) whose
        segments are sized by ``IncidentCount``.
    """
    year_mask = df_data['Year'].isin(selected_years)
    sunburst = px.sunburst(
        df_data[year_mask],
        path=["Year", 'Month', 'age of criminal'],
        values='IncidentCount',
    )
    sunburst.update_traces(textinfo='label+percent entry')
    return sunburst

# Start the Dash server with debug mode enabled; the guard limits this to the
# case where the code runs as the main module.
# NOTE(review): newer Dash releases document app.run(...) in place of
# run_server(...) - confirm against the installed Dash version.
if __name__ == '__main__':
    app.run_server(debug=True)


Dash app running on http://127.0.0.1:8050/
