In [4]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.pandas as ps

import os
from getpass import getpass

try:
    # Create (or reuse) the Spark session; the MSSQL JDBC driver jar is
    # registered through spark.jars so the "jdbc" reader can find the driver.
    spark = (
        SparkSession.builder
        .appName("PySpark SQL Server Connection")
        .config("spark.jars", "mssql-jdbc-12.6.1.jre8.jar")
        .getOrCreate()
    )

    # MSSQL connection parameters
    server_name = "mssql-server"
    port = "1433"
    database_name = "ETLKI"
    url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

    table_name = "ETLKI"

    # Credentials are taken from the environment instead of being committed
    # with the notebook. If MSSQL_PASSWORD is not set, prompt for it so the
    # secret never ends up in the saved file.
    username = os.environ.get("MSSQL_USER", "sa")
    password = os.environ.get("MSSQL_PASSWORD") or getpass("MSSQL password: ")

    # Load the table from MSSQL through the JDBC data source
    df = (
        spark.read
        .format("jdbc")
        .option("url", url)
        .option("dbtable", table_name)
        .option("user", username)
        .option("password", password)
        .option("encrypt", "false")
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        .load()
    )

    print("Dane zostały pomyślnie wczytane z MSSQL.")
    # Show the first few rows of the DataFrame
    df.show(5)

except Exception as e:
    print("Wystąpił błąd podczas łączenia z bazą danych:", str(e))
    # Re-raise: without `df` every later cell would fail with an unrelated
    # NameError, hiding the real cause of the problem.
    raise
    



Dane zostały pomyślnie wczytane z MSSQL.
+----------+---------+----------+----------+--------+----+----------+-----------+--------+------+--------------------+--------------------+--------+--------+------------------+---------+--------------------+--------------+--------------------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+------------------+-------------------+----------+---------------------------+---------------+-------------------+----------------+--------------------+---------------+-------------+----+-----+---+----+-------+-----------+---------------+
|Unnamed: 0|    DR_NO| Date Rptd|  DATE OCC|TIME OCC|AREA| AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|      Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|gender of criminal|c

In [5]:
# Build the multidimensional summary (OLAP-style cube): one row per
# combination of the dimensions below, with the number of matching rows.
cube_dimensions = [
    "Year",
    "Quarter",
    "Month",
    "Day",
    "AREA NAME",
    "gender of criminal",
    "age of criminal",
]
incident_count = count("*").alias("IncidentCount")
df = df.groupby(*cube_dimensions).agg(incident_count)

# Preview the first rows of the cube
df.show()


+----+-------+-----+---+----------+------------------+---------------+-------------+
|Year|Quarter|Month|Day| AREA NAME|gender of criminal|age of criminal|IncidentCount|
+----+-------+-----+---+----------+------------------+---------------+-------------+
|2021|      4|   12| 22|Devonshire|              Male|           0-18|            2|
|2022|      4|   10| 19|  Wilshire|              Male|           0-18|            2|
|2022|      4|   12| 02|    Harbor|              Male|          35-60|            2|
|2021|      1|   02| 13|   Topanga|              Male|          18-35|            2|
|2022|      2|   06| 02|  Foothill|            Female|          18-35|            2|
|2020|      3|   07| 03|  Wilshire|              Male|           0-18|            2|
|2020|      3|   08| 08|   Central|              Male|         60-100|            2|
|2022|      4|   12| 24|   Mission|              Male|          18-35|            2|
|2022|      1|   03| 21|   Rampart|              Male|         60

In [6]:
import pandas as pd

# Convert the PySpark DataFrame `df` into a pandas DataFrame
pandas_df = df.toPandas()


In [7]:
# Display the converted pandas DataFrame as this cell's output
pandas_df

Unnamed: 0,Year,Quarter,Month,Day,AREA NAME,gender of criminal,age of criminal,IncidentCount
0,2021,4,12,22,Devonshire,Male,0-18,2
1,2022,4,10,19,Wilshire,Male,0-18,2
2,2022,4,12,02,Harbor,Male,35-60,2
3,2021,1,02,13,Topanga,Male,18-35,2
4,2022,2,06,02,Foothill,Female,18-35,2
...,...,...,...,...,...,...,...,...
661,2021,1,03,31,77th Street,Female,35-60,2
662,2021,1,03,06,Pacific,Male,0-18,2
663,2022,4,10,24,Olympic,Male,35-60,2
664,2022,4,11,18,Van Nuys,Female,18-35,2


In [8]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# The dashboard reads from the pandas DataFrame
df_data = pandas_df

# Create the Dash application
app = dash.Dash(__name__)

# Layout: one checkbox per distinct year, followed by the sunburst chart
available_years = df_data['Year'].unique()
year_options = [
    {'label': str(year), 'value': year}
    for year in available_years
]
year_selector = dcc.Checklist(
    id='year-checkboxes',
    options=year_options,
    value=[available_years[0]],  # the first year is ticked by default
)
app.layout = html.Div([year_selector, dcc.Graph(id='sunburst-graph')])

# Callback that redraws the sunburst chart when the year selection changes
@app.callback(
    Output('sunburst-graph', 'figure'),
    [Input('year-checkboxes', 'value')]
)
def update_sunburst_graph(selected_years):
    """Build the sunburst figure for the selected years.

    Args:
        selected_years: Current value of the 'year-checkboxes' checklist;
            rows of ``df_data`` whose 'Year' is in it are kept.

    Returns:
        A Plotly Express sunburst figure with the hierarchy
        Year -> Month -> age of criminal, sized by 'IncidentCount'.
    """
    year_mask = df_data['Year'].isin(selected_years)
    sunburst = px.sunburst(
        df_data[year_mask],
        path=["Year", 'Month', 'age of criminal'],
        values='IncidentCount',
    )
    sunburst.update_traces(textinfo='label+percent entry')
    return sunburst

# Start the application embedded in the notebook output (inline mode).
# NOTE(review): no port is passed here; the recorded output of this cell
# reports port 8050 as already in use, so make sure no earlier server still
# holds it (or pass an explicit free port) before re-running.
# NOTE(review): `mode=` looks like the jupyter-dash keyword; newer Dash
# releases may expect `jupyter_mode=` on `app.run` — confirm against the
# installed Dash version.
app.run_server(mode='inline')



Address already in use
Port 8050 is in use by another program. Either identify and stop that program, or start the server with a different port.


AttributeError: 'tuple' object has no attribute 'tb_frame'

In [None]:
import plotly.express as px

# Build an interactive OLAP-cube-style visualisation
# NOTE(review): `option1` is not defined in any visible cell — confirm where
# it comes from (possibly a leftover from a deleted cell).
fig = px.sunburst(option1, path=["Year",'Month',  'age of criminal'], values='IncidentCount')
fig.update_traces(textinfo='label+percent entry')
fig.show(width=1800, height=4400)  # Render at 1800 px wide and 4400 px high

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Build three interactive OLAP-cube-style sunburst charts.
# NOTE(review): `option1` is not defined in any visible cell — confirm where
# it comes from before running this cell on a fresh kernel.
def _build_sunburst(data):
    """Return a Year -> Month -> age-of-criminal sunburst sized by IncidentCount."""
    sunburst = px.sunburst(
        data,
        path=["Year", 'Month', 'age of criminal'],
        values='IncidentCount',
    )
    sunburst.update_traces(textinfo='label+percent entry')
    return sunburst


fig1 = _build_sunburst(option1)
fig2 = _build_sunburst(option1)
fig3 = _build_sunburst(option1)

# Combine the charts side by side in a single figure.
# Sunburst traces are "domain" traces (they have no x/y axes), so every grid
# cell has to be declared with type "domain"; the default "xy" cells reject
# them when a trace is added with row/col.
combined_fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "domain"}, {"type": "domain"}, {"type": "domain"}]],
    subplot_titles=("Wykres 1", "Wykres 2", "Wykres 3"),
)
for col_index, source_fig in enumerate((fig1, fig2, fig3), start=1):
    combined_fig.add_trace(source_fig.data[0], row=1, col=col_index)

# No axis titles are copied over: a sunburst figure has no cartesian axes,
# so there is nothing meaningful to transfer to the subplots.

# Write the combined figure to an HTML file and open it
combined_fig.write_html("multiple_charts.html", auto_open=True)


In [15]:
from jupyter_dash import JupyterDash
# Notebook-embedded Dash app: a heading, a timer that fires every second,
# and the graph that the timer refreshes.
app = JupyterDash(__name__)

refresh_timer = dcc.Interval(
    id='interval-component',
    interval=1*1000,  # in milliseconds
    n_intervals=0
)
app.layout = html.Div([
    html.H1("Random datastream"),
    refresh_timer,
    dcc.Graph(id='graph'),
])

# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input('interval-component', "n_intervals")]
)
def stream_fig(value):
    """Append one random row to the global ``df`` and plot its cumulative sum.

    Args:
        value: Current ``n_intervals`` of the interval component. It only
            triggers the callback and is not otherwise used.

    Returns:
        The result of ``DataFrame.plot(template='plotly_dark')`` called on the
        column-wise cumulative sum of the updated ``df``.

    Side effects:
        Rebinds the module-level ``df`` to a frame that is one row longer.
    """
    global df

    new_row = pd.DataFrame(np.random.randn(1, len(cols)), columns=cols)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    df = pd.concat([df, new_row], ignore_index=True)

    # cumsum() returns a new frame, so `df` itself keeps the raw increments.
    fig = df.cumsum().plot(template='plotly_dark')

    return fig
# Serve the app inline in the notebook on port 8090.
# NOTE(review): the recorded output of this cell is a TypeError; its cause is
# not visible here — confirm the jupyter-dash / Dash versions in use.
app.run_server(mode='inline',port=8090)

TypeError: 'NoneType' object cannot be interpreted as an integer

In [13]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html
from dash.dependencies import Input, Output

# Plot setup: make pandas' .plot() go through the plotly backend
pd.options.plotting.backend = "plotly"

# Sample wide-format DataFrame: 50 rows of seeded random values, one column
# per letter, with the first row zeroed out.
np.random.seed(4)
cols = list('abc')
X = np.random.randn(50, len(cols))
df = pd.DataFrame(X, columns=cols)
df.iloc[0] = 0

# Initial plotly figure of the sample data
fig = df.plot(template='plotly_dark')

# Dash app: a heading, a timer that fires every second, and the graph
app = Dash(__name__)
refresh_timer = dcc.Interval(
    id='interval-component',
    interval=1*1000,  # in milliseconds
    n_intervals=0
)
app.layout = html.Div([
    html.H1("Random datastream"),
    refresh_timer,
    dcc.Graph(id='graph'),
])

# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input('interval-component', "n_intervals")]
)
def stream_fig(value):
    """Append one random row to the global ``df`` and plot its cumulative sum.

    Args:
        value: Current ``n_intervals`` of the interval component. It only
            triggers the callback and is not otherwise used.

    Returns:
        The result of ``DataFrame.plot(template='plotly_dark')`` called on the
        column-wise cumulative sum of the updated ``df``.

    Side effects:
        Rebinds the module-level ``df`` to a frame that is one row longer.
    """
    global df

    new_row = pd.DataFrame(np.random.randn(1, len(cols)), columns=cols)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    df = pd.concat([df, new_row], ignore_index=True)

    # cumsum() returns a new frame, so `df` itself keeps the raw increments.
    fig = df.cumsum().plot(template='plotly_dark')

    return fig

# Start the server on port 8501 with the dev-tools UI and hot reload enabled.
# NOTE(review): `mode=` looks like the jupyter-dash keyword, while `app` here
# is a plain `Dash` instance; newer Dash releases may expect `jupyter_mode=`
# on `app.run` — confirm against the installed Dash version.
app.run_server(mode='jupyterlab', port=8501, dev_tools_ui=True, dev_tools_hot_reload=True, threaded=True)
