In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.pandas as ps

try:
    # Tworzenie sesji Spark
    spark = SparkSession.builder \
        .appName("PySpark SQL Server Connection") \
        .config("spark.jars", "C:\\Projekty\\Wielowymiarowa\\Wielowymiarowa_analiza_danych\\sqljdbc_12.6\\enu\\jars\\mssql-jdbc-12.6.1.jre8.jar") \
        .getOrCreate()

    # Parametry połączenia z bazą danych MSSQL
    server_name = "localhost"
    port = "1433"
    database_name = "After_ETL"
    url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

    table_name = "Clean_table1"
    username = "sa"
    password = "YourStrongPassword123"

    # Wczytanie danych z bazy danych MSSQL
    df = spark.read \
        .format("jdbc") \
        .option("url", url) \
        .option("dbtable", table_name) \
        .option("user", username) \
        .option("password", password) \
        .option("encrypt", "false") \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()

    print("Dane zostały pomyślnie wczytane z MSSQL.")
    # Wyświetlenie pierwszych kilku wierszy DataFrame
    df.show()

except Exception as e:
    print("Wystąpił błąd podczas łączenia z bazą danych:", str(e))

ImportError: PyArrow >= 4.0.0 must be installed; however, it was not found.

In [2]:
df.select("Vict Age bucket").distinct().show()
df.select("Vict descent").distinct().show()
df.select("Vict sex").distinct().show()
df.select("Premis Desc").distinct().show()

+---------------+
|Vict Age bucket|
+---------------+
|         60-120|
|          18-35|
|          35-60|
|           0-18|
+---------------+

+------------------+
|      Vict descent|
+------------------+
|Hispanic or Latino|
|           Unknown|
|             Other|
|             White|
|             Black|
|             Asian|
|              NULL|
+------------------+

+--------+
|Vict sex|
+--------+
|X-gender|
|  Female|
|    Male|
+--------+

+--------------------+
|         Premis Desc|
+--------------------+
|            BUS STOP|
|      MASSAGE PARLOR|
|          RIVER BED*|
|MTA - BLUE LINE -...|
|             MOSQUE*|
|               MOTEL|
|MTA - SILVER LINE...|
|OPTICAL OFFICE IN...|
|            SIDEWALK|
|    SYNAGOGUE/TEMPLE|
|APARTMENT/CONDO C...|
|          STAIRWELL*|
|CULTURAL SIGNIFIC...|
|            CAR WASH|
|           THE GROVE|
|MTA - PURPLE LINE...|
|RETIRED (DUPLICAT...|
|        GOLF COURSE*|
|      OTHER BUSINESS|
|        TOBACCO SHOP|
+---------------

In [3]:
df = df.dropna(subset=['Vict descent'])

In [4]:
vict_age_values = df.select('Vict Age bucket').distinct().collect()
vict_age_values = [row['Vict Age bucket'] for row in vict_age_values]

vict_descent_values = df.select('Vict descent').distinct().collect()
vict_descent_values = [row['Vict descent'] for row in vict_descent_values]

vict_sex_values = df.select('Vict sex').distinct().collect()
vict_sex_values = [row['Vict sex'] for row in vict_sex_values]

combined_df = df.groupBy('Vict Age bucket', 'Vict descent', 'Vict sex').agg(count('*').alias('count'))
combined_df.show()

+---------------+------------------+--------+-----+
|Vict Age bucket|      Vict descent|Vict sex|count|
+---------------+------------------+--------+-----+
|           0-18|           Unknown|  Female|   40|
|          35-60|             Black|X-gender|    3|
|          18-35|           Unknown|X-gender|  553|
|          35-60|             Other|  Female| 1187|
|         60-120|             Asian|  Female|  147|
|           0-18|             Black|    Male|  248|
|         60-120|           Unknown|    Male|   18|
|          18-35|             Black|  Female| 2642|
|          18-35|             Black|X-gender|   11|
|          35-60|             Asian|    Male|  360|
|          18-35|             Asian|    Male|  316|
|         60-120|           Unknown|  Female|   10|
|          35-60|             White|X-gender|    6|
|          35-60|Hispanic or Latino|    Male| 5079|
|          35-60|             Other|    Male| 1615|
|         60-120|             Other|    Male|  565|
|          1

In [11]:
import plotly.express as px

# Tworzenie interaktywnej wizualizacji w stylu kostki OLAP
fig = px.sunburst(combined_df, path=['Vict Age bucket', 'Vict descent', 'Vict sex'], values='count')
fig.update_traces(textinfo='label+percent entry')
fig.show(width=1000, height=1200)  # Ustawienie szerokości na 1000 pikseli i wysokości na 1200 pikseli

In [13]:
import plotly.express as px

# Tworzenie interaktywnej wizualizacji typu treemap
fig = px.treemap(combined_df, path=['Vict Age bucket', 'Vict descent', 'Vict sex'], values='count')
fig.show()

In [28]:
import plotly.express as px

# Tworzenie interaktywnej wizualizacji typu treemap z opcją squarify
fig = px.treemap(combined_df, path=['Vict Age bucket', 'Vict descent', 'Vict sex'], values='count', 
                 title='Hierarchiczna wizualizacja przestępstw', 
                 color='count', color_continuous_scale='viridis',
                 hover_data=['count'], 
                 width=2000, height=1600)
fig.show()

In [15]:
import plotly.express as px

fig = px.bar(combined_df, x='Vict Age bucket', y='count', color='Vict descent', 
             barmode='group', title="Liczba przestępstw w różnych grupach wiekowych i pochodzeniach")
fig.show()


In [34]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tworzenie pivot table
pivot_df = combined_df.pivot_table(index='Vict Age bucket', columns=['Vict descent', 'Vict sex'], values='count', fill_value=0)

# Rysowanie heatmapy
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_df, cmap='YlGnBu', annot=True, fmt='d', linewidths=.5)
plt.title('Liczba zdarzeń w zależności od wieku, pochodzenia i płci')
plt.xlabel('Pochodzenie i płeć')
plt.ylabel('Wiek')
plt.show()

AttributeError: 'DataFrame' object has no attribute 'pivot_table'

In [None]:
# Visualizing 5-D mix data using bubble charts
# leveraging the concepts of hue, size and depth
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
t = fig.suptitle('Wine Residual Sugar - Alcohol Content - Acidity - Total Sulfur Dioxide - Type', fontsize=14)

xs = list(wines['residual sugar'])
ys = list(wines['alcohol'])
zs = list(wines['fixed acidity'])
data_points = [(x, y, z) for x, y, z in zip(xs, ys, zs)]

ss = list(wines['total sulfur dioxide'])
colors = ['red' if wt == 'red' else 'yellow' for wt in list(wines['wine_type'])]

for data, color, size in zip(data_points, colors, ss):
    x, y, z = data
    ax.scatter(x, y, z, alpha=0.4, c=color, edgecolors='none', s=size)

ax.set_xlabel('Residual Sugar')
ax.set_ylabel('Alcohol')
ax.set_zlabel('Fixed Acidity')

In [35]:
#kostka

# Import libraries
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
 
 
# Create axis
axes = [5, 5, 5]
 
# Create Data
data = np.ones(axes, dtype=np.bool)
 
# Control Transparency
alpha = 0.9
 
# Control colour
colors = np.empty(axes + [4], dtype=np.float32)
 
colors[0] = [1, 0, 0, alpha]  # red
colors[1] = [0, 1, 0, alpha]  # green
colors[2] = [0, 0, 1, alpha]  # blue
colors[3] = [1, 1, 0, alpha]  # yellow
colors[4] = [1, 1, 1, alpha]  # grey
 
# Plot figure
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
 
# Voxels is used to customizations of
# the sizes, positions and colors.
ax.voxels(data, facecolors=colors, edgecolors='grey')