### Połączenie

In [135]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.pandas as ps

try:
    # Tworzenie sesji Spark
    spark = SparkSession.builder \
        .appName("PySpark SQL Server Connection") \
        .config("spark.jars", "mssql-jdbc-12.6.1.jre8.jar") \
        .getOrCreate()

    # Parametry połączenia z bazą danych MSSQL
    server_name = "localhost"
    port = "1433"
    database_name = "AFTER_ETL_LA_CRIME"
    url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

    table_name = "Crime"
    username = "sa"
    password = "YourStrongPassword123"

    # Wczytanie danych z bazy danych MSSQL
    df = spark.read \
        .format("jdbc") \
        .option("url", url) \
        .option("dbtable", table_name) \
        .option("user", username) \
        .option("password", password) \
        .option("encrypt", "false") \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()

    print("Dane zostały pomyślnie wczytane z MSSQL.")
    # Wyświetlenie pierwszych kilku wierszy DataFrame
    df.show()

except Exception as e:
    print("Wystąpił błąd podczas łączenia z bazą danych:", str(e))

Dane zostały pomyślnie wczytane z MSSQL.
+------------+------------+------------+----------+---------+--------------------+--------------------+--------+----+----------+-----------+--------+------+--------------------+--------------------+--------+--------+------------+---------+--------------------+--------------+--------------------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+------------------+-------------------+----------+---------------------------+---------------+-------------------+----------------+--------------------+-------------------+
|Unnamed: 0.3|Unnamed: 0.2|Unnamed: 0.1|Unnamed: 0|    DR_NO|           Date Rptd|            DATE OCC|TIME OCC|AREA| AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Stree

### ETL 1 - Przygotowanie dat do późniejszej analizy

In [136]:
df = df.withColumn("value_divided", col("Time OCC") / 100)

# Zamiana kropki na dwukropek w kolumnie "value_divided" i tworzenie nowej kolumny "time_fixed"
df = df.withColumn("Time", round(col("value_divided"), 0))

df = df.withColumn("Time", col("Time").cast("integer"))

# Wyświetlenie wyniku
df.select('Time').show()

+----+
|Time|
+----+
|  22|
|  22|
|  17|
|   6|
|  19|
|   3|
|   5|
|  20|
|   3|
|  15|
|  22|
|   1|
|  19|
|  24|
|  16|
|   0|
|   0|
|   8|
|  21|
|   9|
+----+
only showing top 20 rows



In [137]:
from pyspark.sql.functions import split, col

df = df.withColumn("Date Rptd", split(col("Date Rptd"), " ").getItem(0))
df = df.withColumn("DATE OCC", split(col("DATE OCC"), " ").getItem(0))

In [138]:
df.select('Date Rptd').show()

+----------+
| Date Rptd|
+----------+
|01/08/2020|
|01/04/2020|
|09/10/2020|
|01/23/2020|
|01/30/2020|
|03/27/2020|
|01/01/2020|
|07/23/2020|
|01/01/2020|
|01/02/2020|
|01/02/2020|
|01/03/2020|
|01/04/2020|
|02/17/2020|
|01/04/2020|
|01/04/2020|
|01/01/2020|
|07/23/2020|
|02/07/2020|
|01/06/2020|
+----------+
only showing top 20 rows



In [139]:
df = df.withColumn("Date Rptd", to_date(df["Date Rptd"], "MM/dd/yyyy"))

# Wybierz dzień tygodnia z daty
df= df.withColumn("Day of Week", dayofweek(df["Date Rptd"]))


df = df.withColumn("Day of Week", 
                   when(df["Day of Week"] == lit(1), lit("Sunday"))
                   .when(df["Day of Week"] == lit(2), lit("Monday"))
                   .when(df["Day of Week"] == lit(3), lit("Tuesday"))
                   .when(df["Day of Week"] == lit(4), lit("Wednesday"))
                   .when(df["Day of Week"] == lit(5), lit("Thursday"))
                   .when(df["Day of Week"] == lit(6), lit("Friday"))
                   .when(df["Day of Week"] == lit(7), lit("Saturday"))
                   .otherwise(None))


# Wyświetl DataFrame z nową kolumną 'Day of Week'
df.show()

+------------+------------+------------+----------+---------+----------+----------+--------+----+----------+-----------+--------+------+--------------------+--------------------+--------+--------+------------+---------+--------------------+--------------+--------------------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+------------------+-------------------+----------+---------------------------+---------------+-------------------+----------------+--------------------+-------------------+-------------+----+-----------+
|Unnamed: 0.3|Unnamed: 0.2|Unnamed: 0.1|Unnamed: 0|    DR_NO| Date Rptd|  DATE OCC|TIME OCC|AREA| AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|gender of criminal|country of 

In [140]:
from pyspark.sql.functions import dayofweek, to_date
df_with_day_of_week = df.withColumn("Date Rptd", dayofweek(df["Date Rptd"]))
df_with_day_of_week.show()

+------------+------------+------------+----------+---------+---------+----------+--------+----+----------+-----------+--------+------+--------------------+--------------------+--------+--------+------------+---------+--------------------+--------------+--------------------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+------------------+-------------------+----------+---------------------------+---------------+-------------------+----------------+--------------------+-------------------+-------------+----+-----------+
|Unnamed: 0.3|Unnamed: 0.2|Unnamed: 0.1|Unnamed: 0|    DR_NO|Date Rptd|  DATE OCC|TIME OCC|AREA| AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|gender of criminal|country of cr

In [141]:
df.show(10)

+------------+------------+------------+----------+---------+----------+----------+--------+----+----------+-----------+--------+------+--------------------+--------------------+--------+--------+------------+---------+--------------------+--------------+--------------------+------+-----------+--------+--------+--------+--------+--------------------+------------+-------+---------+------------------+-------------------+----------+---------------------------+---------------+-------------------+----------------+--------------------+-------------------+-------------+----+-----------+
|Unnamed: 0.3|Unnamed: 0.2|Unnamed: 0.1|Unnamed: 0|    DR_NO| Date Rptd|  DATE OCC|TIME OCC|AREA| AREA NAME|Rpt Dist No|Part 1-2|Crm Cd|         Crm Cd Desc|             Mocodes|Vict Age|Vict Sex|Vict Descent|Premis Cd|         Premis Desc|Weapon Used Cd|         Weapon Desc|Status|Status Desc|Crm Cd 1|Crm Cd 2|Crm Cd 3|Crm Cd 4|            LOCATION|Cross Street|    LAT|      LON|gender of criminal|country of 

In [142]:
# Wyświetlenie początkowych danych
print("Przed zastąpieniem:")
df.select('Vict Sex').distinct().show()

Przed zastąpieniem:
+--------+
|Vict Sex|
+--------+
|       F|
|       M|
|       X|
|       H|
|    NULL|
+--------+



In [143]:
count_H = df.filter(df['Vict Sex'].isNotNull()).count()
print(count_H)
count_H = df.filter(df['Vict Sex'] == 'H').count()
print(count_H)

65787
9


In [144]:
outed_data = (df.filter((df['Vict Sex'] == 'H')))
df = df.filter((df['Vict Sex'] != 'H'))

# Aktualizacja wartości w kolumnie 'Vict Sex'
df = df.withColumn("Vict Sex", 
                   when(df["Vict Sex"] == "F", "Female")
                   .when(df["Vict Sex"] == "M", "Male")
                   .when(df["Vict Sex"] == "X", "X-gender")
                   .when(df["Vict Sex"].isNull(), "no data")
                   .otherwise(df["Vict Sex"]))

df.printSchema()

root
 |-- Unnamed: 0.3: long (nullable = true)
 |-- Unnamed: 0.2: long (nullable = true)
 |-- Unnamed: 0.1: long (nullable = true)
 |-- Unnamed: 0: long (nullable = true)
 |-- DR_NO: long (nullable = true)
 |-- Date Rptd: date (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: long (nullable = true)
 |-- AREA: long (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: long (nullable = true)
 |-- Part 1-2: long (nullable = true)
 |-- Crm Cd: long (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: long (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: double (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: double (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: double (nullab

In [145]:
df.select('Vict Age')

DataFrame[Vict Age: bigint]

In [146]:
df.select('Vict Descent').distinct().show()

+------------+
|Vict Descent|
+------------+
|           K|
|           F|
|           B|
|           L|
|           V|
|           U|
|           O|
|           C|
|           J|
|           Z|
|           A|
|           X|
|           W|
|           S|
|           G|
|           I|
|           P|
|           H|
|        NULL|
+------------+



In [147]:
counts_df = df.groupBy("Vict Descent").count()
counts_df.show()

+------------+-----+
|Vict Descent|count|
+------------+-----+
|           K|  377|
|           F|  316|
|        NULL|    2|
|           B|10788|
|           L|    2|
|           V|   74|
|           U|   13|
|           O| 5995|
|           C|  292|
|           J|  119|
|           Z|   41|
|           A| 1662|
|           X| 7198|
|           W|15553|
|           S|    3|
|           G|    9|
|           I|   90|
|           P|   14|
|           H|23230|
+------------+-----+



In [148]:
df_gb = df.groupBy("Vict Descent").count()

df_gb = df_gb.orderBy('count', ascending=False)

start_row = 6

df_gb = df_gb.withColumn("row_index", monotonically_increasing_id())

# Przypisanie elementów od 6. wiersza
others = df_gb.filter(df_gb["row_index"] >= start_row)

top_five = df_gb.filter(df_gb['row_index'] < start_row)

In [149]:
top_five.show()

+------------+-----+---------+
|Vict Descent|count|row_index|
+------------+-----+---------+
|           H|23230|        0|
|           W|15553|        1|
|           B|10788|        2|
|           X| 7198|        3|
|           O| 5995|        4|
|           A| 1662|        5|
+------------+-----+---------+



In [150]:
suma = others.agg(sum("count")).collect()[0][0]
suma

data = [('Other', suma, 6)]

# Tworzenie ramki danych
df_sum = spark.createDataFrame(data, ["Vict Descent", "count", 'row_index'])

df_sum.show()

+------------+-----+---------+
|Vict Descent|count|row_index|
+------------+-----+---------+
|       Other| 1352|        6|
+------------+-----+---------+



In [151]:
combined_df = df_sum.union(top_five)
combined_df.show()

+------------+-----+---------+
|Vict Descent|count|row_index|
+------------+-----+---------+
|       Other| 1352|        6|
|           H|23230|        0|
|           W|15553|        1|
|           B|10788|        2|
|           X| 7198|        3|
|           O| 5995|        4|
|           A| 1662|        5|
+------------+-----+---------+



In [152]:
top_five = top_five.filter(top_five["Vict Descent"] != 'O')
top_five.show()

+------------+-----+---------+
|Vict Descent|count|row_index|
+------------+-----+---------+
|           H|23230|        0|
|           W|15553|        1|
|           B|10788|        2|
|           X| 7198|        3|
|           A| 1662|        5|
+------------+-----+---------+



In [153]:
tfv = top_five.select('Vict Descent')
tfv.show()

+------------+
|Vict Descent|
+------------+
|           H|
|           W|
|           B|
|           X|
|           A|
+------------+



In [154]:
from pyspark.sql.functions import when

# Utworzenie warunku
top_five_values = top_five.select('Vict Descent').distinct().collect()
top_five_values = [row['Vict Descent'] for row in top_five_values]
condition = ~df['Vict Descent'].isin(top_five_values)

# Tworzenie nowej kolumny na podstawie warunku
df = df.withColumn('Vict Descent', when(condition, 'Other').otherwise(df['Vict Descent']))

print(top_five_values)

['H', 'W', 'B', 'X', 'A']


In [155]:
df = df.withColumn("Vict Descent", 
                   when(df["Vict Descent"] == "H", "Hispanic or Latino")
                   .when(df["Vict Descent"] == "W", "White")
                   .when(df["Vict Descent"] == "B", "Black")
                   .when(df["Vict Descent"] == "X", "Unknown")
                   .when(df["Vict Descent"] == "O", "Other")
                   .when(df["Vict Descent"] == "A", "Asian")
                   .otherwise(df["Vict Descent"]))

In [156]:
df.select('Vict Age').distinct().show(200)

+--------+
|Vict Age|
+--------+
|      29|
|      26|
|      65|
|      19|
|      54|
|       0|
|      22|
|      77|
|       7|
|      34|
|      50|
|      94|
|      57|
|      32|
|      43|
|      84|
|      31|
|      39|
|      98|
|      25|
|      95|
|      71|
|      68|
|       6|
|      72|
|      87|
|      58|
|       9|
|      27|
|      63|
|      51|
|      56|
|      52|
|      79|
|      17|
|      41|
|      28|
|      33|
|      88|
|       5|
|      96|
|      -2|
|      10|
|      89|
|      85|
|      67|
|      48|
|      44|
|      61|
|       3|
|      37|
|      83|
|      12|
|      55|
|      74|
|      62|
|       8|
|      49|
|      11|
|      35|
|      80|
|       2|
|      66|
|      76|
|      92|
|       4|
|      13|
|      36|
|      75|
|      78|
|      18|
|      69|
|      14|
|      21|
|      59|
|      15|
|      81|
|      38|
|      82|
|      97|
|      42|
|      30|
|      73|
|      90|
|      23|
|      46|
|      20|
|      70|

In [157]:
from pyspark.ml.feature import Bucketizer

df = df.filter(df['Vict Age'] >= 0)
bucketizer = Bucketizer(splits=[0, 18, 35, 60,120, float('Inf') ],inputCol="Vict Age", outputCol="Vict Age bucket")
df = bucketizer.setHandleInvalid("keep").transform(df)

# Aktualizacja wartości w kolumnie 'Vict Age bucket'
df = df.withColumn("Vict Age bucket", 
                   when(df["Vict Age bucket"] == 0.0, "0-18")
                   .when(df["Vict Age bucket"] == 1.0, "18-35")
                   .when(df["Vict Age bucket"] == 2.0, "35-60")
                   .when(df["Vict Age bucket"] == 3.0, "60-100")
                   .otherwise("Unknown"))

df.select('Vict Age bucket').show()

+---------------+
|Vict Age bucket|
+---------------+
|          35-60|
|          18-35|
|          35-60|
|          35-60|
|           0-18|
|          35-60|
|          18-35|
|          35-60|
|          35-60|
|          35-60|
|          18-35|
|          35-60|
|          18-35|
|          18-35|
|          18-35|
|         60-100|
|          18-35|
|          35-60|
|           0-18|
|          18-35|
+---------------+
only showing top 20 rows



In [158]:
server_name = "localhost"
port = "1433"
database_name = "After_ETL"
url = f"jdbc:sqlserver://{server_name}:{port};databaseName={database_name}"

table_name = "Clean_table1"
username = "sa"
password = "YourStrongPassword123"

df.write \
    .format("jdbc") \
    .option("url", url) \
    .option("dbtable", table_name) \
    .option("user", username) \
    .option("password", password) \
    .option("encrypt", "false") \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .mode("overwrite") \
    .save()

In [159]:
spark.stop()