In [1]:
import os
spark_version = 'spark-3.5.1'
os.environ['SPARK_VERSION']=spark_version
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [1 InRelease 0 B/3,626 0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [Connected to ppa.launc                                                                                                    Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [53.5 kB]
Get:9 http://archive.ubuntu

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start one named "crimes".
spark = (
    SparkSession.builder
    .appName("crimes")
    .getOrCreate()
)

In [3]:
from pyspark import SparkFiles

# Register the remote CSV with Spark so it is downloaded and can be resolved
# locally through SparkFiles.get().
url ="https://raw.githubusercontent.com/RchlEMllr/Project_4/branchel/Resources/crime_numbers.csv"
spark.sparkContext.addFile(url)

# Read the file with a header row and inferred column types.
# In Spark datetime patterns "MM" is month-of-year, while lowercase "mm" is
# minute-of-hour, so a month/day/year format must be written "MM/dd/yy".
murders = spark.read.csv(
    SparkFiles.get("crime_numbers.csv"),
    header=True,
    inferSchema=True,
    sep=',',
    timestampFormat="MM/dd/yy",
)

murders.show()

+------------+----------------------+------------+----------+-----------+------------+-------+
|offense_code|offense_code_extension|     geo_lon|   geo_lat|precinct_id|victim_count|murders|
+------------+----------------------+------------+----------+-----------+------------+-------+
|        2999|                     0|-104.9989101|39.7339566|      123.0|         1.0|      0|
|        2999|                     0|-104.9933421| 39.746248|      611.0|         1.0|      0|
|        2999|                     0|-105.0255203|39.7828883|      111.0|         1.0|      0|
|        2999|                     0|  -105.02533|39.7153571|      411.0|         1.0|      0|
|        2999|                     0|-104.8450739|39.7830825|      521.0|         1.0|      0|
|        2999|                     0| -105.012173|39.7594773|      113.0|         1.0|      0|
|        2999|                     0|-104.9807217|39.7365214|      623.0|         1.0|      0|
|        2999|                     0|-104.9527637|

In [4]:
import pandas as pd
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [22]:
# Convert the Spark DataFrame into a pandas DataFrame for the pandas /
# scikit-learn steps in the following cells.
murder_df = murders.toPandas()

In [23]:
# Remove both offense-code columns: rather than predicting the murders column
# they would effectively duplicate it.
trimmed_df = murder_df.drop(['offense_code', 'offense_code_extension'], axis='columns')
trimmed_df

Unnamed: 0,geo_lon,geo_lat,precinct_id,victim_count,murders
0,-104.998910,39.733957,123.0,1.0,0
1,-104.993342,39.746248,611.0,1.0,0
2,-105.025520,39.782888,111.0,1.0,0
3,-105.025330,39.715357,411.0,1.0,0
4,-104.845074,39.783082,521.0,1.0,0
...,...,...,...,...,...
370661,-104.897261,39.739668,223.0,1.0,0
370662,-104.861343,39.795869,511.0,1.0,0
370663,-104.883680,39.706214,321.0,1.0,0
370664,-105.052966,39.735110,122.0,1.0,0


In [31]:
!pip install -q -U keras-tuner
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=10,
    hyperband_iterations=2)

Reloading Tuner from ./untitled_project/tuner0.json


In [25]:
def create_model(hp):
    """Build and compile a Sequential network whose shape is chosen by the tuner.

    Args:
        hp: Hyperparameter container supplied by the tuner; ``hp.Choice`` and
            ``hp.Int`` are called on it to pick the settings listed below.

    Searched settings:
        activation:  'relu', 'tanh' or 'sigmoid', shared by every non-output layer.
        first_units: width of the first layer (min 1, max 20, step 2).
        num_layers:  number of additional hidden layers (1 to 4).
        units_<i>:   width of additional hidden layer ``i`` (min 1, max 10, step 2).

    Returns:
        The compiled model, ending in a single sigmoid unit and compiled with
        binary cross-entropy loss, the 'adam' optimizer and an accuracy metric.
    """
    network = tf.keras.models.Sequential()

    # One activation function is shared by all hidden layers.
    hidden_activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # First layer: tuner-chosen width; it also declares the 4 input features.
    first_width = hp.Int('first_units', min_value=1, max_value=20, step=2)
    network.add(
        tf.keras.layers.Dense(units=first_width, activation=hidden_activation, input_dim=4)
    )

    # Additional hidden layers: tuner-chosen count, each with its own width.
    depth = hp.Int('num_layers', 1, 4)
    for layer_idx in range(depth):
        width = hp.Int('units_' + str(layer_idx), min_value=1, max_value=10, step=2)
        network.add(tf.keras.layers.Dense(units=width, activation=hidden_activation))

    # Single sigmoid output unit.
    network.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    network.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    return network

In [26]:
# Target: the "murders" column. Features: every other remaining column.
y = trimmed_df["murders"].values
X = trimmed_df.drop(columns=["murders"]).values

# Hold out a test split with scikit-learn (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the standardizer on the training rows only, then apply that same
# transformation to both splits.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
# Run the hyperparameter search on the scaled training data.
# NOTE(review): the test split is passed as validation data here, so any later
# score on that same split has already been seen by the search. Consider a
# separate validation split.
tuner.search(
    X_train_scaled,
    y_train,
    epochs=10,
    validation_data=(X_test_scaled, y_test),
)

Trial 42 Complete [00h 01m 24s]
val_accuracy: 0.9987481832504272

Best val_accuracy So Far: 0.9987481832504272
Total elapsed time: 01h 14m 17s

Search: Running Trial #43

Value             |Best Value So Far |Hyperparameter
sigmoid           |sigmoid           |activation
3                 |1                 |first_units
4                 |6                 |num_layers
1                 |7                 |units_0
1                 |7                 |units_1
7                 |1                 |units_2
7                 |1                 |units_3
3                 |1                 |units_4
7                 |1                 |units_5
10                |2                 |tuner/epochs
4                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
2                 |0                 |tuner/round
0038              |None              |tuner/trial_id

Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

After an hour and 14 minutes, I interrupted the search and decided I could live with a validation accuracy of 0.9987.

In [29]:
# Take the single best hyperparameter set found by the tuner and show its values.
best_hyper = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hyper.values

{'activation': 'sigmoid',
 'first_units': 1,
 'num_layers': 6,
 'units_0': 7,
 'units_1': 7,
 'units_2': 1,
 'tuner/epochs': 2,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0,
 'units_3': 1,
 'units_4': 1,
 'units_5': 1}

In [30]:
# Retrieve the tuner's top-ranked model and score it on the scaled test split.
best_model = tuner.get_best_models(num_models=1)[0]
model_loss, model_accuracy = best_model.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2896/2896 - 4s - loss: 0.0096 - accuracy: 0.9987 - 4s/epoch - 1ms/step
Loss: 0.009632369503378868, Accuracy: 0.9987481832504272
