In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

# It suggest sklearn.ensemble.HistGradientBoostingClassifier and Regressor, for NaN handlers, or preprocessing data

In [2]:
# Read the CSV file into a Pandas DataFrame
try:
    df = pd.read_csv("CFA_dataset_clean.csv")
except Exception as e:
    print("An error occured:", e)

In [3]:
df.dtypes

incident_datetime                          object
Incident_count                              int64
Ball_Direction of maximum wind gust        object
Ball_Maximum temperature (°C)             float64
Ball_Minimum temperature (°C)             float64
Ball_Speed of maximum wind gust (km/h)    float64
Fern_Direction of maximum wind gust        object
Fern_Maximum temperature (°C)             float64
Fern_Minimum temperature (°C)             float64
Fern_Speed of maximum wind gust (km/h)    float64
Wang_Direction of maximum wind gust        object
Wang_Maximum temperature (°C)             float64
Wang_Minimum temperature (°C)             float64
Wang_Speed of maximum wind gust (km/h)    float64
dtype: object

In [4]:
df.drop(columns=['incident_datetime','Fern_Direction of maximum wind gust', \
                 'Fern_Maximum temperature (°C)','Fern_Minimum temperature (°C)','Fern_Speed of maximum wind gust (km/h)',\
                 'Wang_Direction of maximum wind gust','Wang_Maximum temperature (°C)',\
                 'Wang_Minimum temperature (°C)', 'Wang_Speed of maximum wind gust (km/h)'], inplace=True)

In [5]:
df

Unnamed: 0,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h)
0,204,NNW,30.5,2.9,78.0
1,132,NW,16.4,9.0,61.0
2,162,S,28.9,8.0,46.0
3,140,SE,26.0,13.3,43.0
4,135,SE,32.8,12.3,43.0
...,...,...,...,...,...
1266,63,SW,10.0,5.6,43.0
1267,68,SSW,9.9,-0.5,28.0
1268,56,N,10.1,-0.1,43.0
1269,63,N,7.4,2.2,46.0


In [6]:
enc = OneHotEncoder(handle_unknown='ignore')
enc_arr = enc.fit_transform(df[['Ball_Direction of maximum wind gust']])
column_names = enc.get_feature_names_out(input_features=['Ball_Direction of maximum wind gust'])

enc_df = pd.DataFrame(enc_arr.toarray(), columns=column_names)

df = df.join(enc_df)
df

Unnamed: 0,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Ball_Direction of maximum wind gust_E,Ball_Direction of maximum wind gust_ENE,Ball_Direction of maximum wind gust_ESE,Ball_Direction of maximum wind gust_N,Ball_Direction of maximum wind gust_NE,...,Ball_Direction of maximum wind gust_NW,Ball_Direction of maximum wind gust_None,Ball_Direction of maximum wind gust_S,Ball_Direction of maximum wind gust_SE,Ball_Direction of maximum wind gust_SSE,Ball_Direction of maximum wind gust_SSW,Ball_Direction of maximum wind gust_SW,Ball_Direction of maximum wind gust_W,Ball_Direction of maximum wind gust_WNW,Ball_Direction of maximum wind gust_WSW
0,204,NNW,30.5,2.9,78.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,132,NW,16.4,9.0,61.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,162,S,28.9,8.0,46.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,140,SE,26.0,13.3,43.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,SE,32.8,12.3,43.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,63,SW,10.0,5.6,43.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1267,68,SSW,9.9,-0.5,28.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1268,56,N,10.1,-0.1,43.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1269,63,N,7.4,2.2,46.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df.drop(columns='Ball_Direction of maximum wind gust', inplace=True)

In [8]:
df

Unnamed: 0,Incident_count,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Ball_Direction of maximum wind gust_E,Ball_Direction of maximum wind gust_ENE,Ball_Direction of maximum wind gust_ESE,Ball_Direction of maximum wind gust_N,Ball_Direction of maximum wind gust_NE,Ball_Direction of maximum wind gust_NNE,...,Ball_Direction of maximum wind gust_NW,Ball_Direction of maximum wind gust_None,Ball_Direction of maximum wind gust_S,Ball_Direction of maximum wind gust_SE,Ball_Direction of maximum wind gust_SSE,Ball_Direction of maximum wind gust_SSW,Ball_Direction of maximum wind gust_SW,Ball_Direction of maximum wind gust_W,Ball_Direction of maximum wind gust_WNW,Ball_Direction of maximum wind gust_WSW
0,204,30.5,2.9,78.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,132,16.4,9.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,162,28.9,8.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,140,26.0,13.3,43.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,32.8,12.3,43.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,63,10.0,5.6,43.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1267,68,9.9,-0.5,28.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1268,56,10.1,-0.1,43.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1269,63,7.4,2.2,46.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df.to_csv('df_encoded.csv', sep=',', index=False)

In [9]:
y = df.Incident_count.values
X = df.drop(columns="Incident_count").values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Preview the features data
print(X)

[[30.5  2.9 78.  ...  0.   0.   0. ]
 [16.4  9.  61.  ...  0.   0.   0. ]
 [28.9  8.  46.  ...  0.   0.   0. ]
 ...
 [10.1 -0.1 43.  ...  0.   0.   0. ]
 [ 7.4  2.2 46.  ...  0.   0.   0. ]
 [11.9  1.8 31.  ...  0.   0.   0. ]]


In [11]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [12]:
print(X_test_scaled)

[[-1.12287315 -0.51129946 -1.78971842 ... -0.26831139 -0.20656617
  -0.28372322]
 [ 0.37713214  0.61869903 -0.80096384 ... -0.26831139 -0.20656617
  -0.28372322]
 [ 1.65072154  2.00946641  0.7962551  ... -0.26831139 -0.20656617
  -0.28372322]
 ...
 [-0.11815263 -1.53264426  1.78500969 ... -0.26831139 -0.20656617
  -0.28372322]
 [ 0.82996393  0.24927645  0.03567465 ... -0.26831139 -0.20656617
  -0.28372322]
 [-0.25966256  0.74908347  0.49202292 ... -0.26831139 -0.20656617
  -0.28372322]]


In [13]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=1, activation="softmax", input_dim=20))
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=20)
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
10/10 - 0s - loss: -6.7321e+01 - accuracy: 0.0000e+00 - 112ms/epoch - 11ms/step
Loss: -67.32083129882812, Accuracy: 0.0
