In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

# It suggest sklearn.ensemble.HistGradientBoostingClassifier and Regressor, for NaN handlers, or preprocessing data

In [2]:
# Read the CSV file into a Pandas DataFrame
try:
    df = pd.read_csv("CFA_dataset_clean.csv")
except Exception as e:
    print("An error occured:", e)

In [3]:
df.isnull().values.sum()

0

In [4]:
df.dtypes

incident_datetime                          object
Incident_count                              int64
Ball_Direction of maximum wind gust        object
Ball_Maximum temperature (°C)             float64
Ball_Minimum temperature (°C)             float64
Ball_Speed of maximum wind gust (km/h)    float64
Fern_Direction of maximum wind gust        object
Fern_Maximum temperature (°C)             float64
Fern_Minimum temperature (°C)             float64
Fern_Speed of maximum wind gust (km/h)    float64
Wang_Direction of maximum wind gust        object
Wang_Maximum temperature (°C)             float64
Wang_Minimum temperature (°C)             float64
Wang_Speed of maximum wind gust (km/h)    float64
dtype: object

In [5]:
df.drop(columns=['incident_datetime'], inplace=True)

In [8]:
y[:5]

array([204, 132, 162, 140, 135], dtype=int64)

In [64]:
from sklearn.preprocessing import OneHotEncoder

In [69]:
selected_columns = df[['Ball_Direction of maximum wind gust', 'Fern_Direction of maximum wind gust','Wang_Direction of maximum wind gust']]

In [72]:
# Initialize an encoder for each
encoders = [OneHotEncoder(sparse=False) for _ in selected_columns.columns]

# Fit and transform each encoder for each column
encoded_data = [encoder.fit_transform(selected_columns[[col]]) for encoder, col in zip(encoders, selected_columns.columns)]

# Create DataFrames for each encoded column
encoded_dfs = [pd.DataFrame(encoded, columns=encoder.get_feature_names_out(input_features=[col])) for encoder, col, encoded in zip(encoders, selected_columns.columns, encoded_data)]

# Concatenate the encoded DataFrames with the original DataFrame
df_encoded = pd.concat([df] + encoded_dfs, axis=1)



In [73]:
df_encoded

Unnamed: 0,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Direction of maximum wind gust,Fern_Maximum temperature (°C),Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Direction of maximum wind gust,...,Wang_Direction of maximum wind gust_NW,Wang_Direction of maximum wind gust_None,Wang_Direction of maximum wind gust_S,Wang_Direction of maximum wind gust_SE,Wang_Direction of maximum wind gust_SSE,Wang_Direction of maximum wind gust_SSW,Wang_Direction of maximum wind gust_SW,Wang_Direction of maximum wind gust_W,Wang_Direction of maximum wind gust_WNW,Wang_Direction of maximum wind gust_WSW
0,204,NNW,30.5,2.9,78.0,E,18.9,11.8,15.0,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,132,NW,16.4,9.0,61.0,NW,21.2,10.1,19.0,NNW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,162,S,28.9,8.0,46.0,NW,10.7,7.2,24.0,WNW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,140,SE,26.0,13.3,43.0,WNW,10.7,8.6,26.0,NNE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,SE,32.8,12.3,43.0,SW,11.4,8.5,43.0,WSW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,63,SW,10.0,5.6,43.0,NW,7.8,6.7,30.0,WNW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1267,68,SSW,9.9,-0.5,28.0,WSW,7.3,4.2,17.0,NE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1268,56,N,10.1,-0.1,43.0,N,9.9,3.9,39.0,SE,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1269,63,N,7.4,2.2,46.0,N,7.7,4.8,33.0,S,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
# Remove  target from features data
y = df_encoded.Incident_count.values
X = df_encoded.drop(columns="Incident_count").values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [75]:
# Preview the features data
print(X)

[['NNW' 30.5 2.9 ... 0.0 0.0 0.0]
 ['NW' 16.4 9.0 ... 0.0 0.0 0.0]
 ['S' 28.9 8.0 ... 0.0 1.0 0.0]
 ...
 ['N' 10.1 -0.1 ... 0.0 0.0 0.0]
 ['N' 7.4 2.2 ... 0.0 0.0 0.0]
 ['SSE' 11.9 1.8 ... 0.0 0.0 0.0]]


In [76]:
print(X)

[['NNW' 30.5 2.9 ... 0.0 0.0 0.0]
 ['NW' 16.4 9.0 ... 0.0 0.0 0.0]
 ['S' 28.9 8.0 ... 0.0 1.0 0.0]
 ...
 ['N' 10.1 -0.1 ... 0.0 0.0 0.0]
 ['N' 7.4 2.2 ... 0.0 0.0 0.0]
 ['SSE' 11.9 1.8 ... 0.0 0.0 0.0]]


In [77]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'NNW'

In [78]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

ValueError: Data must be 1-dimensional

In [79]:
# Review the features data
X.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [80]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [81]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'ESE'

In [82]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

In [83]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

NameError: name 'X_train_scaled' is not defined

In [84]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

NameError: name 'X_train_scaled' is not defined

In [85]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

NameError: name 'X_test_scaled' is not defined

In [86]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

NameError: name 'classification_report' is not defined

In [87]:
# NEW TEST, BUT SCALING THE DATA FIRST

In [88]:
try:
    df2 = pd.read_csv("D:\Bootcamp\Classwork\Canberra_airport\CFA_dataset_clean.csv", low_memory=False)
except Exception as e:
    print("An error occured:", e)


In [89]:
df2.dropna(inplace=True)

In [90]:
df2.head()

Unnamed: 0,incident_datetime,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Direction of maximum wind gust,Fern_Maximum temperature (°C),Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Direction of maximum wind gust,Wang_Maximum temperature (°C),Wang_Minimum temperature (°C),Wang_Speed of maximum wind gust (km/h)
334,1/12/2019,97,NNW,14.6,5.6,52,ESE,14.9,7.8,31.0,N,33.7,16.8,26.0
335,2/12/2019,94,WSW,11.7,5.0,57,ESE,21.6,6.7,28.0,SSW,25.9,20.9,48.0
336,3/12/2019,106,W,19.3,4.8,61,,25.3,13.5,0.0,SSW,30.3,17.1,48.0
365,1/01/2020,235,WSW,25.9,7.4,41,NW,17.2,9.2,15.0,SW,35.5,13.2,28.0
366,2/01/2020,180,ESE,27.2,8.6,39,NNE,16.8,11.1,28.0,N,36.6,13.9,30.0


In [91]:
# Scaling the numeric columns
data_scaled = StandardScaler().fit_transform(df[["Ball_Maximum temperature (°C)", "Ball_Speed of maximum wind gust (km/h)", "Fern_Maximum temperature (°C)","Fern_Maximum temperature (°C)", "Fern_Minimum temperature (°C)","Fern_Speed of maximum wind gust (km/h)", "Wang_Maximum temperature (°C)", "Wang_Minimum temperature (°C)","Wang_Speed of maximum wind gust (km/h)"]])

# Creating a DataFrame with with the scaled data
data_transformed = pd.DataFrame(data_scaled, columns=["Ball_Maximum temperature (°C)", "Ball_Speed of maximum wind gust (km/h)", "Fern_Maximum temperature (°C)","Fern_Maximum temperature (°C)", "Fern_Minimum temperature (°C)","Fern_Speed of maximum wind gust (km/h)", "Wang_Maximum temperature (°C)", "Wang_Minimum temperature (°C)","Wang_Speed of maximum wind gust (km/h)"])

# Display sample data
data_transformed.head()

Unnamed: 0,Ball_Maximum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Maximum temperature (°C),Fern_Maximum temperature (°C).1,Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Maximum temperature (°C),Wang_Minimum temperature (°C),Wang_Speed of maximum wind gust (km/h)
0,1.730722,2.405663,0.442568,0.442568,0.724699,-1.19281,2.156289,1.029363,-0.023879
1,-0.255487,1.151249,0.803101,0.803101,0.321694,-0.863411,2.917852,1.839605,1.178812
2,1.505336,0.044413,-0.842814,-0.842814,-0.365786,-0.451661,0.633161,0.539842,0.754333
3,1.096825,-0.176954,-0.842814,-0.842814,-0.033899,-0.286961,0.758008,0.168482,0.117614
4,2.054713,-0.176954,-0.733086,-0.733086,-0.057605,1.112987,1.319817,0.421682,0.4006


In [92]:
df2.tail()

Unnamed: 0,incident_datetime,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Direction of maximum wind gust,Fern_Maximum temperature (°C),Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Direction of maximum wind gust,Wang_Maximum temperature (°C),Wang_Minimum temperature (°C),Wang_Speed of maximum wind gust (km/h)
1203,26/06/2022,63,SW,10.0,5.6,43,NNE,13.7,13.4,41.0,WNW,18.4,3.6,26.0
1204,27/06/2022,68,SSW,9.9,-0.5,28,NNW,16.4,11.3,44.0,NE,19.4,3.8,24.0
1205,28/06/2022,56,N,10.1,-0.1,43,NW,11.6,8.0,28.0,SE,21.1,7.4,39.0
1206,29/06/2022,63,N,7.4,2.2,46,NW,10.4,4.1,31.0,S,24.1,3.7,24.0
1207,30/06/2022,60,SSE,11.9,1.8,31,SW,10.8,6.3,28.0,NNW,22.9,8.3,19.0


In [93]:
dummies = pd.get_dummies(df2[{"Ball_Direction of maximum wind gust", "Fern_Direction of maximum wind gust", "Wang_Direction of maximum wind gust"}])

# Display sample data
dummies.head()

  dummies = pd.get_dummies(df2[{"Ball_Direction of maximum wind gust", "Fern_Direction of maximum wind gust", "Wang_Direction of maximum wind gust"}])


Unnamed: 0,Wang_Direction of maximum wind gust_E,Wang_Direction of maximum wind gust_ENE,Wang_Direction of maximum wind gust_ESE,Wang_Direction of maximum wind gust_N,Wang_Direction of maximum wind gust_NE,Wang_Direction of maximum wind gust_NNE,Wang_Direction of maximum wind gust_NNW,Wang_Direction of maximum wind gust_NW,Wang_Direction of maximum wind gust_None,Wang_Direction of maximum wind gust_S,...,Ball_Direction of maximum wind gust_NW,Ball_Direction of maximum wind gust_None,Ball_Direction of maximum wind gust_S,Ball_Direction of maximum wind gust_SE,Ball_Direction of maximum wind gust_SSE,Ball_Direction of maximum wind gust_SSW,Ball_Direction of maximum wind gust_SW,Ball_Direction of maximum wind gust_W,Ball_Direction of maximum wind gust_WNW,Ball_Direction of maximum wind gust_WSW
334,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
366,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
df2['Incident_count'] = pd.to_numeric(df2['Incident_count'], downcast='integer')

y = df2['Incident_count'].reset_index(drop=True)
y

0       97
1       94
2      106
3      235
4      180
      ... 
841     63
842     68
843     56
844     63
845     60
Name: Incident_count, Length: 846, dtype: int16

In [95]:
# Concatenate the df_shopping_transformed and the card_dummies DataFrames
cfa_transformed = pd.concat([y, data_transformed, dummies], axis=1)

# Display sample data
cfa_transformed.tail()

Unnamed: 0,Incident_count,Ball_Maximum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Maximum temperature (°C),Fern_Maximum temperature (°C).1,Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Maximum temperature (°C),Wang_Minimum temperature (°C),Wang_Speed of maximum wind gust (km/h),...,Ball_Direction of maximum wind gust_NW,Ball_Direction of maximum wind gust_None,Ball_Direction of maximum wind gust_S,Ball_Direction of maximum wind gust_SE,Ball_Direction of maximum wind gust_SSE,Ball_Direction of maximum wind gust_SSW,Ball_Direction of maximum wind gust_SW,Ball_Direction of maximum wind gust_W,Ball_Direction of maximum wind gust_WNW,Ball_Direction of maximum wind gust_WSW
1266,,-1.157028,-0.176954,-1.2974,-1.2974,-0.484317,0.042438,-0.477973,-0.65864,-0.519105,...,,,,,,,,,,
1267,,-1.171115,-1.283791,-1.375777,-1.375777,-1.076972,-1.02811,-0.353126,-0.62488,-0.660598,...,,,,,,,,,,
1268,,-1.142942,-0.176954,-0.968217,-0.968217,-1.14809,0.783588,-0.140887,-0.017199,0.4006,...,,,,,,,,,,
1269,,-1.52328,0.044413,-1.313076,-1.313076,-0.934735,0.289488,0.233653,-0.64176,-0.660598,...,,,,,,,,,,
1270,,-0.889383,-1.062423,-0.68606,-0.68606,-0.958441,-0.616361,0.083837,0.134722,-1.014331,...,,,,,,,,,,


In [58]:
pd.DataFrame(cfa_transformed)

Unnamed: 0,Incident_count,Ball_Maximum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Maximum temperature (°C),Fern_Maximum temperature (°C).1,Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Maximum temperature (°C),Wang_Minimum temperature (°C),Wang_Speed of maximum wind gust (km/h),...,Wang_Direction of maximum wind gust_NW,Wang_Direction of maximum wind gust_None,Wang_Direction of maximum wind gust_S,Wang_Direction of maximum wind gust_SE,Wang_Direction of maximum wind gust_SSE,Wang_Direction of maximum wind gust_SSW,Wang_Direction of maximum wind gust_SW,Wang_Direction of maximum wind gust_W,Wang_Direction of maximum wind gust_WNW,Wang_Direction of maximum wind gust_WSW
334,97.0,1.128308,0.232890,3.107851,3.107851,4.277421,1.606546,-1.255363,-0.478240,0.215545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
335,94.0,0.608450,2.381977,-0.021930,-0.021930,1.582713,-0.409730,-0.834827,0.060760,0.639126,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
336,106.0,0.593597,0.079384,0.798173,0.798173,-0.156598,1.186488,-1.084520,-1.448439,-1.337583,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
365,235.0,1.261986,-0.611394,0.814910,0.814910,-0.279084,-0.409730,-0.979386,-0.334507,-0.066842,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
366,180.0,-0.059939,-1.148665,0.095228,0.095228,0.014884,-0.409730,-0.979386,-1.053173,-1.337583,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,,-0.089645,0.079384,2.153854,2.153854,2.930067,-0.409730,-0.959673,-0.334507,-2.255340,...,,,,,,,,,,
361,,-0.030233,-0.150875,0.195648,0.195648,1.362237,-1.165833,-0.663984,-0.604007,0.497932,...,,,,,,,,,,
362,,-0.134205,-0.457888,0.563858,0.563858,0.725307,0.094339,-1.137087,0.150593,0.780319,...,,,,,,,,,,
363,,0.014326,0.539903,-0.021930,-0.021930,-0.230090,-0.577753,-1.518197,-0.514173,1.133303,...,,,,,,,,,,


In [51]:
cfa_transformed.to_csv('cfa_transformed', sep=',', index=False)