In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import tensorflow as tf

# It suggest sklearn.ensemble.HistGradientBoostingClassifier and Regressor, for NaN handlers, or preprocessing data

In [2]:
# Read the CSV file into a Pandas DataFrame
try:
    df = pd.read_csv("CFA_dataset_clean.csv")
except Exception as e:
    print("An error occured:", e)

In [3]:
df.isnull().values.sum()

0

In [4]:
df.dtypes

incident_datetime                          object
Incident_count                              int64
Ball_Direction of maximum wind gust        object
Ball_Maximum temperature (°C)             float64
Ball_Minimum temperature (°C)             float64
Ball_Speed of maximum wind gust (km/h)    float64
Fern_Direction of maximum wind gust        object
Fern_Maximum temperature (°C)             float64
Fern_Minimum temperature (°C)             float64
Fern_Speed of maximum wind gust (km/h)    float64
Wang_Direction of maximum wind gust        object
Wang_Maximum temperature (°C)             float64
Wang_Minimum temperature (°C)             float64
Wang_Speed of maximum wind gust (km/h)    float64
dtype: object

In [5]:
df.drop(columns=['incident_datetime'], inplace=True)

In [6]:
enc = OneHotEncoder(handle_unknown='ignore')
enc_arr = enc.fit_transform(df[['Ball_Direction of maximum wind gust']])
column_names = enc.get_feature_names_out(input_features=['Ball_Direction of maximum wind gust'])

enc_df = pd.DataFrame(enc_arr.toarray(), columns=column_names)

df = df.join(enc_df)
df

Unnamed: 0,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Direction of maximum wind gust,Fern_Maximum temperature (°C),Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Direction of maximum wind gust,...,Ball_Direction of maximum wind gust_NW,Ball_Direction of maximum wind gust_None,Ball_Direction of maximum wind gust_S,Ball_Direction of maximum wind gust_SE,Ball_Direction of maximum wind gust_SSE,Ball_Direction of maximum wind gust_SSW,Ball_Direction of maximum wind gust_SW,Ball_Direction of maximum wind gust_W,Ball_Direction of maximum wind gust_WNW,Ball_Direction of maximum wind gust_WSW
0,204,NNW,30.5,2.9,78.0,E,18.9,11.8,15.0,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,132,NW,16.4,9.0,61.0,NW,21.2,10.1,19.0,NNW,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,162,S,28.9,8.0,46.0,NW,10.7,7.2,24.0,WNW,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,140,SE,26.0,13.3,43.0,WNW,10.7,8.6,26.0,NNE,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,SE,32.8,12.3,43.0,SW,11.4,8.5,43.0,WSW,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,63,SW,10.0,5.6,43.0,NW,7.8,6.7,30.0,WNW,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1267,68,SSW,9.9,-0.5,28.0,WSW,7.3,4.2,17.0,NE,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1268,56,N,10.1,-0.1,43.0,N,9.9,3.9,39.0,SE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1269,63,N,7.4,2.2,46.0,N,7.7,4.8,33.0,S,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
enc = OneHotEncoder(handle_unknown='ignore')
enc_arr = enc.fit_transform(df[['Fern_Direction of maximum wind gust']])
column_names = enc.get_feature_names_out(input_features=['Fern_Direction of maximum wind gust'])

enc_df = pd.DataFrame(enc_arr.toarray(), columns=column_names)

df = df.join(enc_df)
df

Unnamed: 0,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Direction of maximum wind gust,Fern_Maximum temperature (°C),Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Direction of maximum wind gust,...,Fern_Direction of maximum wind gust_NW,Fern_Direction of maximum wind gust_None,Fern_Direction of maximum wind gust_S,Fern_Direction of maximum wind gust_SE,Fern_Direction of maximum wind gust_SSE,Fern_Direction of maximum wind gust_SSW,Fern_Direction of maximum wind gust_SW,Fern_Direction of maximum wind gust_W,Fern_Direction of maximum wind gust_WNW,Fern_Direction of maximum wind gust_WSW
0,204,NNW,30.5,2.9,78.0,E,18.9,11.8,15.0,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,132,NW,16.4,9.0,61.0,NW,21.2,10.1,19.0,NNW,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,162,S,28.9,8.0,46.0,NW,10.7,7.2,24.0,WNW,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,140,SE,26.0,13.3,43.0,WNW,10.7,8.6,26.0,NNE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,135,SE,32.8,12.3,43.0,SW,11.4,8.5,43.0,WSW,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,63,SW,10.0,5.6,43.0,NW,7.8,6.7,30.0,WNW,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1267,68,SSW,9.9,-0.5,28.0,WSW,7.3,4.2,17.0,NE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1268,56,N,10.1,-0.1,43.0,N,9.9,3.9,39.0,SE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1269,63,N,7.4,2.2,46.0,N,7.7,4.8,33.0,S,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
enc = OneHotEncoder(handle_unknown='ignore')
enc_arr = enc.fit_transform(df[['Wang_Direction of maximum wind gust']])
column_names = enc.get_feature_names_out(input_features=['Wang_Direction of maximum wind gust'])

enc_df = pd.DataFrame(enc_arr.toarray(), columns=column_names)

df = df.join(enc_df)

In [9]:
df_df = pd.DataFrame(df)
df_df

Unnamed: 0,Incident_count,Ball_Direction of maximum wind gust,Ball_Maximum temperature (°C),Ball_Minimum temperature (°C),Ball_Speed of maximum wind gust (km/h),Fern_Direction of maximum wind gust,Fern_Maximum temperature (°C),Fern_Minimum temperature (°C),Fern_Speed of maximum wind gust (km/h),Wang_Direction of maximum wind gust,...,Wang_Direction of maximum wind gust_NW,Wang_Direction of maximum wind gust_None,Wang_Direction of maximum wind gust_S,Wang_Direction of maximum wind gust_SE,Wang_Direction of maximum wind gust_SSE,Wang_Direction of maximum wind gust_SSW,Wang_Direction of maximum wind gust_SW,Wang_Direction of maximum wind gust_W,Wang_Direction of maximum wind gust_WNW,Wang_Direction of maximum wind gust_WSW
0,204,NNW,30.5,2.9,78.0,E,18.9,11.8,15.0,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,132,NW,16.4,9.0,61.0,NW,21.2,10.1,19.0,NNW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,162,S,28.9,8.0,46.0,NW,10.7,7.2,24.0,WNW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,140,SE,26.0,13.3,43.0,WNW,10.7,8.6,26.0,NNE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,SE,32.8,12.3,43.0,SW,11.4,8.5,43.0,WSW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,63,SW,10.0,5.6,43.0,NW,7.8,6.7,30.0,WNW,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1267,68,SSW,9.9,-0.5,28.0,WSW,7.3,4.2,17.0,NE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1268,56,N,10.1,-0.1,43.0,N,9.9,3.9,39.0,SE,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1269,63,N,7.4,2.2,46.0,N,7.7,4.8,33.0,S,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# df.to_csv('df_encoded.csv', sep=',', index=False)

In [11]:
df_df.drop(columns=['Ball_Minimum temperature (°C)','Fern_Minimum temperature (°C)','Wang_Minimum temperature (°C)','Ball_Direction of maximum wind gust','Fern_Direction of maximum wind gust', 'Wang_Direction of maximum wind gust'], inplace=True)

In [12]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [13]:
y = df_df.Incident_count.values
X = df_df.drop(columns="Incident_count").values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:

svr = SVR(kernel='linear', C=1.0, epsilon=0.2)

In [15]:
svr.fit(X_train, y_train)

In [18]:
y_pred = svr.predict(X_test)

In [19]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')



Mean Squared Error: 1199.0204802013545
R-squared: 0.37046596764413964


Accuracy: 0.01768172888015717
Classification Report:
              precision    recall  f1-score   support

          40       1.00      0.00      0.00         1
          41       1.00      0.00      0.00         1
          43       1.00      0.00      0.00         1
          44       1.00      0.00      0.00         1
          46       1.00      0.00      0.00         1
          47       1.00      0.00      0.00         2
          51       1.00      0.00      0.00         2
          52       0.00      0.00      1.00         3
          53       1.00      0.00      0.00         2
          54       1.00      0.00      0.00         6
          55       1.00      0.00      0.00         3
          56       1.00      0.00      0.00         3
          57       1.00      0.00      0.00         5
          58       1.00      0.00      0.00         4
          60       0.33      0.20      0.25         5
          61       1.00      0.00      0.00         6
          62       0.00     