In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df = pd.read_csv('US_Accidents_March23.csv', nrows=3000000)

In [2]:
df.drop(['ID','Source','Start_Time','End_Time','Start_Lat','Start_Lng','End_Lat','End_Lng',
        'Description','Street','Country','Timezone','Airport_Code','Weather_Timestamp','Turning_Loop'], axis=1, inplace=True)

In [3]:
df.sample(5)

Unnamed: 0,Severity,Distance(mi),City,County,State,Zipcode,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
1814307,2,0.0,Plano,Collin,TX,75093,37.0,37.0,81.0,29.72,10.0,CALM,0.0,0.0,Fair,False,False,False,False,False,False,False,False,False,False,False,True,Day,Day,Day,Day
296350,2,0.0,Austin,Travis,TX,78751-5011,91.9,,54.0,30.07,10.0,South,9.2,,Partly Cloudy,True,False,True,False,False,False,False,False,True,False,False,False,Day,Day,Day,Day
134287,2,0.0,Fort Myers,Lee,FL,33905-2026,84.9,,55.0,29.99,10.0,NE,17.3,,Overcast,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
2320653,2,0.0,Grand Rapids,Kent,MI,49504-3140,21.9,11.7,75.0,30.44,10.0,ESE,9.2,,Overcast,False,False,False,False,False,False,False,False,True,False,False,False,Day,Day,Day,Day
1901139,2,0.0,Minneapolis,Hennepin,MN,55416,58.0,58.0,72.0,29.08,9.0,W,18.0,0.0,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day


In [4]:
df.shape

(3000000, 31)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 31 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Severity               int64  
 1   Distance(mi)           float64
 2   City                   object 
 3   County                 object 
 4   State                  object 
 5   Zipcode                object 
 6   Temperature(F)         float64
 7   Wind_Chill(F)          float64
 8   Humidity(%)            float64
 9   Pressure(in)           float64
 10  Visibility(mi)         float64
 11  Wind_Direction         object 
 12  Wind_Speed(mph)        float64
 13  Precipitation(in)      float64
 14  Weather_Condition      object 
 15  Amenity                bool   
 16  Bump                   bool   
 17  Crossing               bool   
 18  Give_Way               bool   
 19  Junction               bool   
 20  No_Exit                bool   
 21  Railway                bool   
 22  Roundabout        

In [6]:
## Separating into cat, num, bool variables

df_cat = [i for i in df.columns if df[i].dtype=='object']
df_num = [i for i in df.columns if df[i].dtype!='object' and df[i].dtype != 'bool' and i!='Severity']
df_bool = [i for i in df.columns if df[i].dtype == 'bool']

In [7]:
df_cat

['City',
 'County',
 'State',
 'Zipcode',
 'Wind_Direction',
 'Weather_Condition',
 'Sunrise_Sunset',
 'Civil_Twilight',
 'Nautical_Twilight',
 'Astronomical_Twilight']

In [8]:
df_num

['Distance(mi)',
 'Temperature(F)',
 'Wind_Chill(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Speed(mph)',
 'Precipitation(in)']

In [9]:
df_bool

['Amenity',
 'Bump',
 'Crossing',
 'Give_Way',
 'Junction',
 'No_Exit',
 'Railway',
 'Roundabout',
 'Station',
 'Stop',
 'Traffic_Calming',
 'Traffic_Signal']

In [10]:
## Missing values in int

missing_percent_int = df[df_num].isnull().mean() * 100

missing_data_int = missing_percent_int[missing_percent_int > 0]
missing_data_int = missing_data_int.sort_values(ascending=False)

In [11]:
missing_data_int

Precipitation(in)    40.842433
Wind_Chill(F)        37.828400
Wind_Speed(mph)       9.288367
Visibility(mi)        1.867167
Humidity(%)           1.765600
Temperature(F)        1.643133
Pressure(in)          1.378833
dtype: float64

In [12]:
## Missing values in categorical

missing_percent_obj = df[df_cat].isnull().mean() * 100

missing_data_obj = missing_percent_obj[missing_percent_obj > 0]
missing_data_obj = missing_percent_obj.sort_values(ascending=False)

In [13]:
missing_data_obj

Weather_Condition        1.840000
Wind_Direction           1.616700
Sunrise_Sunset           0.055633
Civil_Twilight           0.055633
Nautical_Twilight        0.055633
Astronomical_Twilight    0.055633
Zipcode                  0.013733
City                     0.001867
County                   0.000000
State                    0.000000
dtype: float64

In [14]:
## Missing values in categorical

missing_percent_bool = df[df_bool].isnull().mean() * 100

missing_percent_bool = missing_percent_bool[missing_percent_bool > 0]
missing_percent_bool = missing_percent_bool.sort_values(ascending=False)

In [15]:
missing_percent_bool

Series([], dtype: float64)

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [17]:
df[df_cat].head()

Unnamed: 0,City,County,State,Zipcode,Wind_Direction,Weather_Condition,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,Dayton,Montgomery,OH,45424,Calm,Light Rain,Night,Night,Night,Night
1,Reynoldsburg,Franklin,OH,43068-3402,Calm,Light Rain,Night,Night,Night,Day
2,Williamsburg,Clermont,OH,45176,SW,Overcast,Night,Night,Day,Day
3,Dayton,Montgomery,OH,45417,SW,Mostly Cloudy,Night,Day,Day,Day
4,Dayton,Montgomery,OH,45459,SW,Mostly Cloudy,Day,Day,Day,Day


In [18]:
df[df_num].head()

Unnamed: 0,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
0,0.01,36.9,,91.0,29.68,10.0,,0.02
1,0.01,37.9,,100.0,29.65,10.0,,0.0
2,0.01,36.0,33.3,100.0,29.67,10.0,3.5,
3,0.01,35.1,31.0,96.0,29.64,9.0,4.6,
4,0.01,36.0,33.3,89.0,29.65,6.0,3.5,


In [19]:
df[df_bool].head()

Unnamed: 0,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True


In [20]:
df_cols = [i for i in df.columns if i!='Severity']

In [21]:
df_cols

['Distance(mi)',
 'City',
 'County',
 'State',
 'Zipcode',
 'Temperature(F)',
 'Wind_Chill(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Weather_Condition',
 'Amenity',
 'Bump',
 'Crossing',
 'Give_Way',
 'Junction',
 'No_Exit',
 'Railway',
 'Roundabout',
 'Station',
 'Stop',
 'Traffic_Calming',
 'Traffic_Signal',
 'Sunrise_Sunset',
 'Civil_Twilight',
 'Nautical_Twilight',
 'Astronomical_Twilight']

In [22]:
len(df_num)

8

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        """
        Custom transformer to perform frequency encoding on columns.
        """
        self.frequency_maps = {}

    def fit(self, X, y=None):
        """
        Compute frequency maps for each column.
        :param X: Input DataFrame or array.
        :param y: Ignored.
        :return: self
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        for column in X.columns:
            self.frequency_maps[column] = X[column].value_counts().to_dict()
        return self

    def transform(self, X):
        """
        Transform columns by replacing values with their frequency.
        :param X: Input DataFrame or array.
        :return: Transformed DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        X_transformed = X.copy()
        for column in X.columns:
            X_transformed[column] = X_transformed[column].map(self.frequency_maps[column])
        return X_transformed.values  # Return as array for compatibility
        
    def get_feature_names_out(self, input_features=None):
        return input_features


### Train test split

In [24]:
from sklearn.model_selection import train_test_split

y = df[['Severity']]
X = df.drop(['Severity'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

### Transformation 1: Missing value imputation

In [25]:
preprocessor1 = ColumnTransformer(transformers=[
    ('num_impute', SimpleImputer(strategy='mean'), df_num),
    ('cat_impute', SimpleImputer(strategy='most_frequent'), df_cat)
],remainder='passthrough',n_jobs=-1
)

In [26]:
X_train_transformed = pd.DataFrame(preprocessor1.fit_transform(X_train),columns=preprocessor1.get_feature_names_out())
X_test_transformed = pd.DataFrame(preprocessor1.transform(X_test),columns=preprocessor1.get_feature_names_out())

### Transformation 2: Encoding categorical columns

In [27]:
X_train_transformed.head()

Unnamed: 0,num_impute__Distance(mi),num_impute__Temperature(F),num_impute__Wind_Chill(F),num_impute__Humidity(%),num_impute__Pressure(in),num_impute__Visibility(mi),num_impute__Wind_Speed(mph),num_impute__Precipitation(in),cat_impute__City,cat_impute__County,cat_impute__State,cat_impute__Zipcode,cat_impute__Wind_Direction,cat_impute__Weather_Condition,cat_impute__Sunrise_Sunset,cat_impute__Civil_Twilight,cat_impute__Nautical_Twilight,cat_impute__Astronomical_Twilight,remainder__Amenity,remainder__Bump,remainder__Crossing,remainder__Give_Way,remainder__Junction,remainder__No_Exit,remainder__Railway,remainder__Roundabout,remainder__Station,remainder__Stop,remainder__Traffic_Calming,remainder__Traffic_Signal
0,0.0,84.9,57.93296,69.0,30.11,10.0,13.8,0.011069,Palm Beach Gardens,Palm Beach,FL,33418,NE,Scattered Clouds,Day,Day,Day,Day,False,False,False,False,False,False,False,False,False,False,False,False
1,0.0,62.61395,57.93296,66.309525,29.639324,9.091887,7.759205,0.011069,Thurmont,Frederick,MD,21788,CALM,Fair,Day,Day,Day,Day,False,False,False,False,False,False,False,False,False,False,False,False
2,0.0,69.0,69.0,70.0,29.18,10.0,8.0,0.0,Minneapolis,Hennepin,MN,55423,SSE,Mostly Cloudy,Day,Day,Day,Day,False,False,False,False,False,False,False,False,False,False,False,False
3,0.0,88.0,57.93296,57.0,30.0,10.0,9.2,0.011069,Dallas,Dallas,TX,75237,South,Partly Cloudy,Day,Day,Day,Day,False,False,True,False,False,False,False,False,False,False,False,True
4,0.0,88.0,57.93296,34.0,30.06,10.0,7.759205,0.011069,Tulsa,Tulsa,OK,74107,Calm,Partly Cloudy,Day,Day,Day,Day,False,False,False,False,False,False,False,False,False,False,False,False


In [28]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 30 columns):
 #   Column                             Dtype 
---  ------                             ----- 
 0   num_impute__Distance(mi)           object
 1   num_impute__Temperature(F)         object
 2   num_impute__Wind_Chill(F)          object
 3   num_impute__Humidity(%)            object
 4   num_impute__Pressure(in)           object
 5   num_impute__Visibility(mi)         object
 6   num_impute__Wind_Speed(mph)        object
 7   num_impute__Precipitation(in)      object
 8   cat_impute__City                   object
 9   cat_impute__County                 object
 10  cat_impute__State                  object
 11  cat_impute__Zipcode                object
 12  cat_impute__Wind_Direction         object
 13  cat_impute__Weather_Condition      object
 14  cat_impute__Sunrise_Sunset         object
 15  cat_impute__Civil_Twilight         object
 16  cat_impute__Nautical_Twilight      o

In [29]:
preprocessor2 = ColumnTransformer(transformers=[
    ('frequency_encode', FrequencyEncoder(), [i for i in range(8,18)]),
    ('ohe_encode', OneHotEncoder(drop='first'), [i for i in range(18,30)])
],remainder='passthrough',n_jobs=-1
)

In [30]:
X_train_transformed = pd.DataFrame(preprocessor2.fit_transform(X_train_transformed),columns=preprocessor2.get_feature_names_out())
X_test_transformed = pd.DataFrame(preprocessor2.transform(X_test_transformed),columns=preprocessor2.get_feature_names_out())

In [31]:
X_train_transformed.head()

Unnamed: 0,frequency_encode__cat_impute__City,frequency_encode__cat_impute__County,frequency_encode__cat_impute__State,frequency_encode__cat_impute__Zipcode,frequency_encode__cat_impute__Wind_Direction,frequency_encode__cat_impute__Weather_Condition,frequency_encode__cat_impute__Sunrise_Sunset,frequency_encode__cat_impute__Civil_Twilight,frequency_encode__cat_impute__Nautical_Twilight,frequency_encode__cat_impute__Astronomical_Twilight,ohe_encode__remainder__Amenity_True,ohe_encode__remainder__Bump_True,ohe_encode__remainder__Crossing_True,ohe_encode__remainder__Give_Way_True,ohe_encode__remainder__Junction_True,ohe_encode__remainder__No_Exit_True,ohe_encode__remainder__Railway_True,ohe_encode__remainder__Roundabout_True,ohe_encode__remainder__Station_True,ohe_encode__remainder__Stop_True,ohe_encode__remainder__Traffic_Calming_True,ohe_encode__remainder__Traffic_Signal_True,remainder__num_impute__Distance(mi),remainder__num_impute__Temperature(F),remainder__num_impute__Wind_Chill(F),remainder__num_impute__Humidity(%),remainder__num_impute__Pressure(in),remainder__num_impute__Visibility(mi),remainder__num_impute__Wind_Speed(mph),remainder__num_impute__Precipitation(in)
0,710,11349,203672,400,83197,96458,1778496,1905291,2030158,2125242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84.9,57.93296,69.0,30.11,10.0,13.8,0.011069
1,102,3714,30923,82,281222,649220,1778496,1905291,2030158,2125242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.61395,57.93296,66.309525,29.639324,9.091887,7.759205,0.011069
2,13924,15537,41929,771,113811,331878,1778496,1905291,2030158,2125242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,69.0,70.0,29.18,10.0,8.0,0.0
3,51980,62154,254778,947,84462,226471,1778496,1905291,2030158,2125242,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,88.0,57.93296,57.0,30.0,10.0,9.2,0.011069
4,13533,13883,47188,444,171549,226471,1778496,1905291,2030158,2125242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,57.93296,34.0,30.06,10.0,7.759205,0.011069


In [32]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 30 columns):
 #   Column                                               Dtype 
---  ------                                               ----- 
 0   frequency_encode__cat_impute__City                   object
 1   frequency_encode__cat_impute__County                 object
 2   frequency_encode__cat_impute__State                  object
 3   frequency_encode__cat_impute__Zipcode                object
 4   frequency_encode__cat_impute__Wind_Direction         object
 5   frequency_encode__cat_impute__Weather_Condition      object
 6   frequency_encode__cat_impute__Sunrise_Sunset         object
 7   frequency_encode__cat_impute__Civil_Twilight         object
 8   frequency_encode__cat_impute__Nautical_Twilight      object
 9   frequency_encode__cat_impute__Astronomical_Twilight  object
 10  ohe_encode__remainder__Amenity_True                  object
 11  ohe_encode__remainder__Bump_True     

### Transformation 3: Feature Scaling

In [33]:
preprocessor3 = ColumnTransformer(transformers=[
    ('feat_scaling', StandardScaler(), [i for i in range(0,30)])
],remainder='passthrough',n_jobs=-1
)

In [34]:
X_train_transformed = pd.DataFrame(preprocessor3.fit_transform(X_train_transformed),columns=preprocessor3.get_feature_names_out())
X_test_transformed = pd.DataFrame(preprocessor3.transform(X_test_transformed),columns=preprocessor3.get_feature_names_out())

In [35]:
##pipeline = Pipeline(
##    [
##        ('imputer', trf1),
##        ('encoder', trf2),
##        ('scaling', trf3)
#    ]
#)

IndentationError: unexpected indent (1726107564.py, line 6)

In [36]:
X_train_transformed.head()

Unnamed: 0,feat_scaling__frequency_encode__cat_impute__City,feat_scaling__frequency_encode__cat_impute__County,feat_scaling__frequency_encode__cat_impute__State,feat_scaling__frequency_encode__cat_impute__Zipcode,feat_scaling__frequency_encode__cat_impute__Wind_Direction,feat_scaling__frequency_encode__cat_impute__Weather_Condition,feat_scaling__frequency_encode__cat_impute__Sunrise_Sunset,feat_scaling__frequency_encode__cat_impute__Civil_Twilight,feat_scaling__frequency_encode__cat_impute__Nautical_Twilight,feat_scaling__frequency_encode__cat_impute__Astronomical_Twilight,feat_scaling__ohe_encode__remainder__Amenity_True,feat_scaling__ohe_encode__remainder__Bump_True,feat_scaling__ohe_encode__remainder__Crossing_True,feat_scaling__ohe_encode__remainder__Give_Way_True,feat_scaling__ohe_encode__remainder__Junction_True,feat_scaling__ohe_encode__remainder__No_Exit_True,feat_scaling__ohe_encode__remainder__Railway_True,feat_scaling__ohe_encode__remainder__Roundabout_True,feat_scaling__ohe_encode__remainder__Station_True,feat_scaling__ohe_encode__remainder__Stop_True,feat_scaling__ohe_encode__remainder__Traffic_Calming_True,feat_scaling__ohe_encode__remainder__Traffic_Signal_True,feat_scaling__remainder__num_impute__Distance(mi),feat_scaling__remainder__num_impute__Temperature(F),feat_scaling__remainder__num_impute__Wind_Chill(F),feat_scaling__remainder__num_impute__Humidity(%),feat_scaling__remainder__num_impute__Pressure(in),feat_scaling__remainder__num_impute__Visibility(mi),feat_scaling__remainder__num_impute__Wind_Speed(mph),feat_scaling__remainder__num_impute__Precipitation(in)
0,-0.612457,-0.495233,0.169284,-0.062365,-0.587781,-1.203856,0.591147,0.509559,0.426818,0.35956,-0.119799,-0.019529,-0.404098,-0.08063,-0.233203,-0.051798,-0.098867,-0.005845,-0.163756,-0.181298,-0.031606,-0.527526,-0.131575,1.221471,-2.688852e-13,0.1211695,0.5473819,0.3433474,1.21866,2.496378e-12
1,-0.643948,-0.69725,-0.941289,-0.635641,2.489001,1.431782,0.591147,0.509559,0.426818,0.35956,-0.119799,-0.019529,-0.404098,-0.08063,-0.233203,-0.051798,-0.098867,-0.005845,-0.163756,-0.181298,-0.031606,-0.527526,-0.131575,1.009038e-12,-2.688852e-13,1.152012e-12,-9.70949e-13,2.769092e-12,-1.265008e-13,2.496378e-12
2,0.071954,-0.384422,-0.870533,0.606458,-0.112121,-0.081344,0.591147,0.509559,0.426818,0.35956,-0.119799,-0.019529,-0.404098,-0.08063,-0.233203,-0.051798,-0.098867,-0.005845,-0.163756,-0.181298,-0.031606,-0.527526,-0.131575,0.3500116,0.6278883,0.1662059,-0.5341798,0.3433474,0.04857755,-0.1027167
3,2.043041,0.849032,0.497835,0.923743,-0.568126,-0.583938,0.591147,0.509559,0.426818,0.35956,-0.119799,-0.019529,2.474648,-0.08063,-0.233203,-0.051798,-0.098867,-0.005845,-0.163756,-0.181298,-0.031606,1.89564,-0.131575,1.391378,-2.688852e-13,-0.4192681,0.4194553,0.3433474,0.2906635,2.496378e-12
4,0.051702,-0.428185,-0.836724,0.016957,0.784974,-0.583938,0.591147,0.509559,0.426818,0.35956,-0.119799,-0.019529,-0.404098,-0.08063,-0.233203,-0.051798,-0.098867,-0.005845,-0.163756,-0.181298,-0.031606,-0.527526,-0.131575,1.391378,-2.688852e-13,-1.455107,0.4892334,0.3433474,-1.265008e-13,2.496378e-12


In [37]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400000 entries, 0 to 2399999
Data columns (total 30 columns):
 #   Column                                                             Dtype  
---  ------                                                             -----  
 0   feat_scaling__frequency_encode__cat_impute__City                   float64
 1   feat_scaling__frequency_encode__cat_impute__County                 float64
 2   feat_scaling__frequency_encode__cat_impute__State                  float64
 3   feat_scaling__frequency_encode__cat_impute__Zipcode                float64
 4   feat_scaling__frequency_encode__cat_impute__Wind_Direction         float64
 5   feat_scaling__frequency_encode__cat_impute__Weather_Condition      float64
 6   feat_scaling__frequency_encode__cat_impute__Sunrise_Sunset         float64
 7   feat_scaling__frequency_encode__cat_impute__Civil_Twilight         float64
 8   feat_scaling__frequency_encode__cat_impute__Nautical_Twilight      float64
 9   fe

In [38]:
y_train.value_counts()

Severity
2           1562789
3            794714
1             31222
4             11275
Name: count, dtype: int64

In [39]:
y_train = y_train - 1

In [40]:
y_train.value_counts()

Severity
1           1562789
2            794714
0             31222
3             11275
Name: count, dtype: int64

In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [42]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
#y_pred.argmax(axis=1)

In [None]:
#y_pred = y_pred.argmax(axis=1)

In [None]:
#from sklearn.metrics import recall_score, classification_report

In [None]:
#y_test = y_test - 1
#y_test.value_counts()

In [None]:
#print(classification_report(y_test, y_pred))

### ANN Hyperparameter tuning using Keras Tuner

In [43]:
import keras_tuner as kt

In [44]:
def build_model(hp):

    model = Sequential()

    c= 0
    
    for i in range(hp.Int('num_layers', min_value=1, max_value=10)):

        if c==0:
            model.add(
                Dense(
                    hp.Int('units'+str(i), min_value=9, max_value=128, step=8), 
                    activation=hp.Choice('activation' + str(i), values = ['relu', 'tanh', 'sigmoid']),
                    input_dim=30
                )
            )
        else:
            model.add(
                Dense(
                    hp.Int('units'+str(i), min_value=9, max_value=128, step=8), 
                    activation=hp.Choice('activation' + str(i), values = ['relu', 'tanh', 'sigmoid'])
                )
            )

        c = c+1

    model.add(Dense(4, activation='softmax'))
    
    model.compile(optimizer=hp.Choice('optimizer', values = ['adam','rmsprop','sgd','adadelta']), 
                                      loss='sparse_categorical_crossentropy', 
                                      metrics=['accuracy'])

    return model

In [45]:
tuner = kt.RandomSearch(build_model,
                        objective = 'accuracy',
                        max_trials=3,
                        directory = 'mydir',
                        project_name = 'final')

Reloading Tuner from mydir\final\tuner0.json


In [46]:
tuner.search(X_train_transformed, y_train, epochs = 5)

In [47]:
tuner.get_best_hyperparameters()[0].values

{'num_layers': 10,
 'units0': 89,
 'activation0': 'relu',
 'optimizer': 'rmsprop',
 'units1': 9,
 'activation1': 'relu',
 'units2': 9,
 'activation2': 'relu',
 'units3': 9,
 'activation3': 'relu',
 'units4': 9,
 'activation4': 'relu',
 'units5': 9,
 'activation5': 'relu',
 'units6': 9,
 'activation6': 'relu',
 'units7': 9,
 'activation7': 'relu',
 'units8': 9,
 'activation8': 'relu',
 'units9': 9,
 'activation9': 'relu'}

In [48]:
model = tuner.get_best_models(num_models=1)[0]




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [49]:
model.fit(X_train_transformed, y_train, epochs=10, initial_epoch=5, validation_data=0.2)

Epoch 6/10
[1m74962/75000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 849us/step - accuracy: 0.7422 - loss: 0.5566

ValueError: Unrecognized data type: x=0.2 (of type <class 'float'>)

In [105]:
y_pred = y_pred.argmax(axis=1)

NameError: name 'y_pred' is not defined