In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
import numpy as np
 
#Load Data with pandas, and parse the first column into datetime
# train=pd.read_csv('train.csv', parse_dates = ['Dates'])
# test=pd.read_csv('test.csv', parse_dates = ['Dates'])
data=pd.read_csv('data/combined_weather_crime_lunar_final.csv')
data.head()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Unnamed: 0.1,Unnamed: 0,DATE,CCN,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum
0,0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6
1,1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5
2,2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41
3,3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32
4,4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23


In [2]:
data.drop(data.columns[[0]], axis=1, inplace=True)
data.head()

Unnamed: 0,DATE,CCN,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum
0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6
1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5
2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41
3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32
4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23


In [3]:
data.rename(columns={"CCN": "CRIME-COUNT"}, inplace=True)

In [4]:
data['bins'] = np.nan

In [5]:
data.head()

Unnamed: 0,DATE,CRIME-COUNT,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum,bins
0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6,
1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5,
2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41,
3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32,
4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23,


In [6]:
data.loc[data['CRIME-COUNT']<=75, ['bins']] = 'VeryLow'


In [7]:
data.loc[(data['CRIME-COUNT']>=76) & (data['CRIME-COUNT']<=90), ['bins']] = 'Low'

In [8]:
data.loc[(data['CRIME-COUNT']>=91) & (data['CRIME-COUNT']<=105), ['bins']] = 'Medium'

In [9]:
data.loc[(data['CRIME-COUNT']>=106) & (data['CRIME-COUNT']<=120), ['bins']] = 'High'

In [10]:
data.loc[(data['CRIME-COUNT']>=121, ['bins'])] = 'VeryHigh'

In [11]:
data['bins'].value_counts()

Medium      410
Low         374
High        212
VeryLow     175
VeryHigh     78
Name: bins, dtype: int64

In [12]:
data['CRIME-COUNT'].describe()

count    1249.000000
mean       93.541233
std        17.111019
min        28.000000
25%        82.000000
50%        93.000000
75%       105.000000
max       146.000000
Name: CRIME-COUNT, dtype: float64

In [13]:
data.isnull().any()

DATE           False
CRIME-COUNT    False
STATION        False
NAME           False
PRCP           False
TMAX           False
TMIN           False
Year           False
Month          False
Date           False
Day            False
Illum          False
bins           False
dtype: bool

In [14]:
X = data.drop(["bins", "DATE", "STATION", "NAME", "CRIME-COUNT", "Year"], axis=1)
y = data["bins"]

In [15]:
data.head()

Unnamed: 0,DATE,CRIME-COUNT,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum,bins
0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6,Medium
1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5,Medium
2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41,Medium
3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32,Low
4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23,Medium


In [16]:
X['TMAX'] = X['TMAX'].astype(float)
X['TMIN'] = X['TMIN'].astype(float)
X['Month'] = X['Month'].astype(float)
X['Date'] = X['Date'].astype(float)
X['Day'] = X['Illum'].astype(float)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.6)

In [18]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [20]:
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [22]:
model = Sequential()
model.add(Dense(units=100, activation='sigmoid', input_dim=7))
# model.add(Dense(units=200, activation='sigmoid'))
model.add(Dense(units=200, activation='relu'))
# model.add(Dense(units=500, activation='tanh'))
model.add(Dense(units=5, activation='softmax'))

W0817 12:51:04.195394 4620068288 deprecation.py:506] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [23]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               800       
_________________________________________________________________
dense_1 (Dense)              (None, 200)               20200     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 1005      
Total params: 22,005
Trainable params: 22,005
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=500,
    shuffle=True,
    verbose=2
)

Epoch 1/500
749/749 - 0s - loss: 1.5172 - acc: 0.3111
Epoch 2/500
749/749 - 0s - loss: 1.4819 - acc: 0.3364
Epoch 3/500
749/749 - 0s - loss: 1.4805 - acc: 0.3231
Epoch 4/500
749/749 - 0s - loss: 1.4852 - acc: 0.3057
Epoch 5/500
749/749 - 0s - loss: 1.4710 - acc: 0.3364
Epoch 6/500
749/749 - 0s - loss: 1.4682 - acc: 0.3164
Epoch 7/500
749/749 - 0s - loss: 1.4606 - acc: 0.3164
Epoch 8/500
749/749 - 0s - loss: 1.4562 - acc: 0.3324
Epoch 9/500
749/749 - 0s - loss: 1.4454 - acc: 0.3538
Epoch 10/500
749/749 - 0s - loss: 1.4492 - acc: 0.3271
Epoch 11/500
749/749 - 0s - loss: 1.4413 - acc: 0.3391
Epoch 12/500
749/749 - 0s - loss: 1.4409 - acc: 0.3097
Epoch 13/500
749/749 - 0s - loss: 1.4212 - acc: 0.3471
Epoch 14/500
749/749 - 0s - loss: 1.4044 - acc: 0.3632
Epoch 15/500
749/749 - 0s - loss: 1.3948 - acc: 0.3818
Epoch 16/500
749/749 - 0s - loss: 1.3938 - acc: 0.3805
Epoch 17/500
749/749 - 0s - loss: 1.3914 - acc: 0.4045
Epoch 18/500
749/749 - 0s - loss: 1.3772 - acc: 0.4112
Epoch 19/500
749/74

749/749 - 0s - loss: 1.3217 - acc: 0.4366
Epoch 150/500
749/749 - 0s - loss: 1.3163 - acc: 0.4459
Epoch 151/500
749/749 - 0s - loss: 1.3181 - acc: 0.4366
Epoch 152/500
749/749 - 0s - loss: 1.3141 - acc: 0.4352
Epoch 153/500
749/749 - 0s - loss: 1.3230 - acc: 0.4312
Epoch 154/500
749/749 - 0s - loss: 1.3263 - acc: 0.4206
Epoch 155/500
749/749 - 0s - loss: 1.3221 - acc: 0.4366
Epoch 156/500
749/749 - 0s - loss: 1.3231 - acc: 0.4379
Epoch 157/500
749/749 - 0s - loss: 1.3195 - acc: 0.4286
Epoch 158/500
749/749 - 0s - loss: 1.3207 - acc: 0.4326
Epoch 159/500
749/749 - 0s - loss: 1.3267 - acc: 0.4246
Epoch 160/500
749/749 - 0s - loss: 1.3255 - acc: 0.4419
Epoch 161/500
749/749 - 0s - loss: 1.3166 - acc: 0.4246
Epoch 162/500
749/749 - 0s - loss: 1.3171 - acc: 0.4366
Epoch 163/500
749/749 - 0s - loss: 1.3213 - acc: 0.4259
Epoch 164/500
749/749 - 0s - loss: 1.3166 - acc: 0.4419
Epoch 165/500
749/749 - 0s - loss: 1.3223 - acc: 0.4393
Epoch 166/500
749/749 - 0s - loss: 1.3212 - acc: 0.4272
Epoch 

Epoch 296/500
749/749 - 0s - loss: 1.2909 - acc: 0.4459
Epoch 297/500
749/749 - 0s - loss: 1.2869 - acc: 0.4446
Epoch 298/500
749/749 - 0s - loss: 1.2854 - acc: 0.4419
Epoch 299/500
749/749 - 0s - loss: 1.2879 - acc: 0.4473
Epoch 300/500
749/749 - 0s - loss: 1.2848 - acc: 0.4486
Epoch 301/500
749/749 - 0s - loss: 1.2840 - acc: 0.4499
Epoch 302/500
749/749 - 0s - loss: 1.2853 - acc: 0.4513
Epoch 303/500
749/749 - 0s - loss: 1.2803 - acc: 0.4393
Epoch 304/500
749/749 - 0s - loss: 1.2867 - acc: 0.4446
Epoch 305/500
749/749 - 0s - loss: 1.2887 - acc: 0.4379
Epoch 306/500
749/749 - 0s - loss: 1.2807 - acc: 0.4486
Epoch 307/500
749/749 - 0s - loss: 1.2797 - acc: 0.4326
Epoch 308/500
749/749 - 0s - loss: 1.2812 - acc: 0.4446
Epoch 309/500
749/749 - 0s - loss: 1.2898 - acc: 0.4473
Epoch 310/500
749/749 - 0s - loss: 1.2836 - acc: 0.4433
Epoch 311/500
749/749 - 0s - loss: 1.2815 - acc: 0.4553
Epoch 312/500
749/749 - 0s - loss: 1.2835 - acc: 0.4459
Epoch 313/500
749/749 - 0s - loss: 1.2809 - acc:

Epoch 443/500
749/749 - 0s - loss: 1.2416 - acc: 0.4660
Epoch 444/500
749/749 - 0s - loss: 1.2382 - acc: 0.4619
Epoch 445/500
749/749 - 0s - loss: 1.2440 - acc: 0.4553
Epoch 446/500
749/749 - 0s - loss: 1.2327 - acc: 0.4686
Epoch 447/500
749/749 - 0s - loss: 1.2374 - acc: 0.4633
Epoch 448/500
749/749 - 0s - loss: 1.2403 - acc: 0.4740
Epoch 449/500
749/749 - 0s - loss: 1.2337 - acc: 0.4780
Epoch 450/500
749/749 - 0s - loss: 1.2378 - acc: 0.4646
Epoch 451/500
749/749 - 0s - loss: 1.2392 - acc: 0.4686
Epoch 452/500
749/749 - 0s - loss: 1.2418 - acc: 0.4686
Epoch 453/500
749/749 - 0s - loss: 1.2330 - acc: 0.4726
Epoch 454/500
749/749 - 0s - loss: 1.2358 - acc: 0.4673
Epoch 455/500
749/749 - 0s - loss: 1.2343 - acc: 0.4673
Epoch 456/500
749/749 - 0s - loss: 1.2388 - acc: 0.4660
Epoch 457/500
749/749 - 0s - loss: 1.2309 - acc: 0.4766
Epoch 458/500
749/749 - 0s - loss: 1.2296 - acc: 0.4740
Epoch 459/500
749/749 - 0s - loss: 1.2399 - acc: 0.4579
Epoch 460/500
749/749 - 0s - loss: 1.2335 - acc:

<tensorflow.python.keras.callbacks.History at 0x1a28863828>

In [26]:
X_train_scaled

array([[0.        , 0.64556962, 0.46575342, ..., 0.8       , 0.26      ,
        0.26      ],
       [0.05472637, 0.73417722, 0.5890411 , ..., 0.66666667, 0.99      ,
        0.99      ],
       [0.        , 0.88607595, 0.78082192, ..., 0.63333333, 0.54      ,
        0.54      ],
       ...,
       [0.        , 0.92405063, 0.87671233, ..., 0.9       , 0.18      ,
        0.18      ],
       [0.        , 0.30379747, 0.36986301, ..., 0.8       , 0.38      ,
        0.38      ],
       [0.        , 0.82278481, 0.79452055, ..., 0.86666667, 0.99      ,
        0.99      ]])

In [27]:
y_train_categorical.shape

(749, 5)

In [28]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

500/500 - 0s - loss: 1.3872 - acc: 0.3700
Normal Neural Network - Loss: 1.3871876764297486, Accuracy: 0.3700000047683716


In [29]:
encoded_predictions = model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [30]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: ['Medium' 'Medium' 'Medium' 'VeryLow' 'Medium' 'Medium' 'Low' 'Medium'
 'Medium' 'High']
Actual Labels: ['Medium', 'VeryLow', 'VeryLow', 'VeryLow', 'Medium', 'High', 'VeryLow', 'High', 'Low', 'High']
