In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
import numpy as np
 
#Load Data with pandas, and parse the first column into datetime
# train=pd.read_csv('train.csv', parse_dates = ['Dates'])
# test=pd.read_csv('test.csv', parse_dates = ['Dates'])
data=pd.read_csv('data/combined_weather_crime_lunar_final.csv')
data.head()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Unnamed: 0.1,Unnamed: 0,DATE,CCN,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum
0,0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6
1,1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5
2,2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41
3,3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32
4,4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23


In [2]:
data.drop(data.columns[[0]], axis=1, inplace=True)
data.head()

Unnamed: 0,DATE,CCN,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum
0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6
1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5
2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41
3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32
4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23


In [3]:
data.rename(columns={"CCN": "CRIME-COUNT"}, inplace=True)

In [4]:
data['bins'] = np.nan

In [5]:
data.head()

Unnamed: 0,DATE,CRIME-COUNT,STATION,NAME,PRCP,TMAX,TMIN,Year,Month,Date,Day,Illum,bins
0,1/1/16,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,56,41,2016,1,1,6,0.6,
1,1/2/16,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,46,31,2016,1,2,7,0.5,
2,1/3/16,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,48,29,2016,1,3,1,0.41,
3,1/4/16,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,53,29,2016,1,4,2,0.32,
4,1/5/16,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.0,38,15,2016,1,5,3,0.23,


In [6]:
data.loc[data['CRIME-COUNT']<=75, ['bins']] = 'VeryLow'


In [7]:
data.loc[(data['CRIME-COUNT']>=76) & (data['CRIME-COUNT']<=90), ['bins']] = 'Low'

In [8]:
data.loc[(data['CRIME-COUNT']>=91) & (data['CRIME-COUNT']<=105), ['bins']] = 'Medium'

In [9]:
data.loc[(data['CRIME-COUNT']>=106) & (data['CRIME-COUNT']<=120), ['bins']] = 'High'

In [10]:
data.loc[(data['CRIME-COUNT']>=121, ['bins'])] = 'VeryHigh'

In [11]:
data['bins'].value_counts()

Medium      410
Low         374
High        212
VeryLow     175
VeryHigh     78
Name: bins, dtype: int64

In [12]:
data['CRIME-COUNT'].describe()

count    1249.000000
mean       93.541233
std        17.111019
min        28.000000
25%        82.000000
50%        93.000000
75%       105.000000
max       146.000000
Name: CRIME-COUNT, dtype: float64

In [13]:
data.isnull().any()

DATE           False
CRIME-COUNT    False
STATION        False
NAME           False
PRCP           False
TMAX           False
TMIN           False
Year           False
Month          False
Date           False
Day            False
Illum          False
bins           False
dtype: bool

In [14]:
X = data.drop(["bins", "DATE", "STATION", "NAME", "CRIME-COUNT", "Year"], axis=1)
y = data["bins"]

In [15]:
X.head()

Unnamed: 0,PRCP,TMAX,TMIN,Month,Date,Day,Illum
0,0.0,56,41,1,1,6,0.6
1,0.0,46,31,1,2,7,0.5
2,0.0,48,29,1,3,1,0.41
3,0.0,53,29,1,4,2,0.32
4,0.0,38,15,1,5,3,0.23


In [16]:
X['TMAX'] = X['TMAX'].astype(float)
X['TMIN'] = X['TMIN'].astype(float)
X['Month'] = X['Month'].astype(float)
X['Date'] = X['Date'].astype(float)
X['Day'] = X['Illum'].astype(float)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [18]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [20]:
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [22]:
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=7))
model.add(Dense(units=100, activation='tanh'))
model.add(Dense(units=100, activation='tanh'))
model.add(Dense(units=5, activation='softmax'))

W0817 11:31:39.719125 4597863872 deprecation.py:506] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [23]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               800       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 505       
Total params: 21,505
Trainable params: 21,505
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

Epoch 1/60
936/936 - 0s - loss: 1.5055 - acc: 0.3312
Epoch 2/60
936/936 - 0s - loss: 1.4439 - acc: 0.3675
Epoch 3/60
936/936 - 0s - loss: 1.4188 - acc: 0.3739
Epoch 4/60
936/936 - 0s - loss: 1.3990 - acc: 0.3868
Epoch 5/60
936/936 - 0s - loss: 1.3865 - acc: 0.3921
Epoch 6/60
936/936 - 0s - loss: 1.3793 - acc: 0.3974
Epoch 7/60
936/936 - 0s - loss: 1.3683 - acc: 0.3814
Epoch 8/60
936/936 - 0s - loss: 1.3612 - acc: 0.3964
Epoch 9/60
936/936 - 0s - loss: 1.3456 - acc: 0.3953
Epoch 10/60
936/936 - 0s - loss: 1.3482 - acc: 0.4028
Epoch 11/60
936/936 - 0s - loss: 1.3465 - acc: 0.4049
Epoch 12/60
936/936 - 0s - loss: 1.3405 - acc: 0.4145
Epoch 13/60
936/936 - 0s - loss: 1.3310 - acc: 0.4124
Epoch 14/60
936/936 - 0s - loss: 1.3337 - acc: 0.4081
Epoch 15/60
936/936 - 0s - loss: 1.3296 - acc: 0.4049
Epoch 16/60
936/936 - 0s - loss: 1.3335 - acc: 0.4017
Epoch 17/60
936/936 - 0s - loss: 1.3243 - acc: 0.4188
Epoch 18/60
936/936 - 0s - loss: 1.3231 - acc: 0.4060
Epoch 19/60
936/936 - 0s - loss: 1.31

<tensorflow.python.keras.callbacks.History at 0x1a30e75a90>

In [26]:
X.dtypes

PRCP     float64
TMAX     float64
TMIN     float64
Month    float64
Date     float64
Day      float64
Illum    float64
dtype: object

In [27]:
X_train_scaled

array([[0.125     , 0.8625    , 0.67123288, ..., 0.06666667, 0.03      ,
        0.03      ],
       [0.        , 0.4125    , 0.46575342, ..., 0.06666667, 0.12      ,
        0.12      ],
       [0.        , 0.2875    , 0.2739726 , ..., 0.2       , 0.55      ,
        0.55      ],
       ...,
       [0.        , 0.6       , 0.45205479, ..., 0.93333333, 0.67      ,
        0.67      ],
       [0.        , 0.925     , 0.82191781, ..., 0.6       , 0.48      ,
        0.48      ],
       [0.        , 0.8       , 0.69863014, ..., 0.56666667, 0.94      ,
        0.94      ]])

In [28]:
y_train_categorical.shape

(936, 5)