In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
import numpy as np
 
#Load Data with pandas, and parse the first column into datetime
# train=pd.read_csv('train.csv', parse_dates = ['Dates'])
# test=pd.read_csv('test.csv', parse_dates = ['Dates'])
data=pd.read_csv('data/combined_weather_crime_lunar_final.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,DATE,CCN,STATION,NAME,DAPR,MDPR,PRCP,SNOW,SNWD,...,WT01,WT03,WT04,WT06,WT11,Year,Month,Date,Day,Illum
0,0,2016-01-01,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,...,,,,,,2016,1,1,6,0.6
1,1,2016-01-02,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,0.0,0.0,...,,,,,,2016,1,2,7,0.5
2,2,2016-01-03,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,...,,,,,,2016,1,3,1,0.41
3,3,2016-01-04,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,...,,,,,,2016,1,4,2,0.32
4,4,2016-01-05,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,...,,,,,,2016,1,5,3,0.23


In [2]:
data.drop(data.columns[[0]], axis=1, inplace=True)
data.head()

Unnamed: 0,DATE,CCN,STATION,NAME,DAPR,MDPR,PRCP,SNOW,SNWD,TMAX,...,WT01,WT03,WT04,WT06,WT11,Year,Month,Date,Day,Illum
0,2016-01-01,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,56.0,...,,,,,,2016,1,1,6,0.6
1,2016-01-02,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,0.0,0.0,46.0,...,,,,,,2016,1,2,7,0.5
2,2016-01-03,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,48.0,...,,,,,,2016,1,3,1,0.41
3,2016-01-04,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,53.0,...,,,,,,2016,1,4,2,0.32
4,2016-01-05,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,38.0,...,,,,,,2016,1,5,3,0.23


In [7]:
data.rename(columns={"CCN": "CRIME-COUNT"}, inplace=True)

In [8]:
data['bins'] = np.nan

In [9]:
data.head()

Unnamed: 0,DATE,CRIME-COUNT,STATION,NAME,DAPR,MDPR,PRCP,SNOW,SNWD,TMAX,...,WT03,WT04,WT06,WT11,Year,Month,Date,Day,Illum,bins
0,2016-01-01,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,56.0,...,,,,,2016,1,1,6,0.6,
1,2016-01-02,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,0.0,0.0,46.0,...,,,,,2016,1,2,7,0.5,
2,2016-01-03,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,48.0,...,,,,,2016,1,3,1,0.41,
3,2016-01-04,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,53.0,...,,,,,2016,1,4,2,0.32,
4,2016-01-05,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,38.0,...,,,,,2016,1,5,3,0.23,


In [10]:
data.loc[data['CRIME-COUNT']<75, ['bins']] = 'VeryLow'


In [11]:
data.loc[(data['CRIME-COUNT']>76) & (data['CRIME-COUNT']<90), ['bins']] = 'Low'

In [12]:
data.loc[(data['CRIME-COUNT']>91) & (data['CRIME-COUNT']<105), ['bins']] = 'Medium'

In [13]:
data.loc[(data['CRIME-COUNT']>106) & (data['CRIME-COUNT']<120), ['bins']] = 'High'

In [14]:
data.loc[(data['CRIME-COUNT']>=121, ['bins'])] = 'VeryHigh'

In [15]:
data['bins'].value_counts()

Medium      382
Low         345
High        194
VeryLow     164
VeryHigh     80
Name: bins, dtype: int64

In [16]:
data['CRIME-COUNT'].describe()

count    1320.000000
mean       93.359091
std        17.066382
min        28.000000
25%        82.000000
50%        93.000000
75%       104.000000
max       146.000000
Name: CRIME-COUNT, dtype: float64

In [17]:
data.head()

Unnamed: 0,DATE,CRIME-COUNT,STATION,NAME,DAPR,MDPR,PRCP,SNOW,SNWD,TMAX,...,WT03,WT04,WT06,WT11,Year,Month,Date,Day,Illum,bins
0,2016-01-01,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,56.0,...,,,,,2016,1,1,6,0.6,Medium
1,2016-01-02,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,0.0,0.0,46.0,...,,,,,2016,1,2,7,0.5,Medium
2,2016-01-03,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,48.0,...,,,,,2016,1,3,1,0.41,Medium
3,2016-01-04,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,53.0,...,,,,,2016,1,4,2,0.32,Low
4,2016-01-05,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",,,0.0,,,38.0,...,,,,,2016,1,5,3,0.23,Medium


In [18]:
data.drop(columns=["DAPR", "MDPR", "SNOW", "SNWD", 'WESD', 'WT01', 'WT03', 'WT04', 'WT06', 'WT11'])

Unnamed: 0,DATE,CRIME-COUNT,STATION,NAME,PRCP,TMAX,TMIN,TOBS,Year,Month,Date,Day,Illum,bins
0,2016-01-01,103,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,56.0,41.0,42.0,2016,1,1,6,0.60,Medium
1,2016-01-02,93,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,46.0,31.0,34.0,2016,1,2,7,0.50,Medium
2,2016-01-03,100,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,48.0,29.0,35.0,2016,1,3,1,0.41,Medium
3,2016-01-04,77,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,53.0,29.0,34.0,2016,1,4,2,0.32,Low
4,2016-01-05,92,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,38.0,15.0,17.0,2016,1,5,3,0.23,Medium
5,2016-01-06,88,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,34.0,16.0,22.0,2016,1,6,4,0.16,Low
6,2016-01-07,78,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,41.0,22.0,35.0,2016,1,7,5,0.09,Low
7,2016-01-08,120,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.00,43.0,35.0,40.0,2016,1,8,6,0.04,
8,2016-01-09,89,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.01,44.0,40.0,41.0,2016,1,9,7,0.01,Low
9,2016-01-10,84,USC00186350,"NATIONAL ARBORETUM DC, MD US",0.74,70.0,41.0,65.0,2016,1,10,1,0.00,Low


In [19]:
X = data.drop("bins", axis=1)
y = data["bins"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

ValueError: Input contains NaN

In [60]:
data.columns

Index(['DATE', 'CRIME-COUNT', 'STATION', 'NAME', 'DAPR', 'MDPR', 'PRCP',
       'SNOW', 'SNWD', 'TMAX', 'TMIN', 'TOBS', 'WESD', 'WT01', 'WT03', 'WT04',
       'WT06', 'WT11', 'Year', 'Month', 'Date', 'Day', 'Illum', 'bins'],
      dtype='object')

0       NaN
1       0.0
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      0.0
13      NaN
14      NaN
15      NaN
16      0.0
17      NaN
18      NaN
19      NaN
20      1.2
21      0.0
22      NaN
23      NaN
24      NaN
25      NaN
26      0.0
27      0.0
28      0.0
29      NaN
       ... 
1290    NaN
1291    NaN
1292    NaN
1293    NaN
1294    NaN
1295    NaN
1296    NaN
1297    NaN
1298    NaN
1299    NaN
1300    NaN
1301    NaN
1302    NaN
1303    NaN
1304    NaN
1305    NaN
1306    NaN
1307    NaN
1308    NaN
1309    NaN
1310    NaN
1311    NaN
1312    NaN
1313    NaN
1314    NaN
1315    NaN
1316    NaN
1317    NaN
1318    NaN
1319    NaN
Name: SNOW, Length: 1320, dtype: float64