In [1]:
import pandas as pd

import numpy as np
from numpy import dot as dot

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import math
import copy

In [2]:
def modify_data(df):
    """Run the configured feature pipeline and slice the frame into model inputs.

    Every callable in ``feature_modifications`` is applied to ``df`` in order.
    The resulting frame is then split into three pieces.

    Args:
        df: raw DataFrame; it must contain a ``Category`` column and whatever
            columns the feature-modification functions read.

    Returns:
        tuple ``(x, y, df_)`` where ``x`` holds the ``feature_cols`` columns,
        ``y`` is whatever ``output_col_mod`` returns for the transformed frame,
        and ``df_`` holds the feature columns plus ``Category``.
    """
    for transform in feature_modifications:
        df = transform(df)

    x = df[feature_cols]
    # Progress/debug output: shows which columns are used as features.
    print(feature_cols)

    # Build a new list so the module-level feature_cols is never mutated.
    columns_with_label = feature_cols + ['Category']
    df_ = df[columns_with_label]

    y = output_col_mod(df)
    return x, y, df_

def date_time_split(df):
    """Derive ``Time`` and ``Time_Class`` columns from the ``Dates`` column.

    ``Time`` is the number of minutes since midnight. ``Time_Class`` is the
    value from ``time_bins`` belonging to the smallest key that the hour is
    strictly less than.

    Args:
        df: DataFrame with a ``Dates`` column parseable by ``pd.to_datetime``.

    Returns:
        The same DataFrame object, with the two columns added in place.
    """
    # Parse the timestamps once; parsing is the expensive part on large frames.
    timestamps = pd.to_datetime(df['Dates'])
    hours = timestamps.dt.hour
    minutes = timestamps.dt.minute

    # Sort the thresholds so the lookup does not depend on the order in which
    # the keys happen to be written in the time_bins dict.
    ordered_bins = sorted(time_bins.items())

    def time_bin(hour):
        for upper_bound, label in ordered_bins:
            if hour < upper_bound:
                return label
        # Hour is not below any threshold: no bin applies.
        return None

    df['Time'] = hours * 60 + minutes
    df['Time_Class'] = hours.map(time_bin)
    return df

def day_map(df):
    """Return a copy of ``df`` with ``DayOfWeek`` values replaced via ``day_mapping``.

    Only the ``DayOfWeek`` column is touched; values that are not keys of
    ``day_mapping`` are left as they are.
    """
    replacements = {"DayOfWeek": day_mapping}
    return df.replace(replacements)

def round_xy(df):
    """Round the ``X`` and ``Y`` columns to 2 decimals when they are features.

    A column is rounded only if its name appears in ``feature_cols``. When
    neither name is listed, ``df`` is returned untouched.
    """
    decimals = {axis: 2 for axis in ('X', 'Y') if axis in feature_cols}
    return df.round(decimals) if decimals else df

def drop_na(df):
    """Return ``df`` without the rows that have a missing value in any feature column.

    Only the columns named in ``feature_cols`` are checked; missing values in
    other columns do not cause a row to be dropped.
    """
    required_columns = feature_cols
    return df.dropna(subset=required_columns)

# Encode Category as integer class codes: categories that are keys of
# output_col_mapping keep their own code, every other category is folded
# into the "OTHER OFFENSES" code.
def output_col_mod(df):
    """Build the integer-encoded target column from ``df['Category']``.

    Args:
        df: DataFrame with a ``Category`` column.

    Returns:
        A single-column DataFrame named ``Category`` holding the integer codes,
        re-indexed from 0 regardless of the index of ``df``.
    """
    codes = df[['Category']].Category.map(output_col_mapping)
    # Categories missing from the mapping come back as NaN; give them the catch-all code.
    codes = codes.fillna(output_col_mapping["OTHER OFFENSES"]).astype(int)
    encoded = codes.to_frame().reset_index()
    return encoded[['Category']]

In [3]:
# Directory holding the dataset files. get_data() concatenates this string
# directly with 'train.csv', so it must end with a path separator.
dataset_dir = '../CS383_datasets/'

# Columns used as model features. Include any columns that are produced by the
# feature modification functions below (e.g. 'Time' and 'Time_Class' come from
# date_time_split).
# All feature columns are expected to be numeric.
# X and Y are rounded by round_xy only if they are listed in feature_cols.
# drop_na should always be run last in feature_modifications so that it sees
# the columns created by the earlier steps.
feature_cols = ['Time', 'Time_Class', 'DayOfWeek', 'X', 'Y']
# Transformations applied in order by modify_data(); each takes and returns a DataFrame.
feature_modifications = [
    date_time_split,
    day_map,
    round_xy,
    drop_na
]


# Name of the target column.
# NOTE(review): 'Category' is hard-coded in modify_data/output_col_mod, so this
# variable does not appear to be read by the cells shown here - confirm.
output_col = 'Category'

# Target encoding: each listed category gets its own integer class code.
# The number of entries is the number of classes the models predict.
output_col_mapping = {
    'LARCENY/THEFT': 0, 
    'NON-CRIMINAL': 1, 
    'ASSAULT': 2,
    'DRUG/NARCOTIC': 3,
    'VEHICLE THEFT': 4,
#     Catch-all class: 'OTHER OFFENSES' itself plus every category not listed above
    'OTHER OFFENSES': 5,
}

# {'LARCENY/THEFT': 174900, 'OTHER OFFENSES': 126182, 'NON-CRIMINAL': 92304, 'ASSAULT': 76876, 
# 'DRUG/NARCOTIC': 53971, 'VEHICLE THEFT': 53781, 'VANDALISM': 44725, 'WARRANTS': 42214, 
# 'BURGLARY': 36755, 'SUSPICIOUS OCC': 31414, 'MISSING PERSON': 25989, 'ROBBERY': 23000, 
# 'FRAUD': 16679, 'FORGERY/COUNTERFEITING': 10609, 'SECONDARY CODES': 9985, 'WEAPON LAWS': 8555, 
# 'PROSTITUTION': 7484, 'TRESPASS': 7326, 'STOLEN PROPERTY': 4540, 'SEX OFFENSES FORCIBLE': 4388, 
# 'DISORDERLY CONDUCT': 4320, 'DRUNKENNESS': 4280, 'RECOVERED VEHICLE': 3138, 'KIDNAPPING': 2341, 
# 'DRIVING UNDER THE INFLUENCE': 2268, 'RUNAWAY': 1946, 'LIQUOR LAWS': 1903, 'ARSON': 1513, 
# 'LOITERING': 1225, 'EMBEZZLEMENT': 1166, 'SUICIDE': 508, 'FAMILY OFFENSES': 491, 'BAD CHECKS': 406, 
# 'BRIBERY': 289, 'EXTORTION': 256, 'SEX OFFENSES NON FORCIBLE': 148, 'GAMBLING': 146, 
# 'PORNOGRAPHY/OBSCENE MAT': 22, 'TREA': 6}


# Maps an hour-of-day upper bound to a time-of-day class.
# Keys are evaluated as: hour is less than key (first matching key wins).
# List the keys in ascending order so the first match is the tightest bound.
# Cycle runs from midnight 00:00 to 23:59
time_bins = {
    4: 0, # Before 4am is 0
    6: 1, # Before 6am is 1
    12: 2, # Before 12pm is 2
    18: 3, # Before 6pm is 3
    24: 4, # Before midnight is 4
}

# Numeric encoding for the DayOfWeek column (used by day_map), Sunday = 0 .. Saturday = 6.
day_mapping = {
    'Sunday': 0,
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
}

In [4]:
def test_train_split(x, y, shuffle=True, test_size=0.2):
    """Split features and labels into train/test parts with a fixed seed.

    Thin wrapper around sklearn's ``train_test_split`` that pins
    ``random_state=0`` so repeated runs give the same split.

    Args:
        x: feature data.
        y: labels aligned with ``x``.
        shuffle: whether to shuffle before splitting.
        test_size: size of the test part, forwarded to ``train_test_split``.

    Returns:
        Whatever ``train_test_split`` returns for two inputs
        (x_train, x_test, y_train, y_test).
    """
    split_options = dict(test_size=test_size, random_state=0, shuffle=shuffle)
    return train_test_split(x, y, **split_options)

# Standardize by subtracting the column mean and dividing by the column standard deviation
def standardize_data(training_data, testing_data=None):
    """Standardize the training data and, optionally, the testing data.

    The scaler is fitted on the training data only, so statistics from the
    test set never leak into the transformation.

    Args:
        training_data: DataFrame of training features.
        testing_data: optional DataFrame of testing features with the same
            columns. When omitted, only the training data is transformed.

    Returns:
        tuple ``(training_scaled, testing_scaled)`` of arrays;
        ``testing_scaled`` is ``None`` when ``testing_data`` is ``None``.
    """
    scaler = StandardScaler()
    training_values = training_data.to_numpy()
    scaler.fit(training_values)
    training_scaled = scaler.transform(training_values)

    # The signature advertises testing_data as optional; honour that instead of
    # failing on None.to_numpy().
    if testing_data is None:
        return training_scaled, None

    testing_scaled = scaler.transform(testing_data.to_numpy())
    return training_scaled, testing_scaled

# Load the raw CSV and produce standardized train/test splits
def get_data():
    """Load ``train.csv`` and return model-ready train/test data.

    Steps: read the CSV from ``dataset_dir``, run ``modify_data``, split with
    ``test_train_split``, then standardize the feature splits with
    ``standardize_data``.

    Returns:
        tuple ``(X_train, X_test, y_train, y_test, df)`` where the first two
        are the standardized feature splits, the next two are the label splits
        converted with ``.to_numpy()``, and ``df`` is the third value returned
        by ``modify_data`` (the unstandardized frame).
    """
    # Seed NumPy's global RNG for reproducibility
    np.random.seed(0)

    raw = pd.read_csv(dataset_dir + 'train.csv')
    features, labels, df = modify_data(raw)

    split = test_train_split(features, labels)
    X_train, X_test, y_train, y_test = split

    # Standardize both splits using statistics from the training split
    # (feature columns only)
    X_train, X_test = standardize_data(X_train, X_test)

    # Seed again so callers start from a known RNG state even if a
    # preparation step above consumed random numbers
    np.random.seed(0)

    return X_train, X_test, y_train.to_numpy(), y_test.to_numpy(), df

In [5]:
X_train, X_test, y_train, y_test, df = get_data()
# df is the pandas DataFrame (feature columns plus 'Category') before standardization.
# X_train and X_test are the standardized feature arrays; their columns follow feature_cols order.
# y_train and y_test are single-column arrays of integer category codes (see output_col_mapping).
# Currently Feature Cols are: ['Time', 'Time_Class', 'DayOfWeek', 'X', 'Y']
print(X_train)
print(X_test)
print(y_train)
print(y_test)
print(df)
print('Number of rows in the Training Dataset', X_train.shape[0])
print('Number of rows in the Testing Dataset', X_test.shape[0])

['Time', 'Time_Class', 'DayOfWeek', 'X', 'Y']
[[-0.72389912 -0.55441529  0.97616645 -0.2431428   0.04298224]
 [-0.19307212  0.23012869 -1.54299048  0.08688813 -0.00188857]
 [-0.86359043 -0.55441529 -1.54299048  0.74694999  0.06541764]
 ...
 [ 1.10478719  1.01467266 -0.53532771  0.41691906  0.02054683]
 [-0.36832122 -0.55441529 -1.54299048  0.41691906  0.04298224]
 [-0.05084096  0.23012869  1.47999784  1.07698092 -0.11406559]]
[[ 0.57142035  0.23012869  0.97616645  1.07698092 -0.02432397]
 [ 0.72381088  1.01467266 -0.03149632 -0.2431428  -0.02432397]
 [ 0.42664935  0.23012869 -0.53532771  0.41691906 -0.04675938]
 ...
 [-0.57150859 -0.55441529  1.47999784  1.40701185 -0.09163019]
 [-0.15751433  0.23012869 -1.03915909  0.41691906  0.02054683]
 [-0.54611017 -0.55441529  1.47999784  0.74694999  0.02054683]]
[[0]
 [5]
 [0]
 ...
 [0]
 [3]
 [2]]
[[5]
 [5]
 [5]
 ...
 [5]
 [5]
 [2]]
        Time  Time_Class  DayOfWeek       X      Y                Category
0       1433           4          3 -12

In [6]:
# First 1000 test rows (features and labels), used as the evaluation subset for the KNN cells
X_test_sub, y_test_sub = X_test[:1000], y_test[:1000]

In [7]:
# Append the label column to the features so every row carries its class in the last position
trainset = np.append(arr=X_train, values=y_train, axis=1)
# Test subset for our own KNN: the same rows as X_test_sub / y_test_sub, combined into one array
testset = np.append(arr=X_test_sub, values=y_test_sub, axis=1)

## Finding best k

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Candidate values of K: odd numbers from 1 to 49
neighbors = list(range(1,50,2))
# Mean cross-validated accuracy for each candidate K, in the same order as `neighbors`
cv_scores = [ ]
# Perform 10-fold cross-validation for every K.
# NOTE: this fits 10 models per K on the full training set and is very slow.
for K in neighbors:
    knn = KNeighborsClassifier(n_neighbors = K)
    # y_train is an (n, 1) column vector; sklearn expects a 1-D target and
    # emits a DataConversionWarning on every fit otherwise, so flatten it.
    scores = cross_val_score(knn, X_train, y_train.ravel(), cv = 10,
                             scoring = "accuracy")
    cv_scores.append(scores.mean())

In [0]:
# Convert accuracy to misclassification error.
# (The name `mse` is kept because the plotting cell reads it; it is NOT a mean squared error.)
mse = [1 - score for score in cv_scores]
# Determine the best K: the position of the lowest error (first one on ties)
optimal_k = neighbors[min(range(len(mse)), key=mse.__getitem__)]
print("The optimal no. of neighbors is {}".format(optimal_k))

In [0]:
import matplotlib.pyplot as plt
# Misclassification error as a function of K (lower is better)
plt.xlabel("Number of Neighbors K")
plt.ylabel("Misclassification Error")
plt.plot(neighbors, mse)
plt.show()

# KNN from sklearn

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# instantiate learning model (k = 10)
knn = KNeighborsClassifier(n_neighbors=10)

# fitting the model
# (labels are flattened: sklearn expects a 1-D target and warns on an (n, 1) column vector)
knn.fit(X_train, y_train.ravel())

# predict the response for the 1000-row test subset
pred = knn.predict(X_test_sub)

# evaluate accuracy
print("accuracy: {}".format(accuracy_score(y_test_sub.ravel(), pred)))

  


accuracy: 0.447


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# instantiate learning model (k = 49)
knn = KNeighborsClassifier(n_neighbors=49)

# fitting the model
# (labels are flattened: sklearn expects a 1-D target and warns on an (n, 1) column vector)
knn.fit(X_train, y_train.ravel())

# predict the response for the 1000-row test subset
pred = knn.predict(X_test_sub)

# evaluate accuracy
print("accuracy: {}".format(accuracy_score(y_test_sub.ravel(), pred)))

  


accuracy: 0.495


# Our KNN (from-scratch implementation)

In [10]:
# Euclidean distance between two points
import math
def Euclideandist(x,xi, length):
    """Return the Euclidean distance between ``x`` and ``xi``.

    Only the first ``length`` components of each sequence are compared, which
    lets callers exclude trailing non-feature entries.

    Args:
        x: first point (indexable, values convertible to float).
        xi: second point (indexable, values convertible to float).
        length: number of leading components to include.

    Returns:
        The distance as a float.
    """
    squared_total = 0.0
    for position in range(length):
        difference = float(x[position]) - float(xi[position])
        squared_total += pow(difference, 2)
    return math.sqrt(squared_total)

In [11]:
# Get the K training rows with the smallest Euclidean distance to the test instance
import heapq
import operator
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` rows of ``trainingSet`` closest to ``testInstance``.

    The last element of ``testInstance`` is treated as its label and excluded
    from the distance; distances come from ``Euclideandist`` over the
    remaining leading components.

    Args:
        trainingSet: sequence of rows (features followed by the label).
        testInstance: one row in the same layout.
        k: number of neighbours wanted. If ``k`` exceeds the number of
            training rows, all rows are returned instead of raising
            ``IndexError``; a non-positive ``k`` yields an empty list.

    Returns:
        list of training rows ordered from nearest to farthest; rows at equal
        distance keep their order from ``trainingSet``.
    """
    # Every component except the trailing label takes part in the distance.
    length = len(testInstance) - 1
    scored_rows = (
        (row, Euclideandist(testInstance, row, length))
        for row in trainingSet
    )
    # nsmallest is equivalent to sorted(...)[:k] but avoids sorting every row
    # when only the k nearest are needed.
    nearest = heapq.nsmallest(k, scored_rows, key=operator.itemgetter(1))
    return [row for row, _ in nearest]

In [12]:
# Majority vote: tally the class of each neighbour and return the most frequent one
def getResponse(neighbors):
    """Return the class that occurs most often among ``neighbors``.

    The class of a neighbour is its last element. When several classes share
    the highest count, the one that was encountered first wins.

    Args:
        neighbors: non-empty sequence of rows whose last element is the class.

    Returns:
        The winning class value.
    """
    tally = {}
    for row in neighbors:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # Stable sort by vote count, highest first; entry [0] is the winner and
    # its first field is the class itself.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner = ranked[0]
    return winner[0]

In [13]:
# Percentage of rows whose true class matches the prediction
def getAccuracy(testSet, predictions):
    """Return the prediction accuracy as a percentage (0.0 - 100.0).

    Args:
        testSet: sequence of rows whose last element is the true class.
        predictions: predicted classes, aligned by position with ``testSet``.

    Returns:
        ``100 * correct / len(testSet)`` as a float.
    """
    total = len(testSet)
    hits = sum(
        1 for position in range(total)
        if testSet[position][-1] == predictions[position]
    )
    return (hits/float(total)) * 100.0

In [None]:
# Generate predictions with our own KNN.
# This takes a long time; the run time grows with the size of the test set.
# Note: `neighbors` is reassigned here, replacing the list of candidate K values.
predictions=[]
k = 49
for test_row in testset:
    neighbors = getNeighbors(trainset, test_row, k)
    result = getResponse(neighbors)
    predictions.append(result)
    print('> predicted=' + repr(result) + ', actual=' + repr(test_row[-1]))

> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=0.0, actual=2.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=2.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=3.0
> predicted=0.0, actual=5.0
> predicted=0.0, actual=4.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=2.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=3.0
> predicted=5.0, actual=2.0
> predicted=0.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=2.0
> predicted=5.0, actual=5.0
> predicted=0.0, actual=5.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=0.0, act

> predicted=0.0, actual=0.0
> predicted=5.0, actual=2.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=2.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=0.0
> predicted=0.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=2.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=3.0
> predicted=0.0, actual=5.0
> predicted=0.0, actual=0.0
> predicted=0.0, actual=5.0
> predicted=5.0, actual=2.0
> predicted=0.0, actual=3.0
> predicted=5.0, actual=3.0
> predicted=5.0, actual=1.0
> predicted=0.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, act

> predicted=5.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=0.0, actual=0.0
> predicted=5.0, actual=0.0
> predicted=0.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=0.0, actual=0.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=4.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=3.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=5.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=0.0
> predicted=0.0, actual=0.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=0.0
> predicted=5.0, actual=1.0
> predicted=5.0, actual=1.0
> predicted=5.0, act

In [0]:
# Score our KNN: each row of y_test_sub holds only the label, so its last
# element is the true class that getAccuracy compares against.
accuracy = getAccuracy(y_test_sub, predictions)
print(''.join(['Accuracy: ', repr(accuracy), '%']))