In [None]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import math
import zipfile
import copy
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.utils import shuffle
from mpl_toolkits.basemap import Basemap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.feature_selection import chi2, VarianceThreshold, SelectKBest
from sklearn.metrics import accuracy_score

In [3]:
modelSVC1 = SVC(probability=True, 
            class_weight='balanced',
            shrinking=False,
            verbose=True)
modelSVC2 = SVC(probability=True,
            class_weight=None,
            shrinking=True,
            verbose=False)
modelLSVC1 = LinearSVC(dual=True,
            class_weight='balanced',
            loss='hinge',
            fit_intercept=True)
modelLSVC2 = LinearSVC(dual=False,
                       loss='squared_hinge', 
                       fit_intercept=True)
modelDT1 = DT(criterion="gini",
            splitter="best",
             class_weight="balanced")
modelDT2 = DT(criterion="entropy",
             splitter="random",
             class_weight=None)

In [4]:
models = {'SVC1' : modelSVC1}

In [5]:
#Expects sampled data already
def preprocess(to_process):
    #remove 0s and nans
    to_process = to_process.loc[~(to_process==0).all(axis=1)]
    to_process.dropna()
    
    
    #convert X to positive (all are negative and it causes problems later on)
    to_process['X'] = to_process['X'].map(lambda x: abs(x))
    
    #Create new features from the date data
    to_process['Year'] = to_process['Dates'].map(lambda x: x.year)
    to_process['Week'] = to_process['Dates'].map(lambda x: x.week)
    to_process['Hour'] = to_process['Dates'].map(lambda x: x.hour)
    
    #drop descript and resolution because they aren't in the test set so they can't be used
    del to_process['Dates']
    del to_process['Descript']
    del to_process['Resolution']
    del to_process['Address']
    
    le = preprocessing.LabelEncoder()
    to_process['Category'] = le.fit_transform(to_process['Category'])
    to_process['DayOfWeek'] = le.fit_transform(to_process['DayOfWeek'])
    to_process['PdDistrict'] = le.fit_transform(to_process['PdDistrict'])
    
    return to_process

In [6]:
#Selects K best features using the featureSelector selector and verifying they fit a variance threshhold
def extract_features(to_process, to_process_predicted, featureSelector, threshold=.8, kBest=1):
    selector = VarianceThreshold() 
    selector.fit_transform(to_process, threshold)
    
    #fit the data to the VarianceThreshold
    selector = SelectKBest(featureSelector, k=kBest)
    selector.fit(to_process, to_process_predicted)
    
    # Get idxs of columns to keep
    idxs_selected = selector.get_support(indices=True)
    #Create new dataframe with only desired columns, or overwrite existing
    data_new = to_process[idxs_selected]
    
    return data_new

In [7]:
def model_kfolds(sampled_data, y_val, model_dict, kFolds=5):
    results = []
    
    #metrics to score with
    metrics = ['accuracy', 'f1_micro', 'precision_micro', 'recall_micro']#, 'precision', 'recall']
    
    for name, model in model_dict.items():
        #copy the data before each use
        data_copy = copy.deepcopy(sampled_data)
        
        
        scores = cross_validate(model, 
                                data_copy, 
                                data_copy[y_val], 
                                scoring=metrics, 
                                cv=kFolds)
        
        results.append([name, scores])
        
    return results

In [8]:
def pipeline(train_filename, test_filename, sample_size, kFolds=True):
    #get the data from the CSVs
    train = pd.read_csv(train_filename, parse_dates=['Dates'])
    test = pd.read_csv(test_filename, parse_dates=['Dates'])
    
    #sample data
    sampled_data = train.sample(n=sample_size)
    
    #preprocesses data by converting to nominal features, removing zeros,
    #deleting unusable features, creating new features, and modifying negatives
    processed_data = preprocess(sampled_data)
    
    cats = [np.array(data.Category), np.array(data.transformed_cat)]
    
    del processed_data['Category']
    
    #select K best features
    final = extract_features(processed_data, processed_data['Category'], chi2, threshold=.8, kBest=5)
    #print(final)
    
    
    
    #get models
    results = model_kfolds(final, "Category", models)
    
    
    #return performance metrics 
    results = [{x[0] : {'accuracy' : x[1]['test_accuracy'],
                        'precision' : x[1]['test_precision_micro'],
                        'recall' : x[1]['test_recall_micro'],
                        'f1' : x[1]['test_f1_micro']}} for x in results]
    
    return results

# Sampling

In [21]:
data = pd.read_csv('train.csv', parse_dates=['Dates'])

In [22]:
data_sample = data.sample(5000)

In [23]:
data_sample.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
821148,2003-10-03 13:45:00,SEX OFFENSES FORCIBLE,SEXUAL BATTERY,Friday,PARK,NONE,1400 Block of SCOTT ST,-122.437674,37.783209
82731,2014-04-05 18:00:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Saturday,SOUTHERN,NONE,HARRISON ST / 8TH ST,-122.408518,37.77376
445186,2009-02-05 15:40:00,ROBBERY,ROBBERY ON THE STREET WITH A DANGEROUS WEAPON,Thursday,MISSION,JUVENILE BOOKED,2700 Block of MISSION ST,-122.418587,37.753019
278519,2011-07-28 18:00:00,LARCENY/THEFT,PETTY THEFT OF PROPERTY,Thursday,SOUTHERN,NONE,800 Block of MARKET ST,-122.407634,37.784189
414045,2009-07-22 15:30:00,WARRANTS,ENROUTE TO OUTSIDE JURISDICTION,Wednesday,TENDERLOIN,"ARREST, BOOKED",EDDY ST / MASON ST,-122.409313,37.784348


In [24]:
data_sample.shape

(5000, 9)

# Preprocessing

In [25]:
p_data = preprocess(data_sample)

In [26]:
p_data.shape

(5000, 8)

In [27]:
p_data.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,Year,Week,Hour
821148,26,0,5,122.437674,37.783209,2003,40,13
82731,15,2,7,122.408518,37.77376,2014,14,18
445186,23,4,3,122.418587,37.753019,2009,6,15
278519,15,4,7,122.407634,37.784189,2011,30,18
414045,33,6,9,122.409313,37.784348,2009,30,15


# Feature Selection

In [28]:
p_final = extract_features(p_data, p_data.Category, chi2, threshold=.8, kBest=5)

In [29]:
p_final.head()

Unnamed: 0,Category,DayOfWeek,PdDistrict,Week,Hour
821148,26,0,5,40,13
82731,15,2,7,14,18
445186,23,4,3,6,15
278519,15,4,7,30,18
414045,33,6,9,30,15


In [30]:
p_final.shape

(5000, 5)

# kFolds Cross Validation

In [33]:
results = model_kfolds(p_final, "Category", models)
    
    
#return performance metrics 
results = [{x[0] : {'accuracy' : x[1]['test_accuracy'].mean(),
                    'precision' : x[1]['test_precision_micro'].mean(),
                    'recall' : x[1]['test_recall_micro'].mean(),
                    'f1' : x[1]['test_f1_micro'].mean()}} for x in results]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [32]:
results

[{'SVC1': {'accuracy': 0.43990409209142306,
   'f1': 0.43990409209142306,
   'precision': 0.43990409209142306,
   'recall': 0.43990409209142306}}]