### Necessary Lab Imports

In [63]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.grid_search import GridSearchCV

#### Read in the Data

In [74]:
# Load the San Francisco crime training data from a path relative to the
# notebook and preview the first rows as the start of the exploratory
# data analysis. Data issues that are identified should be resolved, with
# the reasoning written down next to the fix.

sf_crime = pd.read_csv('datasets/sf_crime_train.csv')
sf_crime.head()

In [75]:
# 'Dates' holds date-time values; check whether it is currently stored as a
# generic object column (dtype 'O') rather than a datetime dtype.
sf_crime.Dates.dtype

dtype('O')

### Create column for hour, month, and year from 'Dates' column.
- Hint: pd.to_datetime may be helpful.


In [None]:
# pd.to_datetime is not used here; the 'Dates' strings are split directly with
# pandas' vectorised string methods. expand=True returns a DataFrame that
# keeps sf_crime's index, so an index-based merge back onto sf_crime lines up
# row-for-row, and a missing value stays NaN instead of breaking the
# DataFrame constructor. All resulting columns hold strings.

# sf_time: the date and the time in separate columns (split on the first
# space only).
sf_time = sf_crime['Dates'].str.split(' ', n=1, expand=True)
sf_time.columns = ['date', 'time']

# sf_date: the date part broken into month, day and year (split on '/').
sf_date = sf_time['date'].str.split('/', expand=True)
sf_date.columns = ['month', 'day', 'year']

In [None]:
# Attach the month/day/year columns and then the date/time columns to the main
# frame. Rows are matched on the index; the outer join keeps every index label
# present on either side.
for date_parts in (sf_date, sf_time):
    sf_crime = sf_crime.merge(date_parts, how='outer', left_index=True, right_index=True)


In [None]:
# Preview the current DataFrame. Only the first rows are shown: evaluating the
# bare frame would render every record into the notebook output.
sf_crime.head()

In [94]:
# Drop the columns where the date is expressed as a human-readable string:
# the raw 'Dates' value and the intermediate 'date' part.
# NOTE: the drop is done in place, so re-running this cell after it has
# succeeded raises a KeyError because the columns are already gone.
sf_crime.drop(['Dates','date'], axis = 1, inplace = True)

In [43]:
# I know Matt messed with some values, so each column was checked with
# value_counts(). The checks are kept as comments together with what they
# showed. (The notes are real comments rather than bare string literals so
# that the describe() call at the bottom is the cell's last expression and
# its table is what the cell displays.)

# sf_crime['Dates'].value_counts()
#   -> Dates look good

# sf_crime['Category'].value_counts()
#   -> 1 Trespassing, all others are trespass; 1 Assualt (misspelled)

# sf_crime['Descript'].value_counts()
#   -> data is too diverse, keywords are going to be what matters here

# sf_crime['DayOfWeek'].value_counts()
#   -> all days of the week are there

# sf_crime['PdDistrict'].value_counts()
#   -> values look good

# sf_crime['Resolution'].value_counts()
#   -> 1 non prosecuted. Seems legit

# Summary statistics for the coordinate columns.
#   -> all coordinates appear to be relative
sf_crime[['X','Y']].describe()

Unnamed: 0,X,Y
count,18000.0,18000.0
mean,-122.423639,37.768466
std,0.026532,0.024391
min,-122.513642,37.708154
25%,-122.434199,37.753838
50%,-122.416949,37.775608
75%,-122.406539,37.78539
max,-122.365565,37.819923


In [96]:
# Figure out where the wrong 'Category' labels sit in the DataFrame.
# Both bad labels are matched in a single filter so every affected row is
# displayed: a cell only renders its last expression, so two separate filters
# would show just the second one.
# Originally found: 'ASSUALT' in rows 2750 and 4330, 'TRESPASSING' in row 5519.
sf_crime[sf_crime['Category'].isin(['ASSUALT', 'TRESPASSING'])]

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,month,day,year,time


In [None]:
# The issues with the data are few enough to be corrected directly.
# The bad labels are replaced by value rather than by hard-coded row number,
# so the fix still applies if the rows are reordered or filtered, is safe to
# re-run, and does not leave the whole DataFrame as the cell output.
# Rows originally identified: 2750 and 4330 -> 'ASSAULT', 5519 -> 'TRESPASS'.
category_fixes = {'ASSUALT': 'ASSAULT', 'TRESPASSING': 'TRESPASS'}
sf_crime['Category'] = sf_crime['Category'].replace(category_fixes)

##### This was probably not the most efficient way to clean this.  If the dataset had come with a data dictionary, I would have used it to identify and replace any values that were not valid categories.

### Build a logit model predicting violent crime versus non-violent crime versus non-crimes.

##### Non-Violent Crimes: 
bad checks, bribery, drug/narcotic, drunkenness, embezzlement, forgery/counterfeiting, fraud, gambling, liquor, loitering, trespass.

#### Non-Crimes: 
non-criminal, runaway, secondary codes,  suspicious occ, warrants.

#### Violent Crimes: 
everything else.



##### Hint: What type of model do you need here? What should your "baseline" category be?
Multinomial (multiclass) logistic regression.  Our baseline will probably be Violent, as that is the left-over group, so to speak.
However, it would also make sense to set the baseline to the class that has the most observations.


In [160]:
# Collapse the detailed 'Category' values into the three overall classes used
# as the model target.

# Sub-categories treated as non-crimes.
zeros = ['non-criminal', 'runaway', 'secondary codes', 'suspicious occ', 'warrants']
# Sub-categories treated as non-violent crimes.
# NOTE(review): 'other offenses' is not in the non-violent list given in the
# task description - confirm that including it here is intentional.
ones  = ['bad checks', 'bribery', 'drug/narcotic', 'drunkenness', 'embezzlement', 'forgery/counterfeiting', 'fraud',
         'gambling','liquor', 'loitering', 'trespass', 'other offenses']
# Everything else counts as violent.


def _overall_class(category):
    """Return 'non-crime', 'non-violent' or 'violent' for one Category value.

    The comparison is case-insensitive: the value is lower-cased before it is
    looked up in the `zeros` and `ones` lists.
    """
    name = category.lower()
    if name in zeros:
        return 'non-crime'
    if name in ones:
        return 'non-violent'
    return 'violent'


# One overall class per row, in row order, stored as a new column.
crime_cat = [_overall_class(category) for category in sf_crime['Category']]
sf_crime['cat_number'] = crime_cat

In [139]:
# One-hot encode the three categorical predictors. drop_first=True leaves out
# the first level of each variable.
categorical_cols = ['DayOfWeek', 'PdDistrict', 'Resolution']
dummies = pd.get_dummies(sf_crime[categorical_cols], drop_first=True)

# Attach the indicator columns to the main frame, matching rows on the index.
sf_crime = sf_crime.merge(dummies, how='outer', left_index=True, right_index=True)

### Build a logit model predicting violent crime vs. non-violent crime vs. non-crimes.

In [161]:
# Build the feature matrix X and the target y.
# Dropped from X: the categorical columns that are either not considered
# relevant or have already been converted to dummies, the raw coordinates,
# and the target itself ('cat_number') together with the 'Category' column it
# was derived from.
# NOTE(review): the month/day/year/time columns produced by the string split
# are still part of X - confirm they are numeric before fitting.
non_feature_cols = ['Category','Descript','DayOfWeek','PdDistrict','Resolution',
                    'Address','X','Y','cat_number']
X = sf_crime.drop(non_feature_cols, axis = 1)
y = sf_crime['cat_number']

In [191]:
# Cross-validated logistic regression with five folds and lasso (L1)
# regularization.
# Cs=15 asks for a grid of 15 candidate values; each candidate is the inverse
# of the regularization strength, so smaller values regularize more strongly.
# The liblinear solver is used with the L1 penalty.
logreg_cv = LogisticRegressionCV(solver='liblinear', Cs=15, cv=5, penalty='l1')



#### Here is a reference table for using Regularization.
- Regularization - Strength of our regularization
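- C  -  Inverse of the regularization strength: smaller values specify stronger regularization
- Cs -  The grid of C values that LogisticRegressionCV tries (an integer asks for that many values on a log scale between 1e-4 and 1e4). As with C, smaller values specify stronger regularization.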
- Area - Refering do those Diamonds and Circles when visualizing Regularization

|Regularization| Decrease | Increase |
|--------------|----------|----------|
|Penalty       | Decrease | Increase |
|C             | Increase | Decrease |
|Cs            | Increase | Decrease |
|Area          | Increase | Decrease |

##### Solver = Algorithm used for Optimization.  
    - Newton-cg - Handles Multinomial Loss, L2 only
    - Sag - Handles Multinomial Loss, Large Datasets, L2 only, works best on scaled data
    - lbfgs - Handles Multinomial Loss, L2 only
    - Liblinear - Small Datasets, no Warm Starts
##### Cs = The candidate C values to search.  Decreasing them increases the penalty and the effect of regularization because it shrinks the constraint region.
##### CV = CrossValidations or number of folds
##### Penalty = Regularization Tactic, l1 - LASSO, l2 - Ridge 

In [190]:
# Train/test split with a fixed seed so the split is reproducible.
split_parts = train_test_split(X, y, random_state=12)
X_train, X_test, y_train, y_test = split_parts

# Five-fold cross-validated L1 logistic regression over a grid of 15 C
# values, scored on accuracy and fit on the training portion only.
logreg_cv = LogisticRegressionCV(
    penalty='l1',
    solver='liblinear',
    Cs=15,
    cv=5,
    scoring='accuracy',
)
logreg_cv.fit(X_train, y_train)

LogisticRegressionCV(Cs=15, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           refit=True, scoring='accuracy', solver='liblinear', tol=0.0001,
           verbose=0)

In [164]:
# Find the best C per class.
# The fitted logreg_cv exposes the selected C values in C_; they are paired
# position-by-position with the labels in classes_ to build a
# {class label: best C} lookup.
print('best C for class:')
best_C = dict(zip(logreg_cv.classes_, logreg_cv.C_))
print(best_C)

best C for class:
{'non-violent': 0.071968567300115138, 'violent': 0.26826957952797248, 'non-crime': 2682.695795279722}


In [166]:
def _lasso_logit(c_value):
    """Return an unfitted one-vs-rest logistic regression with an L1 (lasso)
    penalty, the liblinear solver and C set to c_value."""
    return LogisticRegression(C=c_value, penalty='l1', solver='liblinear', multi_class='ovr')


# One regular logit model per class-specific best C ('non-crime',
# 'non-violent', 'violent').
logreg_1 = _lasso_logit(best_C['non-crime'])
logreg_2 = _lasso_logit(best_C['non-violent'])
logreg_3 = _lasso_logit(best_C['violent'])

# Fit the model that uses the 'non-crime' C; the fitted estimator is the
# cell's output.
logreg_1.fit(X_train, y_train)

LogisticRegression(C=2682.695795279722, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [167]:
# Fit the model configured with the best C found for 'non-violent'.
# It is trained on the same X_train / y_train as the other two models.
logreg_2.fit(X_train, y_train)

LogisticRegression(C=0.071968567300115138, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [168]:
# Fit the model configured with the best C found for 'violent'.
# It is trained on the same X_train / y_train as the other two models.
logreg_3.fit(X_train, y_train)

LogisticRegression(C=0.26826957952797248, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

## Build confusion matrices for the models above

In [171]:
def _confusion_frame(model, y_true, y_pred):
    """Return the confusion matrix of y_true against y_pred as a DataFrame.

    Rows and columns are both labelled and ordered by model.classes_; the
    rows come from y_true and the columns from y_pred.
    """
    labels = model.classes_
    counts = confusion_matrix(y_true, y_pred, labels=labels)
    return pd.DataFrame(counts, index=labels, columns=labels)


# Use each fitted model to predict on the test set and keep the predictions.
Y_1_pred = logreg_1.predict(X_test)
Y_2_pred = logreg_2.predict(X_test)
Y_3_pred = logreg_3.predict(X_test)

# One labelled confusion matrix per model.
conmat_1 = _confusion_frame(logreg_1, y_test, Y_1_pred)
conmat_2 = _confusion_frame(logreg_2, y_test, Y_2_pred)
conmat_3 = _confusion_frame(logreg_3, y_test, Y_3_pred)

# print(...) with parentheses runs on both Python 2 and Python 3 and matches
# the print calls used in the other cells.
print(conmat_1)
print(conmat_2)
print(conmat_3)


             non-crime  non-violent  violent
non-crime           67          172      655
non-violent         15          419      449
violent             17          338     2368
             non-crime  non-violent  violent
non-crime           57          197      640
non-violent         15          425      443
violent             15          362     2346
             non-crime  non-violent  violent
non-crime           65          176      653
non-violent         15          417      451
violent             17          342     2364


Interpreting a multiclass confusion matrix is a bit different from interpreting a binary one.  With a binary problem the options are 'Is' and 'Is Not', or 'True' and 'False'.  In a multiclass confusion matrix, a cell can represent more than a single outcome ("True Positive", "True Negative", "False Positive", "False Negative"), depending on which class we are looking at.

Let's take a look at the first confusion matrix.
The top-left to bottom-right diagonal holds the correctly classified observations, i.e. the True Positives of each class.  If we are only interested in how violent crimes are classified, we can see that there are 2368 True Positives at the intersection of the violent row and the violent column.  We can also see that every value that is in neither the violent row nor the violent column is technically a True Negative with respect to the violent class.  This is so because, with respect to violent crimes, all of the observations in that area are truly not violent and were not predicted to be violent.

## Print classification reports

In [173]:

# Precision, recall, f1-score and support per class on the test set: one
# report per model, printed in model order (1, 2, 3).
for test_predictions in (Y_1_pred, Y_2_pred, Y_3_pred):
    print(classification_report(y_test, test_predictions))

             precision    recall  f1-score   support

  non-crime       0.68      0.07      0.13       894
non-violent       0.45      0.47      0.46       883
    violent       0.68      0.87      0.76      2723

avg / total       0.64      0.63      0.58      4500

             precision    recall  f1-score   support

  non-crime       0.66      0.06      0.12       894
non-violent       0.43      0.48      0.46       883
    violent       0.68      0.86      0.76      2723

avg / total       0.63      0.63      0.57      4500

             precision    recall  f1-score   support

  non-crime       0.67      0.07      0.13       894
non-violent       0.45      0.47      0.46       883
    violent       0.68      0.87      0.76      2723

avg / total       0.63      0.63      0.58      4500



#### - Precision ( True Positives divided by Total Predicted Positives)
Of our first model's positive predictions for the non-crime class, 68% were correct (True Positives) and the other 32% were incorrect (False Positives).


#### - Recall (True Positives divided by Total Actual Positives)
Our second model correctly identified 48% of the actual non-violent crimes (True Positives out of all actual positives for that class).


#### - f1-score ( 2 * (precision * recall) / (precision + recall) )
This is the harmonic mean of Precision and Recall.
For the third model's violent predictions:   f1 = 2 x (0.68 x 0.87)/(0.68 + 0.87) ≈ 0.76


#### - Support -  Number of actual observations of each class in the test set
Non-Crime = 894
Non-Violent = 883
Violent = 2723

In [174]:
# Set up a grid search to be run with GridSearchCV and 5 folds.
# Task prompt: score on accuracy; what does this metric tell us?
# logreg is the base estimator with default settings; the penalty and C
# values to try are listed below.
logreg = LogisticRegression()
# Candidate C values, from 0.0001 up to 1000.
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
# Candidate penalty types: 'l1' and 'l2'.
penalties = ['l1','l2']

When we grid search in sklearn, the first argument is our model, and the values we want checked are passed as a dictionary: keys are the parameter names and values are lists of the values we want tried.

Because GridSearchCV fits the model with every combination of those parameters, we don't have to specify them when declaring the model.

In this example we are only grid searching the penalty type and the C values.

In [175]:
# Grid search over every penalty / C combination with 5-fold cross-validation,
# scoring each candidate with scoring='f1_macro'.
# NOTE(review): the search is fit on the full X and y rather than on a
# training split, so a later hold-out evaluation drawn from X and y uses rows
# the search has already seen - confirm this is intended.
gs = GridSearchCV(logreg, {'penalty':penalties, 'C':C_vals}, verbose=True, cv=5, scoring='f1_macro')
gs.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   15.9s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1_macro',
       verbose=True)

In [176]:
# Show the parameter combination that the fitted grid search selected as best.
gs.best_params_

{'C': 10.0, 'penalty': 'l1'}

In [179]:
# Fit a logistic regression with the parameters the grid search selected,
# predict on a hold-out split, and print a classification report.
# The parameters are read from gs.best_params_ instead of being typed in by
# hand, so this cell stays correct if the search is re-run and selects
# different values.
lr = LogisticRegression(**gs.best_params_)

X_train_gs, X_test_gs, y_train_gs, y_test_gs = train_test_split(X, y, random_state=177)

lr.fit(X_train_gs, y_train_gs)

y_pred_gs = lr.predict(X_test_gs)

# print(...) with parentheses runs on both Python 2 and Python 3.
print(classification_report(y_test_gs, y_pred_gs))

             precision    recall  f1-score   support

  non-crime       0.63      0.08      0.14       920
non-violent       0.46      0.46      0.46       883
    violent       0.68      0.88      0.77      2697

avg / total       0.63      0.63      0.58      4500

