Import Libraries
================

In [1]:
import csv
import numpy as np
import pandas as pd

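# Define the Preprocess Function
1. Delete the unused columns (`Address`, `X`, `Y`)
2. (Cancelled) Extract only the time part of the `Dates` column
3. Separate columns: one-hot encode the hour, month and year of `Dates`, plus `PdDistrict` and `DayOfWeek`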

In [2]:
def preprocess(df):
    """One-hot encode the date, district and weekday columns of a crime frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Address', 'X', 'Y', 'Dates', 'DayOfWeek'
        and 'PdDistrict'. 'Dates' must be parseable by ``pd.to_datetime``.
        Any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Address', 'X', 'Y', 'Dates', 'DayOfWeek' and
        'PdDistrict', followed by indicator columns named 'hour_<h>',
        'month_<m>', 'year_<y>', one column per district value and one
        column per weekday value. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # 1. Delete columns that are not used as features
    df = df.drop(['Address', 'X', 'Y'], axis=1)

    # 2. Parse the timestamps once for the whole column. Converting every
    #    row separately (and once per derived feature) is far slower and
    #    yields the same hour / month / year values.
    dates = pd.to_datetime(df['Dates'])

    # 3-1. Separate columns: one indicator column per distinct value
    hours = pd.get_dummies(dates.dt.hour, prefix='hour')
    months = pd.get_dummies(dates.dt.month, prefix='month')
    years = pd.get_dummies(dates.dt.year, prefix='year')
    district = pd.get_dummies(df['PdDistrict'])
    week = pd.get_dummies(df['DayOfWeek'])

    # 3-2. Concatenate each DataFrame, then drop the now-encoded source columns
    df = pd.concat([df, hours, months, years, district, week], axis=1)
    df = df.drop(['Dates', 'DayOfWeek', 'PdDistrict'], axis=1)

    return df

# Read Train / Test Data
- Load the CSV files with `read_csv`
- Check the column info of the training data

In [3]:
# Load the training set; row 0 of the CSV supplies the column names
train_data = pd.read_csv(filepath_or_buffer='../sf_crime/train.csv', header=0)
# Summarise dtypes and non-null counts for every column
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [4]:
# Preview the first five rows of the training set
train_data.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [5]:
# Load the test set; row 0 of the CSV supplies the column names
test_data = pd.read_csv(filepath_or_buffer='../sf_crime/test.csv', header=0)

# Preprocessing

In [6]:
print('Preprocessing...')

# Training set: encode the features, then drop Descript and Resolution
train_data = preprocess(train_data).drop(columns=['Descript', 'Resolution'])

# Test set: encode the features, keep the Id column aside and remove it
# from the feature matrix
test_data = preprocess(test_data)
test_index = test_data['Id']
test_data = test_data.drop(columns=['Id'])

Preprocessing...


# Training the Classifier & Predicting

In [None]:
from sklearn.naive_bayes import GaussianNB

# Split the preprocessed training frame into the feature matrix and the target
X_train = train_data.drop(columns=['Category'])
y_train = train_data['Category']

In [None]:
# Train a Gaussian naive Bayes classifier on the encoded features
print('Fitting...')
clf = GaussianNB()
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict per-class probabilities for every test row and put the Id column
# in front of them (one probability column per class known to the classifier)
print('Predicting...')
result = pd.concat(
    [
        test_index,
        pd.DataFrame(
            clf.predict_proba(test_data),
            index=test_data.index,
            columns=clf.classes_,
        ),
    ],
    axis=1,
)

# Write Result Data

In [None]:
# Write the result table to disk without the DataFrame's row index
result.to_csv(path_or_buf='../sf_crime/SF.csv', index=False)