## 基线测试

- 输入特征：
    - Dates中的小时字段
    - DayOfWeek
    - PdDistrict
    - Address中是否包含"Block"字段
    - 经度
    - 纬度

### 1. 读取数据集

In [1]:
import pandas as pd
import numpy as np

origin_train_data = pd.read_csv('../datasets/train.csv')
origin_train_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


### 2. 数据预处理

#### 2.1 删除无意义的数据列

In [2]:
train_data = origin_train_data.drop(['Category', 'Descript', 'Resolution'], axis=1)
train_data.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,Wednesday,NORTHERN,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,Wednesday,PARK,100 Block of BRODERICK ST,-122.438738,37.771541


#### 2.2 将Address列转化为是否含Block的0,1值，然后删除Address列，添加HasBlock列

In [3]:
def ProcessColAddress(train_data):
    find_block = np.char.find(np.char.lower(np.array(train_data['Address'], dtype=str)), 'block')
    addresses = np.select([find_block<0, find_block>0, find_block==0], [0, 1, 1])
    train_data = train_data.drop(['Address'], axis=1)
    train_data['HasBlock'] = addresses
    return train_data

In [4]:
train_data = ProcessColAddress(train_data)
train_data.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,X,Y,HasBlock
0,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599,0
1,2015-05-13 23:53:00,Wednesday,NORTHERN,-122.425892,37.774599,0
2,2015-05-13 23:33:00,Wednesday,NORTHERN,-122.424363,37.800414,0
3,2015-05-13 23:30:00,Wednesday,NORTHERN,-122.426995,37.800873,1
4,2015-05-13 23:30:00,Wednesday,PARK,-122.438738,37.771541,1


2.3 Dates列只留下小时作为特征

In [5]:
def ProcessColDates(train_data):
    hours = pd.DatetimeIndex(train_data['Dates']).hour
    train_data['Hours'] = hours
    train_data = train_data.drop(['Dates'], axis=1)
    return train_data

In [6]:
train_data = ProcessColDates(train_data)
train_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,X,Y,HasBlock,Hours
0,Wednesday,NORTHERN,-122.425892,37.774599,0,23
1,Wednesday,NORTHERN,-122.425892,37.774599,0,23
2,Wednesday,NORTHERN,-122.424363,37.800414,0,23
3,Wednesday,NORTHERN,-122.426995,37.800873,1,23
4,Wednesday,PARK,-122.438738,37.771541,1,23


2.4 对X、Y、Hours列进行归一化处理

In [7]:
from sklearn.preprocessing import MinMaxScaler

def ProcessColXY(train_data):
    scaler =MinMaxScaler()
    features = ['X', 'Y', 'Hours']
    train_data[features] = scaler.fit_transform(train_data[features])
    return train_data,scaler

In [8]:
train_data,_ = ProcessColXY(train_data)
train_data.head()

  return self.partial_fit(X, y)


Unnamed: 0,DayOfWeek,PdDistrict,X,Y,HasBlock,Hours
0,Wednesday,NORTHERN,0.043578,0.001276,0,1.0
1,Wednesday,NORTHERN,0.043578,0.001276,0,1.0
2,Wednesday,NORTHERN,0.044337,0.00177,0,1.0
3,Wednesday,NORTHERN,0.04303,0.001778,1,1.0
4,Wednesday,PARK,0.037198,0.001217,1,1.0


2.5 对DayOfWeek、PdDistrict进行独热编码

In [10]:
def ProcessColToDummies(train_data):
    return pd.get_dummies(train_data)

In [11]:
train_data = ProcessColToDummies(train_data)
train_data.head()

Unnamed: 0,X,Y,HasBlock,Hours,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,0.043578,0.001276,0,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0.043578,0.001276,0,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.044337,0.00177,0,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.04303,0.001778,1,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0.037198,0.001217,1,1.0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### 3. 对数几率模型

3.1 将数据集分为训练集和测试集

In [60]:
from sklearn.model_selection import train_test_split
y_label = origin_train_data['Category']
X_train, X_test, y_train, y_test = train_test_split(train_data, y_label, test_size=0.2, random_state=42)

print('X_train has {} samples.'.format(X_train.shape[0]))
print('X_test has {} samples.'.format(X_test.shape[0]))
print('y_train:\n', y_train[:2])
print('y_test:\n', y_test[:2])

X_train has 702439 samples.
X_test has 175610 samples.
y_train:
 81381     NON-CRIMINAL
238545    NON-CRIMINAL
Name: Category, dtype: object
y_test:
 349598    LARCENY/THEFT
766313          ASSAULT
Name: Category, dtype: object


3.2 训练模型

In [8]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=42, solver='lbfgs', multi_class='multinomial', max_iter=1000).fit(X_train, y_train)

3.3 计算准确率

In [10]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print('准确率: ', accuracy_score(y_pred, y_test))

准确率:  0.22801093331814817


3.4 计算对数损失

In [11]:
from sklearn.metrics import log_loss

y_pred_prob = clf.predict_proba(X_test)
print('多分类对数损失: ', log_loss(y_test, y_pred_prob))

多分类对数损失:  2.556361302239199


3.5 保存模型到文件

In [12]:
from sklearn.externals import joblib

joblib.dump(clf, '../models/base_line_logistic_regression.ml')

['../models/base_line_logistic_regression.ml']

### 4. 预测新样本

4.1 加载模型文件

In [28]:
from sklearn.externals import joblib

clf1 = joblib.load('../models/base_line_logistic_regression.ml')

4.2 加载数据集

In [15]:
new_origin_data = pd.read_csv('../datasets/test.csv')
new_origin_data.head(1)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051


4.3 预处理数据集

In [26]:
id_col = new_origin_data['Id']
new_data = new_origin_data.drop(['Id'],axis=1)

new_data = ProcessColDates(new_data)
new_data = ProcessColAddress(new_data)
new_data,_ = ProcessColXY(new_data)
new_data = ProcessColToDummies(new_data)
new_data.head(1)

  return self.partial_fit(X, y)


Unnamed: 0,X,Y,Hours,HasBlock,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,0.056641,0.00052,1.0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


4.4 预测新样本

In [122]:
new_y_pred_prob = clf1.predict_proba(new_data)

In [124]:
csv_output = pd.DataFrame(columns=clf1.classes_, data=np.round(new_y_pred_prob, 4))
csv_output.insert(0, 'Id', id_col)

In [125]:
csv_output.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.0032,0.1267,0.0001,0.001,0.051,0.0013,0.0008,0.0451,0.0032,...,0.0001,0.0043,0.0006,0.0412,0.0,0.0067,0.0772,0.1047,0.0406,0.0186
1,1,0.0091,0.1698,0.0004,0.0009,0.0572,0.005,0.0029,0.0299,0.0042,...,0.0003,0.0047,0.0007,0.0575,0.0,0.0125,0.0652,0.046,0.0434,0.0168
2,2,0.001,0.0898,0.0002,0.0003,0.0688,0.002,0.0008,0.0351,0.0037,...,0.0001,0.0063,0.0009,0.0293,0.0,0.0073,0.0652,0.0705,0.0351,0.0072
3,3,0.0016,0.1167,0.0001,0.0009,0.0427,0.0011,0.0011,0.0273,0.0027,...,0.0002,0.0041,0.0009,0.0363,0.0,0.0043,0.0845,0.1506,0.0262,0.0141
4,4,0.0016,0.1167,0.0001,0.0009,0.0427,0.0011,0.0011,0.0273,0.0027,...,0.0002,0.0041,0.0009,0.0363,0.0,0.0043,0.0845,0.1506,0.0262,0.0141


In [126]:
csv_output.to_csv('./result.csv', index=False)