In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# row_id：登记事件的ID
# xy：坐标
# 准确性：定位准确性 
# 时间：时间戳
# place_id：业务的ID，这是您预测的目标，即目标值

In [3]:
# 1. 获取数据集
data = pd.read_csv('train.csv')

In [4]:
data

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949
...,...,...,...,...,...,...
29118016,29118016,6.5133,1.1435,67,399740,8671361106
29118017,29118017,5.9186,4.4134,67,125480,9077887898
29118018,29118018,2.9993,6.3680,67,737758,2838334300
29118019,29118019,4.0637,8.0061,70,764975,1007355847


In [5]:
# 2. 基本数据处理
# 2.1 选择时间特征
time = pd.to_datetime(data['time'], unit='s')
time = pd.DatetimeIndex(time)
data['day'] = time.day
data['hour'] = time.hour
data['weekday'] = time.weekday

# 2.2 去掉签到较少的地方
place_count = data.groupby('place_id').count()
place_count = place_count[place_count['row_id']>5]
data = data[data['place_id'].isin(place_count.index)]

# 2.3 确定特征值和目标值
x = data[['x','y','accuracy','day','hour','weekday']]
y = data['place_id']

# 2.4 分割数据
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22,test_size=0.33)

In [6]:
# 3 特征工程--特征预处理(标准化)
# 3.1 实例化一个转换器
transfer = StandardScaler()
# 3.2 调用 fit_transform
x_train = transfer.fit_transform(x_train)
x_test = transfer.fit_transform(x_test)

In [7]:
# 4 机器学习--knn+cv
# 4.1 实例化一个估计器
estimator = KNeighborsClassifier(n_jobs=-1)
# 4.2 调用 GridSearchCV()
param_grid = {'n_neighbors':[1,3,5,7,9]}
estimator = GridSearchCV(estimator, param_grid, cv=5)
# 4.3 模型训练
estimator.fit(x_train, y_train)



In [8]:
# 5.模型评估
# 5.1 基本评估方式
score = estimator.score(x_test, y_test)
print("最后预测的准确率为:\n", score)

y_predict = estimator.predict(x_test)
print("最后的预测值为:\n", y_predict)
print("预测值和真实值的对比情况:\n", y_predict == y_test)

# 5.2 使用交叉验证后的评估方式
print("在交叉验证中验证的最好结果:\n", estimator.best_score_)
print("最好的参数模型:\n", estimator.best_estimator_)
print("每次交叉验证后的验证集准确率结果和训练集准确率结果:\n",estimator.cv_results_)

最后预测的准确率为:
 0.14964635090270834
最后的预测值为:
 [9196225245 3083916136 9157230123 ... 8282206179 3881213519 8804931933]
预测值和真实值的对比情况:
 17335387    False
23858716    False
24429653     True
13780495    False
19423567     True
            ...  
25610216    False
11406311    False
14729410    False
28289050    False
18484525    False
Name: place_id, Length: 9608253, dtype: bool
在交叉验证中验证的最好结果:
 0.13698056635686537
最好的参数模型:
 KNeighborsClassifier(n_jobs=-1, n_neighbors=1)
每次交叉验证后的验证集准确率结果和训练集准确率结果:
 {'mean_fit_time': array([38.83413277, 38.93270507, 39.11323676, 38.88676829, 38.91099572]), 'std_fit_time': array([0.45859461, 0.52061345, 0.27512063, 0.45200953, 0.51065728]), 'mean_score_time': array([112.40736284, 136.01520925, 149.28184366, 144.36297994,
       163.04065132]), 'std_score_time': array([2.27695703, 5.89432007, 2.97617609, 4.08086375, 7.92346432]), 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9],
             mask=[False, False, False, False, False],
       fill_value='?',
    