In [1]:
import pandas as pd

# Patch scikit-learn BEFORE importing anything from sklearn: patch_sklearn()
# swaps the implementations inside the sklearn modules, so names that were
# already bound with `from sklearn... import ...` would keep pointing at the
# stock (unaccelerated) classes if the patch ran afterwards.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
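# row_id: ID of the check-in event
# x, y: coordinates
# accuracy: location accuracy
# time: timestamp
# place_id: ID of the business; this is the prediction target (the label)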

In [None]:
# 1. Load the dataset
train_path = 'train.csv'
data = pd.read_csv(train_path)

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
data

In [None]:
# 2. Basic data processing
# 2.1 Derive calendar features from the raw `time` column.
#     The values are parsed as a count of seconds (unit='s').
timestamps = pd.to_datetime(data['time'], unit='s')
data['day'] = timestamps.dt.day
data['hour'] = timestamps.dt.hour
data['weekday'] = timestamps.dt.weekday

# 2.2 Drop places with few check-ins.
#     Count rows per place_id on the `row_id` column only (aggregating every
#     column of the frame just to read one of them is wasted work), then keep
#     the rows whose place_id has more than 5 check-ins.
place_count = data.groupby('place_id')['row_id'].count()
place_count = place_count[place_count > 5]
data = data[data['place_id'].isin(place_count.index)]

# 2.3 Choose the feature matrix and the target vector
feature_columns = ['x', 'y', 'accuracy', 'day', 'hour', 'weekday']
x = data[feature_columns]
y = data['place_id']

# 2.4 Split into train / test sets (33% held out, fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=22,
)

In [None]:
# 3. Feature engineering -- preprocessing (standardization)
# 3.1 Instantiate the transformer
transfer = StandardScaler()
# 3.2 Fit the scaler on the training split only, then apply those same
#     training statistics to the test split. Re-fitting on the test split
#     would scale the two splits with different statistics and leak
#     information from the test data into preprocessing.
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [None]:
# 4. Machine learning -- KNN with cross-validated grid search
# 4.1 Build the base classifier
knn = KNeighborsClassifier(n_jobs=-1)
# 4.2 Wrap it in GridSearchCV: candidate n_neighbors values, cv=5
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
estimator = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
# 4.3 Train the search object on the training split
estimator.fit(x_train, y_train)

In [None]:
# 5. Model evaluation
# 5.1 Basic evaluation: score the fitted search object on the held-out test split
score = estimator.score(X=x_test, y=y_test)
print("最后预测的准确率为:\n", score)

In [None]:
# Predict labels for the test split and print them
y_predict = estimator.predict(x_test)
print("最后的预测值为:\n", y_predict)
# Element-wise comparison of predictions against the true labels
# (the original call was missing its closing parenthesis -> SyntaxError)
print("预测值和真实值的对比情况:\n", y_predict == y_test)

In [None]:
# 5.2 Evaluation based on the cross-validation run
# Print, in order: the best CV score, the best estimator, and the full
# cv_results_ table of the fitted search object.
# NOTE(review): the last message mentions training-set accuracy; whether
# cv_results_ actually holds train scores depends on how GridSearchCV was
# configured (return_train_score) -- confirm.
for label, value in (
    ("在交叉验证中验证的最好结果:\n", estimator.best_score_),
    ("最好的参数模型:\n", estimator.best_estimator_),
    ("每次交叉验证后的验证集准确率结果和训练集准确率结果:\n", estimator.cv_results_),
):
    print(label, value)