In [1]:
import pandas as pd

In [3]:
# Load the training data
train_csv_path = "facebook-v-predicting-check-ins/train.csv"
data = pd.read_csv(train_csv_path)

In [5]:
# Preview the first five rows of the raw table
data.head(n=5)

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [8]:
# Basic preprocessing: narrow the data down to a small spatial window
# (2 < x < 2.5 and 1.0 < y < 1.5).
# .copy() makes the subset an independent DataFrame, so the columns that are
# added to `data` afterwards do not trigger pandas' SettingWithCopyWarning.
data = data.query("x<2.5&x>2&y<1.5&y>1.0").copy()
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
112,112,2.236,1.3655,66,623174,7663031065
180,180,2.2003,1.2541,65,610195,2358558474
367,367,2.4108,1.3213,74,579667,6644108708
874,874,2.0822,1.1973,320,143566,3229876087
1022,1022,2.016,1.1659,65,207993,3244363975


In [9]:
# Time feature: parse the raw "time" column, reading the values as seconds
time_value = pd.to_datetime(data.loc[:, "time"], unit="s")

In [10]:
# Wrap the parsed timestamps in a DatetimeIndex to expose their calendar fields
date = pd.DatetimeIndex(data=time_value)

In [15]:
# Derive calendar features from the parsed timestamps.
# All three values must come from `date` (the DatetimeIndex). A DataFrame has
# no `weekday` attribute until that column exists, so reading `data.weekday`
# raises AttributeError on a fresh kernel.
# assign() returns a new frame, which keeps this cell idempotent and avoids
# writing into a filtered subset in place.
data = data.assign(day=date.day, weekday=date.weekday, hour=date.hour)

In [16]:
# Check that the day / weekday / hour columns were added
data.head(n=5)

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
180,180,2.2003,1.2541,65,610195,2358558474,8,3,1
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9


In [20]:
# Filter out rarely visited places: first count the check-ins per place_id
place_count = data.groupby("place_id")["row_id"].count()

In [22]:
# Preview the places that have more than 3 check-ins
place_count.loc[place_count.gt(3)].head()

place_id
1014605271    28
1015645743     4
1017236154    31
1024951487     5
1028119817     4
Name: row_id, dtype: int64

In [25]:
# Keep only the check-ins whose place has more than 3 check-ins in total
frequent_place_ids = place_count[place_count > 3].index.values
data_final = data[data["place_id"].isin(frequent_place_ids)]

In [27]:
# Preview the filtered table
data_final.head(n=5)

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9
1045,1045,2.3859,1.166,498,503378,6438240873,6,1,19


In [28]:
# Separate the feature columns from the target column
feature_columns = ["x", "y", "accuracy", "day", "weekday", "hour"]
x = data_final[feature_columns]
y = data_final["place_id"]

In [29]:
# Preview the feature matrix
x.head(n=5)

Unnamed: 0,x,y,accuracy,day,weekday,hour
112,2.236,1.3655,66,8,3,5
367,2.4108,1.3213,74,7,2,17
874,2.0822,1.1973,320,2,4,15
1022,2.016,1.1659,65,3,5,9
1045,2.3859,1.166,498,6,1,19


In [30]:
# Preview the target values (place_id)
y.head(n=5)

112     7663031065
367     6644108708
874     3229876087
1022    3244363975
1045    6438240873
Name: place_id, dtype: int64

In [31]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

In [33]:
# Hold out a test set. Fixing random_state makes the shuffle, and therefore
# the scores computed from this split, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [35]:
# 3) Feature engineering -- standardization
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test split.
transfer = StandardScaler()
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# 4) KNN classifier wrapped in a grid search with 3-fold cross-validation
# Candidate values for n_neighbors
param_dict = {"n_neighbors": [3, 5, 7, 9]}
estimator = GridSearchCV(KNeighborsClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5) Model evaluation
# Method 1: compare the predictions with the true labels element by element
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("比对结果：\n", y_test == y_predict)
# Method 2: score the fitted search on the held-out test split
score = estimator.score(x_test, y_test)
print("准确率：\n", score)

# Results gathered during the cross-validated search
print("最佳参数：\n", estimator.best_params_)
print("在交叉验证当中验证的最好结果：", estimator.best_score_)
print("模型K值是：", estimator.best_estimator_)
print("交叉验证的结果为：", estimator.cv_results_)



y_predict:
 [5781604363 5304570159 1804841714 ... 1778906774 1031277804 3513732261]
比对结果：
 24605820    False
6135114     False
27278684    False
12319813    False
6198326     False
28230027     True
10720701     True
14559849    False
18127170    False
14658582    False
16303960    False
7219484     False
25826843     True
5815074     False
21961806    False
27201405     True
6916340     False
5357813      True
10141711    False
23982835    False
26603350     True
11371255    False
551060      False
4948915      True
18921753    False
7516969      True
15549068    False
18251917    False
14540222     True
28132553    False
            ...  
28553591    False
609163      False
16005878     True
2818275     False
17767753    False
20283839    False
14507777    False
15907296    False
6140267     False
3217113      True
7648921     False
27506446    False
24786865    False
7190528     False
9368763     False
8472522      True
1414351     False
13821326     True
28814469     True
5369587  