In [1]:
# Dataset: https://www.kaggle.com/navoshta/grid-knn/data
# Note: the dataset is large (about 1.27 GB).

In [2]:
import pandas as pd

In [3]:
# 1. Load the training data from the local CSV file into a DataFrame.
data = pd.read_csv("./FBlocation/train.csv")

In [4]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [5]:
# 2. Clean the data
# 1) Restrict the data to a small spatial window: 2 < x < 2.5 and 1.0 < y < 1.5.
# .copy() makes the result an independent DataFrame, so the column assignments
# performed on `data` in later cells do not trigger pandas' SettingWithCopyWarning.
data = data.query("x < 2.5 & x > 2 & y < 1.5 & y > 1.0").copy()

In [6]:
# Check the rows that remain after the spatial filter.
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
112,112,2.236,1.3655,66,623174,7663031065
180,180,2.2003,1.2541,65,610195,2358558474
367,367,2.4108,1.3213,74,579667,6644108708
874,874,2.0822,1.1973,320,143566,3229876087
1022,1022,2.016,1.1659,65,207993,3244363975


In [7]:
# 2) Derive calendar features from the raw "time" column.
# The integer values are interpreted as seconds since the Unix epoch.
# NOTE(review): the unit of "time" is assumed to be seconds - confirm against the dataset description.
time_value = pd.to_datetime(data["time"], unit="s")

In [8]:
# Wrap the timestamps in a DatetimeIndex to read day / weekday / hour as vectorized attributes.
date = pd.DatetimeIndex(time_value)

In [9]:
# Day of the month for each row.
data["day"] = date.day

In [10]:
# Day of the week for each row (pandas convention: Monday=0 ... Sunday=6).
data["weekday"] = date.weekday

In [11]:
# Hour of the day for each row.
data["hour"] = date.hour

In [12]:
# Confirm that the day / weekday / hour columns were added.
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
180,180,2.2003,1.2541,65,610195,2358558474,8,3,1
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9


In [13]:
# 3) Filter out low-frequency locations.
# Count the rows per place_id. Selecting "row_id" before aggregating yields the
# same Series (named "row_id", indexed by place_id) without counting every other
# column only to throw the result away.
place_count = data.groupby("place_id")["row_id"].count()

In [14]:
# Inspect the per-place counts across all columns (non-null values per column for each place_id).
data.groupby("place_id").count().head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,day,weekday,hour
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1012165853,1,1,1,1,1,1,1,1
1013991737,3,3,3,3,3,3,3,3
1014605271,28,28,28,28,28,28,28,28
1015645743,4,4,4,4,4,4,4,4
1017236154,31,31,31,31,31,31,31,31


In [15]:
# Preview the places that have more than 3 rows in the selected window.
place_count[place_count > 3].head()

place_id
1014605271    28
1015645743     4
1017236154    31
1024951487     5
1028119817     4
Name: row_id, dtype: int64

In [16]:
# Keep only the rows whose place_id occurs more than 3 times in the window.
frequent_places = place_count[place_count > 3].index
data_final = data[data["place_id"].isin(frequent_places)]

In [17]:
# Preview the rows that survived the frequency filter.
data_final.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
112,112,2.236,1.3655,66,623174,7663031065,8,3,5
367,367,2.4108,1.3213,74,579667,6644108708,7,2,17
874,874,2.0822,1.1973,320,143566,3229876087,2,4,15
1022,1022,2.016,1.1659,65,207993,3244363975,3,5,9
1045,1045,2.3859,1.166,498,503378,6438240873,6,1,19


In [18]:
# Separate the feature matrix (x) from the target vector (y).
feature_columns = ["x", "y", "accuracy", "day", "weekday", "hour"]
target_column = "place_id"
x = data_final[feature_columns]
y = data_final[target_column]

In [19]:
# Preview the feature matrix.
x.head()

Unnamed: 0,x,y,accuracy,day,weekday,hour
112,2.236,1.3655,66,8,3,5
367,2.4108,1.3213,74,7,2,17
874,2.0822,1.1973,320,2,4,15
1022,2.016,1.1659,65,3,5,9
1045,2.3859,1.166,498,6,1,19


In [20]:
# Preview the target values (place_id).
y.head()

112     7663031065
367     6644108708
874     3229876087
1022    3244363975
1045    6438240873
Name: place_id, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out a test set (scikit-learn's default test share is 25% of the rows).
# A fixed random_state makes the split - and therefore every score computed
# from it - reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [24]:
# 3) Feature engineering: standardization.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split.
# NOTE(review): the scaler is fitted before the grid search splits x_train into
# folds, so each validation fold has influenced the scaling statistics; wrapping
# scaler + classifier in a Pipeline would avoid that.
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 4) KNN classifier, tuned with a grid search over n_neighbors using
# 3-fold cross-validation.
param_dict = {"n_neighbors": [3, 5, 7, 9]}
estimator = GridSearchCV(KNeighborsClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5) Model assessment
# Method 1: compare the predicted labels with the true test labels element-wise.
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("the pridiction target vs real test target is :\n", y_test == y_predict)

# Method 2: score the tuned model on the held-out test split.
score = estimator.score(x_test, y_test)
print("The accuracy is ：\n", score)

# Grid-search diagnostics: best parameters, best cross-validation score,
# the refitted best estimator, and the full cross-validation results.
for heading, value in (
    ("The best parameter is ：\n", estimator.best_params_),
    ("The best score is ：\n", estimator.best_score_),
    ("The best estimator is :\n", estimator.best_estimator_),
    ("The cross validation is :\n", estimator.cv_results_),
):
    print(heading, value)



y_predict:
 [1536494374 6787040850 4338887860 ... 9081742495 1732563460 2074133146]
the pridiction target vs real test target is :
 3535259     False
5556337     False
27295324    False
224503      False
820645      False
            ...  
4063510     False
7982417      True
10767322    False
15032849    False
19054317    False
Name: place_id, Length: 20228, dtype: bool
The accuracy is ：
 0.36365434051809375
The best parameter is ：
 {'n_neighbors': 5}
The best score is ：
 0.3331465485225324
The best estimator is :
 KNeighborsClassifier()
The cross validation is :
 {'mean_fit_time': array([0.0232048 , 0.02388835, 0.02308265, 0.02412287]), 'std_fit_time': array([0.00108141, 0.00112479, 0.00047778, 0.00103249]), 'mean_score_time': array([0.61511596, 0.64884075, 0.68569096, 0.77570589]), 'std_score_time': array([0.01426371, 0.01364854, 0.02831926, 0.03200847]), 'param_n_neighbors': masked_array(data=[3, 5, 7, 9],
             mask=[False, False, False, False],
       fill_value='?',
      