In [87]:
# Steps:
# 1. Data set acquisition
# 2. Data processing
# 2.1 Narrow the scope of the data
# 2.2 Extract time features (day, hour, weekday)
# 2.3 Remove the places with few check-ins
# 2.4 Determine the feature values and the target value
# 2.5 Split the data set
# 3. Feature engineering - feature preprocessing (standardization)
# 4. Machine learning - KNN + cross-validation
# 5. Model evaluation

# Implementation

## Import the modules

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

## Data set acquisition

In [89]:
# Read the raw training data into a DataFrame.
# The path is relative, so the notebook has to be run from the directory that contains ./data.
facebook = pd.read_csv(
    filepath_or_buffer="./data/FBlocation/train.csv",
)

## Data processing

### To narrow the scope of the data

In [90]:
# Narrow the scope of the data: keep only the rows with 2.0 < x < 2.5 and 2.0 < y < 2.5.
# `.copy()` makes the subset an independent DataFrame. Without it, adding the
# day/hour/weekday columns later writes into a slice of `facebook` and pandas
# emits SettingWithCopyWarning on each of those assignments.
facebook_data = facebook.query("x>2.0 & x<2.5 & y > 2.0 & y < 2.5").copy()

### To transform the time into a proper form  
  e.g. (yyyy-mm-dd hh:mm:ss)

In [91]:
# Preview the filtered rows; `time` is still a raw integer column at this point.
facebook_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
163,163,2.1663,2.3755,84,669737,3869813743
310,310,2.3695,2.2034,3,234719,2636621520
658,658,2.3236,2.1768,66,502343,7877745055
1368,1368,2.2613,2.3392,73,319822,9775192577
1627,1627,2.3331,2.0011,66,595084,6731326909


In [92]:
# Inspect the raw `time` values (plain integers) before converting them.
facebook_data["time"].head()

163     669737
310     234719
658     502343
1368    319822
1627    595084
Name: time, dtype: int64

In [93]:
# First attempt: with no `unit`, pandas reads the integers as nanoseconds since
# the epoch, so the values shown all land a fraction of a millisecond after
# 1970-01-01 00:00:00. The next cell repeats the conversion with an explicit unit.
time = pd.to_datetime(facebook_data["time"])
time.head()

163    1970-01-01 00:00:00.000669737
310    1970-01-01 00:00:00.000234719
658    1970-01-01 00:00:00.000502343
1368   1970-01-01 00:00:00.000319822
1627   1970-01-01 00:00:00.000595084
Name: time, dtype: datetime64[ns]

In [94]:
# Convert the raw integer timestamps, treating them as seconds since the Unix epoch.
# NOTE(review): the dataset's time unit is assumed to be seconds here — confirm
# against the data description; the derived day/hour/weekday features depend on it.
time = pd.to_datetime(facebook_data["time"],unit="s")
time.head()

163    1970-01-08 18:02:17
310    1970-01-03 17:11:59
658    1970-01-06 19:32:23
1368   1970-01-04 16:50:22
1627   1970-01-07 21:18:04
Name: time, dtype: datetime64[ns]

In [95]:
# Wrap the converted Series in a DatetimeIndex so its components can be read
# directly as attributes (time.day, time.hour, time.weekday) in the cells below.
time = pd.DatetimeIndex(time)
time[:5]

DatetimeIndex(['1970-01-08 18:02:17', '1970-01-03 17:11:59',
               '1970-01-06 19:32:23', '1970-01-04 16:50:22',
               '1970-01-07 21:18:04'],
              dtype='datetime64[ns]', name='time', freq=None)

In [96]:
# Peek at the day-of-month component of the first few timestamps.
time.day[:4]

Int64Index([8, 3, 6, 4], dtype='int64', name='time')

In [97]:
# Add the day-of-month as a feature column.
# `assign` returns a new DataFrame, so the column is never written into a slice
# of the original data and pandas does not raise SettingWithCopyWarning.
facebook_data = facebook_data.assign(day=time.day)
facebook_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_data["day"] = time.day


Unnamed: 0,row_id,x,y,accuracy,time,place_id,day
163,163,2.1663,2.3755,84,669737,3869813743,8
310,310,2.3695,2.2034,3,234719,2636621520,3
658,658,2.3236,2.1768,66,502343,7877745055,6
1368,1368,2.2613,2.3392,73,319822,9775192577,4
1627,1627,2.3331,2.0011,66,595084,6731326909,7


In [98]:
# Peek at the hour-of-day component of the first few timestamps.
time.hour[:4]

Int64Index([18, 17, 19, 16], dtype='int64', name='time')

In [99]:
# Add the hour-of-day as a feature column.
# `assign` returns a new DataFrame, so the column is never written into a slice
# of the original data and pandas does not raise SettingWithCopyWarning.
facebook_data = facebook_data.assign(hour=time.hour)
facebook_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_data["hour"] = time.hour


Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,hour
163,163,2.1663,2.3755,84,669737,3869813743,8,18
310,310,2.3695,2.2034,3,234719,2636621520,3,17
658,658,2.3236,2.1768,66,502343,7877745055,6,19
1368,1368,2.2613,2.3392,73,319822,9775192577,4,16
1627,1627,2.3331,2.0011,66,595084,6731326909,7,21


In [100]:
# Peek at the day-of-week component (pandas numbers Monday as 0).
time.weekday[:4]

Int64Index([3, 5, 1, 6], dtype='int64', name='time')

In [101]:
# Add the day-of-week as a feature column.
# `assign` returns a new DataFrame, so the column is never written into a slice
# of the original data and pandas does not raise SettingWithCopyWarning.
facebook_data = facebook_data.assign(weekday=time.weekday)
facebook_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facebook_data["weekday"] = time.weekday


Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,hour,weekday
163,163,2.1663,2.3755,84,669737,3869813743,8,18,3
310,310,2.3695,2.2034,3,234719,2636621520,3,17,5
658,658,2.3236,2.1768,66,502343,7877745055,6,19,1
1368,1368,2.2613,2.3392,73,319822,9775192577,4,16,6
1627,1627,2.3331,2.0011,66,595084,6731326909,7,21,2


### To remove the places with few occurrences

In [102]:
# Count the rows per place_id; each column of the result holds that place's non-null row count.
facebook_data.groupby("place_id").count().head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1006234733,1,1,1,1,1,1,1,1
1008823061,4,4,4,4,4,4,4,4
1012580558,3,3,3,3,3,3,3,3
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220


In [103]:
# Per-place row counts, indexed by place_id; the `row_id` column is used below
# as the number of check-ins for each place.
place_count = facebook_data.groupby("place_id").count()

In [104]:
# Number of distinct places before the rarely visited ones are dropped.
place_count.shape

(2524, 8)

In [105]:
# Boolean mask: True for the places that have more than 3 check-ins.
place_count["row_id"].gt(3)

place_id
1006234733    False
1008823061     True
1012580558    False
1025585791     True
1026507711     True
              ...  
9986101718    False
9993141712    False
9995108787     True
9998968845     True
9999851158    False
Name: row_id, Length: 2524, dtype: bool

In [106]:
# Keep only the places with more than 3 check-ins, then preview what is left.
place_count = place_count.loc[place_count["row_id"].gt(3)]
place_count.head()

Unnamed: 0_level_0,row_id,x,y,accuracy,time,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1008823061,4,4,4,4,4,4,4,4
1025585791,21,21,21,21,21,21,21,21
1026507711,220,220,220,220,220,220,220,220
1032417180,10,10,10,10,10,10,10,10
1040557418,123,123,123,123,123,123,123,123


In [107]:
# Number of places that remain after the filter.
place_count.shape

(929, 8)

In [108]:
# The surviving place_ids live in the index; they are used below to filter the check-in rows.
place_count.index

Int64Index([1008823061, 1025585791, 1026507711, 1032417180, 1040557418,
            1067960232, 1068428112, 1068896566, 1104074781, 1113141722,
            ...
            9929803766, 9934025626, 9944591314, 9951996370, 9953487159,
            9966115681, 9970566102, 9983648790, 9995108787, 9998968845],
           dtype='int64', name='place_id', length=929)

In [109]:
# Reminder of the check-in table that is about to be filtered by place_id.
facebook_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,hour,weekday
163,163,2.1663,2.3755,84,669737,3869813743,8,18,3
310,310,2.3695,2.2034,3,234719,2636621520,3,17,5
658,658,2.3236,2.1768,66,502343,7877745055,6,19,1
1368,1368,2.2613,2.3392,73,319822,9775192577,4,16,6
1627,1627,2.3331,2.0011,66,595084,6731326909,7,21,2


In [110]:
# Row count before removing check-ins at rarely visited places.
facebook_data.shape

(71664, 9)

In [114]:
# Preview the row mask: True where the row's place_id is among the kept places.
facebook_data["place_id"].isin(place_count.index).head()

163     True
310     True
658     True
1368    True
1627    True
Name: place_id, dtype: bool

In [115]:
# Drop every check-in whose place_id is not among the kept places, then preview.
facebook_data = facebook_data.loc[facebook_data["place_id"].isin(place_count.index)]
facebook_data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,hour,weekday
163,163,2.1663,2.3755,84,669737,3869813743,8,18,3
310,310,2.3695,2.2034,3,234719,2636621520,3,17,5
658,658,2.3236,2.1768,66,502343,7877745055,6,19,1
1368,1368,2.2613,2.3392,73,319822,9775192577,4,16,6
1627,1627,2.3331,2.0011,66,595084,6731326909,7,21,2


In [116]:
# Row count after the filter, for comparison with the shape shown before it.
facebook_data.shape

(69264, 9)

### To determine the features and target variable

In [119]:
# Feature matrix: position, accuracy and the derived time components.
x = facebook_data.loc[:, ["x", "y", "accuracy", "day", "hour", "weekday"]]
x.head()

Unnamed: 0,x,y,accuracy,day,hour,weekday
163,2.1663,2.3755,84,8,18,3
310,2.3695,2.2034,3,3,17,5
658,2.3236,2.1768,66,6,19,1
1368,2.2613,2.3392,73,4,16,6
1627,2.3331,2.0011,66,7,21,2


In [121]:
# Preview the column that will serve as the prediction target.
facebook_data["place_id"].head()

163     3869813743
310     2636621520
658     7877745055
1368    9775192577
1627    6731326909
Name: place_id, dtype: int64

In [123]:
# Target vector: the place_id of each check-in, index-aligned with the rows of `facebook_data`.
y = facebook_data["place_id"]
y.head()

163     3869813743
310     2636621520
658     7877745055
1368    9775192577
1627    6731326909
Name: place_id, dtype: int64

### To split the data set

In [124]:
# Split features and target into train and test parts. No test_size is passed,
# so scikit-learn's default proportion applies; random_state is fixed so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 22)

In [126]:
# Preview the held-out features; the original row labels are kept, in shuffled order.
x_test.head()

Unnamed: 0,x,y,accuracy,day,hour,weekday
24703810,2.3032,2.1776,87,7,2,2
19445902,2.4898,2.3992,32,7,3,2
18490063,2.1656,2.337,60,6,23,1
7762709,2.4609,2.2669,72,6,3,1
6505956,2.0409,2.4288,46,7,18,2


In [128]:
# Preview the training features.
x_train.head()

Unnamed: 0,x,y,accuracy,day,hour,weekday
5900956,2.3061,2.1863,66,6,14,1
16743069,2.3463,2.3761,62,8,11,3
13825759,2.3866,2.269,16,3,3,5
13612259,2.2796,2.2509,11,3,1,5
848177,2.0402,2.4858,11,1,7,3
