In [3]:
import pandas as pd
import numpy as np

In [4]:
NYC = pd.read_csv('../data/dataset_TSMC2014_NYC.csv')
TKY = pd.read_csv('../data/dataset_TSMC2014_TKY.csv')

In [5]:
NYC[:2]

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012


In [6]:
NYC['utcTimestamp'] = pd.to_datetime(NYC['utcTimestamp'])
TKY['utcTimestamp'] = pd.to_datetime(TKY['utcTimestamp'])

In [7]:
def match_dict(hour):
    #I split a day into 5 pecies,1 means early morning,2 means morning..etc
    split_day_dict = {1:[0,5],2:[5,11],3:[11,13],4:[13,18],5:[18,24]}
    for k,v in split_day_dict.items():
        if hour>=v[0] and hour<=v[1]:
            return k

def time_region_fearures(df):
    df['hour'] = df['utcTimestamp'].dt.hour
    df['month'] = df['utcTimestamp'].dt.month
    df['weekday'] = df['utcTimestamp'].dt.weekday
    df['week'] = df['utcTimestamp'].dt.week
    df['split_day'] = df['hour'].apply(lambda x:match_dict(x))
    df['weekofyear'] = df['utcTimestamp'].dt.weekofyear
    df['dayofweek'] = df['utcTimestamp'].dt.dayofweek
    df['weekofyear'] = df['utcTimestamp'].dt.weekofyear
    df['ismonthstart'] = df['utcTimestamp'].dt.is_month_start
    df['ismonthend'] = df['utcTimestamp'].dt.is_month_end
    
    
    return df

In [8]:
NYC = time_region_fearures(NYC)
TKY = time_region_fearures(TKY)

In [9]:
NYC[:4]

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,hour,month,weekday,week,split_day,weekofyear,dayofweek,ismonthstart,ismonthend
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,2012-04-03 18:00:09,18,4,1,14,4,14,1,False,False
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,2012-04-03 18:00:25,18,4,1,14,4,14,1,False,False
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,2012-04-03 18:02:24,18,4,1,14,4,14,1,False,False
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,2012-04-03 18:02:41,18,4,1,14,4,14,1,False,False


In [10]:
def frequence_feature(df):
    frequence_weekday = df.groupby(['userId','weekday','venueCategory']).count()
    frequence_weekday = frequence_weekday.reset_index()[['userId','weekday','venueCategory','venueId']]
    frequence_weekday = frequence_weekday.rename(columns={'venueId':'frequence_weekday'})

    frequence_split_day = df.groupby(['userId','split_day','venueCategory']).count()
    frequence_split_day = frequence_split_day.reset_index()[['userId','split_day','venueCategory','venueId']]
    frequence_split_day = frequence_split_day.rename(columns={'venueId':'frequence_split_day'})


    frequence_month = df.groupby(['userId','month','venueCategory']).count()
    frequence_month = frequence_month.reset_index()[['userId','month','venueCategory','venueId']]
    frequence_month = frequence_month.rename(columns={'venueId':'frequence_month'})
    
    df = df.merge(frequence_weekday,how='left',on=['userId','venueCategory','weekday'])
    df = df.merge(frequence_split_day,how='left',on=['userId','venueCategory','split_day'])
    df = df.merge(frequence_month,how='left',on=['userId','venueCategory','month'])
    
    return df


In [11]:
NYC = frequence_feature(NYC)
TKY = frequence_feature(TKY)

In [12]:
#encode venueCategory
def encode_venueCategory(df):
    unique_venueCategory_vec = np.unique(df['venueCategory'])
    venueCategory_dict = {}
    is_judge ={True:1,False:0}
    for idx,name in enumerate(unique_venueCategory_vec):
        venueCategory_dict[name] = idx
    df['venueCategory_id'] = df['venueCategory'].apply(lambda x:venueCategory_dict[x])
    df['ismonthstart'] = df['ismonthstart'].apply(lambda x:is_judge[x])
    df['ismonthend'] = df['ismonthend'].apply(lambda x:is_judge[x])
    return df,venueCategory_dict

In [13]:
NYC,venue_category_dict_nyc = encode_venueCategory(NYC)
TKY,venue_category_dict_tky = encode_venueCategory(TKY)

## Modeling

In [14]:
NYC = NYC.sort_values(by=['userId','utcTimestamp'])
TKY = TKY.sort_values(by=['userId','utcTimestamp'])

In [15]:
chosed_cols = ['userId','utcTimestamp','hour','month', 'weekday','week',
               'split_day','weekofyear','dayofweek',
               'ismonthstart','ismonthend','frequence_weekday',
               'frequence_split_day','frequence_month','venueCategory_id']
                
def generate_feature(df,chosed_cols):
    df = df[chosed_cols]
    unique_users = np.unique(df['userId'])
    test_set = pd.DataFrame()
    train_set = pd.DataFrame()
    for user in unique_users:
        test_set = test_set.append(df[df['userId']==user][-1:])
        train_set = train_set.append(df[df['userId']==user][:])
    return test_set,train_set

In [16]:
testset_nyc,trainset_nyc = generate_feature(NYC,chosed_cols)
testset_tky,trainset_tky = generate_feature(TKY,chosed_cols)

## model

In [17]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [18]:
def train(model,X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    classif = OneVsRestClassifier(model)
    classif.fit(X_train, y_train)
    y_pred = classif.predict(X)
    acc = accuracy_score(y_true=y,y_pred=y_pred)
    
    return acc,classif

## NYC

In [20]:
#prepare
NYC_train_Timestamp = trainset_nyc['utcTimestamp']
NYC_train = trainset_nyc.drop('utcTimestamp',axis=1)
NYC_test_Timestamp = testset_nyc['utcTimestamp']
NYC_test = testset_nyc.drop('utcTimestamp',axis=1)
y_nyc = NYC_train['venueCategory_id']
X_nyc = NYC_train.drop('venueCategory_id',axis=1)

show_nyc = testset_nyc
# train svm
svm = SVC(kernel='linear')
#rf_nyc = RandomForestClassifier(n_jobs=-1)
#rf_acc_nyc,rf_nyc = train(rf_nyc,X_nyc,y_nyc)
print("random forest accuracy_score on NYC is {}".format(rf_acc_nyc))

#predict N+1
test_y_nyc = NYC_test['venueCategory_id']
test_X_nyc = NYC_test.drop('venueCategory_id',axis=1)

rf_pred_nyc = rf_nyc.predict(test_X_nyc)
rf_test_acc_nyc = accuracy_score(test_y_nyc,rf_pred_nyc)
print('N+1 predict on NYC accuracy score is {}'.format(rf_test_acc_nyc))
show_nyc['predict'] = rf_pred_nyc
venue_category_dict_nyc_ = dict(zip(venue_category_dict_nyc.values(), venue_category_dict_nyc.keys()))
show_nyc['predict'] = show_nyc['predict'].apply(lambda x:venue_category_dict_nyc_[x]) 
show_nyc['venueCategory'] = show_nyc['venueCategory_id'] .apply(lambda x:venue_category_dict_nyc_[x])

print('show n+1 result')
print(show_nyc[['userId','utcTimestamp','venueCategory','predict']])

random forest accuracy_score on NYC is 0.8013964859208189
N+1 predict on NYC accuracy score is 0.76269621421976
show n+1 result
        userId        utcTimestamp              venueCategory  \
196135       1 2012-12-15 00:13:02                    Airport   
226902       2 2013-02-13 19:16:40                      Beach   
222989       3 2013-02-06 00:49:03                 Restaurant   
227121       4 2013-02-14 00:37:40                     Church   
179298       5 2012-11-20 00:44:50          Food & Drink Shop   
227323       6 2013-02-14 09:43:16              Deli / Bodega   
225933       7 2013-02-12 13:45:02                     School   
226044       8 2013-02-12 17:00:08                        Bar   
219939       9 2013-02-01 02:27:59           Asian Restaurant   
171443      10 2012-11-05 18:07:19            Thai Restaurant   
227377      11 2013-02-16 01:35:22                 Restaurant   
224759      12 2013-02-10 17:31:34          Food & Drink Shop   
214414      13 2013-01-18 1

In [31]:
nyc_parameters = rf_nyc.get_params()
vec = []
for key in nyc_parameters.keys():
    tmp = [key,nyc_parameters[key]]
    vec.append(tmp)
vec = pd.DataFrame(vec,columns=['parameter','value'])
vec

Unnamed: 0,parameter,value
0,estimator__bootstrap,True
1,estimator__class_weight,
2,estimator__criterion,gini
3,estimator__max_depth,
4,estimator__max_features,auto
5,estimator__max_leaf_nodes,
6,estimator__min_impurity_decrease,0
7,estimator__min_impurity_split,
8,estimator__min_samples_leaf,1
9,estimator__min_samples_split,2


In [24]:
show_nyc[['userId','utcTimestamp','venueCategory','predict']]

Unnamed: 0,userId,utcTimestamp,venueCategory,predict
196135,1,2012-12-15 00:13:02,Airport,Clothing Store
226902,2,2013-02-13 19:16:40,Beach,Beach
222989,3,2013-02-06 00:49:03,Restaurant,Restaurant
227121,4,2013-02-14 00:37:40,Church,Church
179298,5,2012-11-20 00:44:50,Food & Drink Shop,Food & Drink Shop
227323,6,2013-02-14 09:43:16,Deli / Bodega,Deli / Bodega
225933,7,2013-02-12 13:45:02,School,School
226044,8,2013-02-12 17:00:08,Bar,School
219939,9,2013-02-01 02:27:59,Asian Restaurant,Building
171443,10,2012-11-05 18:07:19,Thai Restaurant,Thai Restaurant


## TKY

In [22]:
#prepare
TKY_train_Timestamp = trainset_tky['utcTimestamp']
TKY_train = trainset_tky.drop('utcTimestamp',axis=1)
TKY_test_Timestamp = testset_tky['utcTimestamp']
TKY_test = testset_tky.drop('utcTimestamp',axis=1)
y_tky = TKY_train['venueCategory_id']
X_tky = TKY_train.drop('venueCategory_id',axis=1)
show_tky = testset_tky
# train svm
svm = SVC(kernel='linear')
rf_tky = RandomForestClassifier(n_jobs=-1)
rf_acc_tky,rf_tky = train(rf_tky,X_tky,y_tky)
print("rf accuracy_score on TKY is {}".format(rf_acc_tky))

test_y_tky = TKY_test['venueCategory_id']
test_X_tky = TKY_test.drop('venueCategory_id',axis=1)

rf_pred_tky = rf_tky.predict(test_X_tky)
rf_test_acc_tky = accuracy_score(test_y_tky,rf_pred_tky)
print('N+1 predict on TKY accuracy score is {}'.format(rf_test_acc_tky))
show_tky['predict'] = rf_pred_tky
venue_category_dict_tky_ = dict(zip(venue_category_dict_tky.values(), venue_category_dict_tky.keys()))
show_tky['predict'] = show_tky['predict'].apply(lambda x:venue_category_dict_tky_[x]) 
show_tky['venueCategory'] = show_tky['venueCategory_id'] .apply(lambda x:venue_category_dict_tky_[x])

print(show_tky[['userId','utcTimestamp','venueCategory','predict']])

rf accuracy_score on TKY is 0.844199873453686
N+1 predict on TKY accuracy score is 0.7802006105538596
        userId        utcTimestamp              venueCategory  \
536337       1 2013-01-26 12:44:42              Train Station   
571615       2 2013-02-14 00:25:44                     Office   
573011       3 2013-02-14 10:01:19                       Park   
572950       4 2013-02-14 09:47:56              Movie Theater   
573562       5 2013-02-14 12:23:47      Ramen /  Noodle House   
568714       6 2013-02-12 21:57:03              Train Station   
573309       7 2013-02-14 11:07:31      Ramen /  Noodle House   
573606       8 2013-02-16 01:46:12              Train Station   
228487       9 2012-06-18 14:30:17              Train Station   
571387      10 2013-02-13 22:59:15              Train Station   
553872      11 2013-02-03 09:44:49                     Subway   
572628      12 2013-02-14 08:36:59       Fast Food Restaurant   
439208      13 2012-12-03 03:56:19                  B