In [None]:
# from google.colab import drive
# drive.mount('/content/drive') 

In [None]:
# path = '/content/drive/MyDrive/PRML_Data_contest_1/'
# Directory prefix for every input CSV. It is joined to file names with plain
# string concatenation (path + "tours.csv"), so it must end with a slash.
# The commented-out value above appears to be the Google Drive location used
# together with the disabled Colab mount cell.
path = '../../data/'
import pandas as pd
import numpy as np
import sklearn
from math import radians, cos, sin, asin, sqrt 
from sklearn.preprocessing import LabelEncoder
# from tqdm.notebook import tqdm
# tqdm.pandas()
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold,train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score
from lightgbm import LGBMClassifier
import warnings
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import lightgbm as lgb
# Make pandas treat +/-inf as missing; the ratio features built below divide
# by columns that can be zero.
pd.set_option('use_inf_as_na', True)
# The merged frames are very wide (the dtypes output reports 140 columns), so
# raise the display limit to keep .head() from eliding columns.
pd.set_option('display.max_columns',170)
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Functions

In [None]:
def encoder(df,cols,df_type='train', encoders=None):
  """Label-encode the non-null values of ``cols`` on a copy of ``df``.

  Parameters
  ----------
  df : DataFrame containing the columns to encode; it is never modified.
  cols : iterable of column names to encode.
  df_type : 'train' fits a fresh LabelEncoder per column; 'test' re-uses the
      fitted encoders supplied through ``encoders``.
  encoders : dict mapping column name -> fitted encoder (anything exposing
      ``transform``). Only read in 'test' mode; rebuilt from scratch in 'train'
      mode.

  Returns
  -------
  ``(encoders, encoded_df)`` in 'train' mode, ``encoded_df`` alone in 'test'
  mode. Rows that were null in a column stay null after encoding.

  Raises
  ------
  ValueError : ``df_type`` is neither 'train' nor 'test' (previously the
      function fell through and returned None).
  KeyError : 'test' mode and a column has no entry in ``encoders``.

  NOTE(review): sklearn's LabelEncoder is expected to reject labels it was not
  fitted on, so 'test' frames containing unseen categories will fail here.
  """
  if df_type not in ('train', 'test'):
    raise ValueError("df_type must be 'train' or 'test', got {!r}".format(df_type))

  df = df.copy()

  if df_type=='train':
    encoders = dict()
    for col_name in cols:
      series = df[col_name]
      present = series[series.notnull()]
      label_encoder = LabelEncoder()
      # Assigning an index-aligned Series leaves the null rows as NaN.
      df[col_name] = pd.Series(
          label_encoder.fit_transform(present),
          index=present.index
      )
      encoders[col_name] = label_encoder
    return encoders, df

  # 'test' mode: a None default avoids sharing one mutable dict between calls.
  if encoders is None:
    encoders = {}
  for col_name in cols:
    series = df[col_name]
    present = series[series.notnull()]
    df[col_name] = pd.Series(
        encoders[col_name].transform(present),
        index=present.index
    )
  return df

def acceptance(x, category='going'):
  """Share of a tour's invitees that also appear in the ``category`` list.

  ``x`` is one row of tour_convoy; ``x[category]`` and ``x.invited`` are
  space-separated id strings. Returns 0 when the category list is empty,
  1 when nobody was invited, otherwise |category ∩ invited| / |invited|.
  """
  responders = set(x[category].split(' '))
  invitees = set(x.invited.split(' '))
  if x[category]=='':
    return 0
  if x.invited=='':
    return 1
  overlap = responders & invitees
  return len(overlap)/len(invitees)


def distance(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points given in degrees.

    The result is scaled by a sphere radius of 6371 (Earth's mean radius in
    kilometres), so the distance is in kilometres.
    """
    phi1, phi2 = radians(lat1), radians(lat2)
    lam1, lam2 = radians(lon1), radians(lon2)
    half_dlat = (phi2 - phi1) / 2
    half_dlon = (lam2 - lam1) / 2
    # Haversine term: squared half-chord length between the two points.
    hav = sin(half_dlat)**2 + cos(phi1) * cos(phi2) * sin(half_dlon)**2
    earth_radius = 6371
    return 2 * asin(sqrt(hav)) * earth_radius
def num_friend(biker,tour,category='going'):
  """Count how many of ``biker``'s friends appear in ``tour``'s ``category`` list.

  Reads the module-level ``bikers_network`` (friends per biker) and
  ``tour_convoy`` (response lists per tour); both store space-separated id
  strings. Returns 0 when either list is empty.

  NOTE(review): both tables are re-indexed on every call, which is costly when
  this runs once per row via DataFrame.apply.
  """
  friend_ids = set(bikers_network.set_index('biker_id').loc[biker].friends.split(' '))
  listed_ids = set(tour_convoy.set_index('tour_id').loc[tour][category].split(' '))
  # ''.split(' ') yields [''], so an empty list shows up as the set {''}.
  if friend_ids=={''} or listed_ids=={''}:
    return 0
  return len(friend_ids & listed_ids)

def friends_going_ratio(df, category='going'):
  """Fraction of a biker's responding friends that fall in ``category``.

  ``df`` is a single row (anything indexable by column name) carrying
  ``num_friends_going``, ``num_friends_maybe`` and ``num_friends_not_going``.

  Returns 0 when no friend responded at all, otherwise
  count(category) / (going + maybe + not_going).

  Raises
  ------
  ValueError : ``category`` is not 'going', 'maybe' or 'not_going'. Previously
      this left the numerator unbound and surfaced as an UnboundLocalError, or
      as a silent 0 when the total happened to be 0.
  """
  count_columns = {
      'going': 'num_friends_going',
      'maybe': 'num_friends_maybe',
      'not_going': 'num_friends_not_going',
  }
  if category not in count_columns:
    raise ValueError("category must be one of {}, got {!r}".format(sorted(count_columns), category))

  total = (df['num_friends_going']+df['num_friends_maybe']+df['num_friends_not_going'])
  if total==0:
    return 0
  return df[count_columns[category]]/total


def _ranked_tours_per_biker(scored):
  """Turn a frame with biker_id, tour_id and probab into one row per biker.

  For each biker the tours are ordered by descending ``probab`` and joined into
  a single space-separated string (ids are passed through ``str`` so
  non-string ids do not break the join).
  """
  grouped = scored.groupby('biker_id')
  rows = []
  for biker in grouped.groups:
    ranked = grouped.get_group(biker).sort_values('probab', ascending=False)
    rows.append((biker, ' '.join(map(str, ranked.tour_id))))
  return pd.DataFrame(rows, columns=['biker_id', 'tour_id'])

def get_order(clf,test_pred,test_features):
  """Rank each biker's tours using a classifier exposing ``predict_proba``.

  Column 1 of ``clf.predict_proba(test_pred)`` is used as the score.
  ``test_pred`` must be row-aligned with ``test_features``, which needs
  ``biker_id`` and ``tour_id`` columns. Side effect: a ``probab`` column is
  written into ``test_features`` (pass a copy to keep the original clean).

  Returns a DataFrame with columns ``biker_id`` and ``tour_id``, the latter
  holding the space-separated ranking.
  """
  test_features['probab'] = clf.predict_proba(test_pred)[:,1]
  return _ranked_tours_per_biker(test_features)

def get_order_gbm(clf,test_pred,test_features):
  """Same as ``get_order`` but scores with ``clf.predict(test_pred)`` directly.

  Intended for models whose ``predict`` already returns one score per row.
  Writes the same ``probab`` column into ``test_features``.
  """
  test_features['probab'] = clf.predict(test_pred)
  return _ranked_tours_per_biker(test_features)

def correct_dates(x):
  """Clamp an out-of-range 'day-month-year' string to a valid calendar date.

  The year is capped at 2019 and the month at 12. The day is capped at the
  real length of the resulting month (28/29 for February depending on leap
  year, 30 or 31 otherwise). Previously every non-February day was capped at
  30 and February at 28, which also rewrote valid dates such as 31 October.

  Returns the date as 'd-m-yyyy' without zero padding, since the parts are
  re-rendered from integers.
  """
  parts = x.split('-')
  day = int(parts[0])
  month = int(parts[1])
  year = int(parts[2])

  year = min(year,2019)
  month = min(month,12)

  if month==2:
    is_leap = year%4==0 and (year%100!=0 or year%400==0)
    last_day = 29 if is_leap else 28
  elif month in (4,6,9,11):
    last_day = 30
  else:
    last_day = 31
  day = min(day,last_day)

  return '-'.join([str(day),str(month),str(year)])


# Pre-processing

## Tours

In [None]:
# Load the tours table and rename the columns that would otherwise clash with
# the biker columns of the same name after merging.
tours = pd.read_csv(path+"tours.csv",parse_dates=True).rename(
    columns={'biker_id': 'organizer_id','latitude':'tour_latitude','longitude':'tour_longitude'}
)

# Repair out-of-range dates first, then parse them as day-first dates.
tours['tour_date'] = tours['tour_date'].apply(correct_dates)
tours['tour_date'] = pd.to_datetime(tours['tour_date'],dayfirst=True)
tour_dates = tours['tour_date'].dt
tours['tour_month'] = tour_dates.month
tours['tour_day_of_week'] = tour_dates.weekday
tours['tour_quarter'] = tour_dates.quarter

# Names of the w1..w100 columns (presumably counts of the 100 tracked words;
# w_other holds everything else).
l = ['w{}'.format(i) for i in range(1,101)]

imp_words = tours[l].sum(axis=1)
other_words = tours['w_other']
tours['imp_word_count'] = imp_words
tours['total_word_count'] = tours['imp_word_count']+other_words
tours['imp_word_ratio'] = tours['imp_word_count']/tours['total_word_count']
tours['imp_word/w_other'] = tours['imp_word_count']/other_words
# Longitude * 4, rounded; presumably an offset in minutes (4 minutes per
# degree) comparable to the bikers' time_zone column.
tours['tour_timezone'] = np.round(tours['tour_longitude']*4,0)

In [None]:
# Preview the tours table with the derived date and word-count columns.
tours.head()

Unnamed: 0,tour_id,organizer_id,tour_date,city,state,pincode,country,tour_latitude,tour_longitude,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20,w21,w22,w23,w24,w25,w26,w27,w28,w29,w30,w31,w32,w33,w34,w35,w36,w37,w38,w39,w40,w41,w42,w43,w44,w45,w46,w47,w48,w49,w50,w51,w52,w53,w54,w55,w56,w57,w58,w59,w60,w61,w62,w63,w64,w65,w66,w67,w68,w69,w70,w71,w72,w73,w74,w75,w76,w77,w78,w79,w80,w81,w82,w83,w84,w85,w86,w87,w88,w89,w90,w91,w92,w93,w94,w95,w96,w97,w98,w99,w100,w_other,tour_month,tour_day_of_week,tour_quarter,imp_word_count,total_word_count,imp_word_ratio,imp_word/w_other,tour_timezone
0,VX4921758,DG47864012,2012-10-30,,,,,,,2,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,9,10,1,4,7,16,0.4375,0.777778,
1,RT4999119,DE76440521,2012-11-03,,,,,,,2,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,11,5,4,10,17,0.588235,1.428571,
2,SY28440935,FB7514445,2012-11-05,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,11,0,4,2,14,0.142857,0.166667,
3,RU82345152,HI1585781,2012-10-30,,,,,,,1,0,2,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,10,1,4,8,16,0.5,1.0,
4,QP51165850,BA16098580,2012-09-27,,,,,,,1,1,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,1,2,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,3,3,15,24,0.625,1.666667,


## Tour Convoy

In [None]:
# Load the per-tour response lists. going / maybe / invited / not_going each
# hold a space-separated string of biker ids; missing lists become ''.
tour_convoy = pd.read_csv(path+"tour_convoy.csv")
tour_convoy.fillna('',inplace=True)

# Count the ids in each list. ''.split(' ') is [''] (length 1), so an empty
# list must be special-cased to 0. The previous expression counted it as 1.
# This is the same rule used for bikers_network['num_friends'].
for response in ('maybe', 'going', 'invited', 'not_going'):
  tour_convoy['num_'+response] = tour_convoy[response].apply(
      lambda ids: len(ids.split(' ')) if ids!='' else 0
  )

# total_num deliberately excludes invitees: it counts actual responses only.
tour_convoy['total_num'] = tour_convoy['num_maybe'] + tour_convoy['num_going'] + tour_convoy['num_not_going']
# Tours without any response now give 0/0 = NaN for both rates.
tour_convoy['going_rate'] = tour_convoy['num_going']/tour_convoy['total_num']
tour_convoy['not_going_rate'] = tour_convoy['num_not_going']/tour_convoy['total_num']

In [None]:
# Preview the response lists next to their derived counts and rates.
tour_convoy.head()

Unnamed: 0,tour_id,going,maybe,invited,not_going,num_maybe,num_going,num_invited,num_not_going,total_num,going_rate,not_going_rate
0,QQ59822043,BJ75964455 CF2302513 EC26086795 DI05886383 BE2...,CH33420590 FB7546982 BD50834692 FD2087573 FI31...,BH23091036 DH95873583 EB09144917 DF60622906 DB...,DF75574655 BA77296663,7,7,70,2,16,0.4375,0.125
1,VX6467261,CD94228942 CG86116898 BA56558062 DH92942231 EB...,BE98184352 GE5689144 DH70076778 DD1335845 EC39...,BH88073374 HD3302094 BI30571649 GH6508092 HA81...,,8,11,75,1,20,0.55,0.05
2,QQ86208412,,DD20380166 DI10793697,BD79121209 EE0668682,BH28988561 CJ50720854,2,1,2,2,5,0.2,0.4
3,RV21578336,,,,,1,1,1,1,3,0.333333,0.333333
4,XU5842686,CE06118796 DF50897984 CJ4255260 BB25817077 BA9...,CG71721559 BH61448345 CD56975806 CG66669465 BA...,BF18670705 II0919237 CD26414227 CG73818347 DD2...,DF00235232,6,6,10,1,13,0.461538,0.076923


## Bikers

In [None]:
# Biker locations; the stray 'Unnamed: 0' column from the CSV is dropped.
locations = pd.read_csv(path+'locations.csv').drop('Unnamed: 0',axis=1)

# Attach each biker's location and prefix the geo/time-zone columns with
# 'biker_' so they stay distinct from the tour columns after merging.
# The merge uses the default join type, so bikers without a row in
# `locations` are dropped here.
bikers = (
    pd.read_csv(path+'bikers.csv')
      .merge(locations, on='biker_id')
      .rename(columns={'latitude':'biker_latitude','longitude':'biker_longitude','time_zone':'biker_timezone'})
)

In [None]:
# Preview the biker profiles joined with their locations.
bikers.head()

Unnamed: 0,biker_id,language_id,location_id,bornIn,gender,member_since,area,biker_timezone,biker_latitude,biker_longitude
0,DB97468391,id,ID,1993,male,02-10-2012,Medan Indonesia,480.0,3.589999,98.678017
1,DF37982273,id,ID,1992,male,29-09-2012,Medan Indonesia,420.0,3.589999,98.678017
2,IC3183725,en,US,1975,male,06-10-2012,Stratford Ontario,-240.0,43.372967,-80.97509
3,BI72223848,en,US,1991,female,04-11-2012,Tehran Iran,210.0,35.686398,51.432858
4,DE29017717,id,ID,1995,female,10-09-2012,,420.0,,


## Bikers Network

In [None]:
# Friend lists per biker: 'friends' is a space-separated string of biker ids,
# with missing lists replaced by ''.
bikers_network = pd.read_csv(path+"bikers_network.csv")
bikers_network.fillna('',inplace=True)

# Number of friends; the empty string counts as zero friends.
friend_lists = bikers_network['friends']
bikers_network['num_friends'] = friend_lists.apply(
    lambda ids: 0 if ids=='' else len(ids.split(' '))
)

In [None]:
# Preview the friend lists and the derived friend counts.
bikers_network.head()

Unnamed: 0,biker_id,friends,num_friends
0,DB97468391,BD46449342 DI73244116 EC26080662 BC22907620 FE...,3624
1,DF37982273,BE91560444 DJ5798035 CA36380346 IJ9375619 DF34...,1468
2,IC3183725,BE84954627 BJ50387873 BG52977611 EB85960823 EC...,97
3,BI72223848,ID361640 HC3814682 FF7944478 BH24049724 CF3059...,14
4,DE29017717,EC53303705 CB30310957 BI38389374 DJ28735761 HB...,1137


# Creating Train and Test

In [None]:
# Labelled (biker, tour) pairs and the unlabelled pairs to rank.
train = pd.read_csv(filepath_or_buffer=path+"train.csv",parse_dates=True)
test = pd.read_csv(filepath_or_buffer=path+"test.csv")

In [None]:
def merge_dfs(df,tours=tours,bikers=bikers,bikers_network=bikers_network,tour_convoy=tour_convoy):
  """Left-join tour, biker, friend-count and convoy columns onto ``df``.

  ``df`` needs ``tour_id`` and ``biker_id`` columns. All joins are left joins,
  so every row of ``df`` is kept and unmatched lookups become NaN. Only
  ``num_friends`` is taken from ``bikers_network`` and only the numeric
  summary columns from ``tour_convoy``; the raw id strings are left out.

  The defaults are bound to the module-level tables at definition time, so
  this cell has to be re-run after those tables are rebuilt.
  """
  convoy_columns = ['tour_id', 'num_maybe', 'num_going', 'num_invited', 'num_not_going',
                    'total_num', 'going_rate', 'not_going_rate']
  friend_counts = bikers_network[['biker_id','num_friends']]
  return (
      df.merge(tours, on='tour_id',how='left')
        .merge(bikers, on='biker_id',how='left')
        .merge(friend_counts, on='biker_id', how='left')
        .merge(tour_convoy[convoy_columns],on='tour_id',how='left')
  )

In [None]:
# Build the merged training table and cache it on disk for the feature step.
train_df_merge = merge_dfs(df=train)
train_df_merge.to_csv(path_or_buf='train_df_merge.csv',index=False)

In [None]:
# Preview the merged training rows.
train_df_merge.head()

Unnamed: 0,biker_id,tour_id,invited,timestamp,like,dislike,organizer_id,tour_date,city,state,pincode,country,tour_latitude,tour_longitude,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20,w21,w22,w23,w24,w25,w26,w27,w28,w29,w30,w31,w32,w33,w34,w35,w36,w37,w38,w39,w40,w41,w42,w43,w44,w45,w46,w47,w48,w49,w50,w51,w52,w53,w54,w55,w56,w57,w58,w59,w60,w61,w62,w63,w64,w65,w66,w67,w68,w69,w70,w71,w72,w73,w74,w75,w76,w77,w78,w79,w80,w81,w82,w83,w84,w85,w86,w87,w88,w89,w90,w91,w92,w93,w94,w95,w96,w97,w98,w99,w100,w_other,tour_month,tour_day_of_week,tour_quarter,imp_word_count,total_word_count,imp_word_ratio,imp_word/w_other,tour_timezone,language_id,location_id,bornIn,gender,member_since,area,biker_timezone,biker_latitude,biker_longitude,num_friends,num_maybe,num_going,num_invited,num_not_going,total_num,going_rate,not_going_rate
0,DA44012,QY18771225,0,02-10-2012 15:53:05,0,0,EB06419938,2012-10-03,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,10,2,4,0,2,0.0,0.0,,id,ID,1990,male,02-10-2012,Binjai,480.0,3.699836,98.429443,863,2,8,23,25,35,0.228571,0.714286
1,DA44012,QU02284248,0,02-10-2012 15:53:05,0,0,CA16654644,2012-10-03,Yogyakarta,,,Indonesia,-7.767,110.363,2,0,0,0,2,3,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,10,2,4,14,38,0.368421,0.583333,441.0,id,ID,1990,male,02-10-2012,Binjai,480.0,3.699836,98.429443,863,6,10,122,2,18,0.555556,0.111111
2,DA44012,RU29072432,0,02-10-2012 15:53:05,1,0,DG39934255,2012-10-26,Medan,,,Indonesia,3.567,98.65,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,37,10,4,4,9,46,0.195652,0.243243,395.0,id,ID,1990,male,02-10-2012,Binjai,480.0,3.699836,98.429443,863,154,212,3844,137,503,0.421471,0.272366
3,DA44012,SP72478280,0,02-10-2012 15:53:05,0,0,JH461525,2012-10-06,,,,,34.017,71.583,1,2,1,1,0,0,2,0,0,1,1,1,2,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,56,10,5,4,30,86,0.348837,0.535714,286.0,id,ID,1990,male,02-10-2012,Binjai,480.0,3.699836,98.429443,863,6,8,9,1,15,0.533333,0.066667
4,DA44012,QS90707377,0,02-10-2012 15:53:05,0,0,DG39934255,2012-10-06,Medan,,,Indonesia,3.607,98.653,2,0,0,0,0,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,79,10,5,4,12,91,0.131868,0.151899,395.0,id,ID,1990,male,02-10-2012,Binjai,480.0,3.699836,98.429443,863,65,83,3814,55,203,0.408867,0.270936


In [None]:
# Check the column types of the merged training table.
train_df_merge.dtypes

biker_id           object
tour_id            object
invited             int64
timestamp          object
like                int64
                   ...   
num_invited         int64
num_not_going       int64
total_num           int64
going_rate        float64
not_going_rate    float64
Length: 140, dtype: object

In [None]:
# Build the merged test table and cache it on disk for the feature step.
test_df_merge = merge_dfs(df=test)
test_df_merge.to_csv(path_or_buf='test_df_merge.csv',index=False)

In [None]:
# Preview the merged test rows.
test_df_merge.head()

Unnamed: 0,biker_id,tour_id,invited,timestamp,organizer_id,tour_date,city,state,pincode,country,tour_latitude,tour_longitude,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20,w21,w22,w23,w24,w25,w26,w27,w28,w29,w30,w31,w32,w33,w34,w35,w36,w37,w38,w39,w40,w41,w42,w43,w44,w45,w46,w47,w48,w49,w50,w51,w52,w53,w54,w55,w56,w57,w58,w59,w60,w61,w62,w63,w64,w65,w66,w67,w68,w69,w70,w71,w72,w73,w74,w75,w76,w77,w78,w79,w80,w81,w82,w83,w84,w85,w86,w87,w88,w89,w90,w91,w92,w93,w94,w95,w96,w97,w98,w99,w100,w_other,tour_month,tour_day_of_week,tour_quarter,imp_word_count,total_word_count,imp_word_ratio,imp_word/w_other,tour_timezone,language_id,location_id,bornIn,gender,member_since,area,biker_timezone,biker_latitude,biker_longitude,num_friends,num_maybe,num_going,num_invited,num_not_going,total_num,going_rate,not_going_rate
0,CG33145288,QX16813281,0,01-11-2012 10:14:42,BB12186589,2012-11-10,,,,,11.529,104.931,7,3,7,1,3,4,2,2,0,3,2,3,5,1,1,0,1,0,1,0,0,2,1,0,0,0,0,1,1,0,0,0,0,0,2,0,1,0,0,4,0,0,0,0,0,0,1,0,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,81,11,5,4,74,155,0.477419,0.91358,420.0,en,US,1993,male,01-11-2012,Phnom Penh,420.0,11.563141,104.865318,2528,180,223,2521,87,490,0.455102,0.177551
1,CG33145288,QR69035551,0,01-11-2012 10:14:08,HA0933835,2013-02-01,Phnom Penh,,,Cambodia,11.569,104.914,2,1,3,0,0,0,2,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,2,1,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,36,2,4,1,33,69,0.478261,0.916667,420.0,en,US,1993,male,01-11-2012,Phnom Penh,420.0,11.563141,104.865318,2528,384,510,9094,1,895,0.569832,0.001117
2,CG33145288,VW3098017,0,01-11-2012 10:14:08,DC74062122,2012-11-04,Phnom Penh,,,Cambodia,11.551,104.929,0,0,2,0,0,2,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,11,6,4,12,42,0.285714,0.4,420.0,en,US,1993,male,01-11-2012,Phnom Penh,420.0,11.563141,104.865318,2528,6,20,454,7,33,0.606061,0.212121
3,EC61865653,RR14608095,0,01-11-2012 02:14:15,EG82098,2012-11-04,,,,,,,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,84,11,6,4,10,94,0.106383,0.119048,,id,ID,1993,male,01-11-2012,Magelang,540.0,-7.468956,110.218559,848,135,382,2927,67,584,0.65411,0.114726
4,CG33145288,RP07279414,0,01-11-2012 10:14:08,HA0933835,2013-11-28,Phnom Penh,,,Cambodia,11.569,104.914,1,0,4,1,1,0,0,1,0,2,0,1,3,0,0,2,0,0,0,0,0,1,0,2,2,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,1,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,28,11,3,4,34,62,0.548387,1.214286,420.0,en,US,1993,male,01-11-2012,Phnom Penh,420.0,11.563141,104.865318,2528,348,571,9064,1,920,0.620652,0.001087


In [None]:
# NOTE(review): this repeats the training-table dtypes check right after the
# test preview; test_df_merge.dtypes was probably intended here — confirm.
train_df_merge.dtypes

biker_id           object
tour_id            object
invited             int64
timestamp          object
like                int64
                   ...   
num_invited         int64
num_not_going       int64
total_num           int64
going_rate        float64
not_going_rate    float64
Length: 140, dtype: object

In [None]:
def add_features(df):
  """Derive the model features from a merged (biker, tour) table.

  ``df`` is the output of ``merge_dfs`` (possibly re-read from CSV). The
  function works on a copy, so the caller's frame is left untouched, and it
  reads the module-level ``bikers_network`` without modifying it.

  Added columns: inform-date parts, inform/tour month and quarter match flags,
  day gaps between tour date and membership/inform dates, biker-tour distance
  and latitude gap, time-zone difference, friend counts per response list,
  weekend flag, age at tour time, share of friends going, and whether the
  organizer is one of the biker's friends.

  Returns the feature frame with the raw date, place and organizer columns
  dropped.

  Fixes relative to the earlier version:
  * ``inform_quarter_is_tour_quarter`` compared the inform quarter with
    ``tour_month``; it now compares with ``tour_quarter``.
  * ``friends_with_organizer`` used substring matching on the friends string,
    so an id that is a prefix/suffix of another id matched; it now tests
    membership among the split ids.
  * ``bikers_network`` is no longer re-indexed in place, so a failure midway
    cannot leave the global table in an indexed state.
  """
  df = df.copy()

  # Non-numeric / missing birth years fall back to 1993 (presumably a typical
  # value in the data — confirm).
  df['bornIn'] = (
      df['bornIn']
        .replace('None','1993')
        .replace('23-May','1993')
        .fillna('1993')
        .astype(int)
  )

  df['tour_date'] = pd.to_datetime(df['tour_date'],dayfirst=True)
  df = df.rename(columns={'timestamp':'biker_inform_date'})
  df['member_since'] = pd.to_datetime(df['member_since'],dayfirst=True)
  df['biker_inform_date'] = pd.to_datetime(df['biker_inform_date'],dayfirst=True)

  # Calendar parts of the moment the biker was informed about the tour.
  df['month_inform'] = df['biker_inform_date'].dt.month
  df['day_inform'] = df['biker_inform_date'].dt.weekday
  df['quarter_inform'] = df['biker_inform_date'].dt.quarter
  df['inform_month_is_tour_month'] = (df['month_inform']==df['tour_month']).astype(int)
  df['inform_quarter_is_tour_quarter'] = (df['quarter_inform']==df['tour_quarter']).astype(int)

  # Gaps in whole days.
  df['member_tour_period'] = (df['tour_date'] - df['member_since']).dt.days
  df['inform_tour_period'] = (df['tour_date'] - df['biker_inform_date']).dt.days

  df['distance_between_biker_tour'] = df.apply(lambda x: distance(x.tour_latitude, x.tour_longitude,x.biker_latitude,x.biker_longitude), axis=1)
  df['time_zone_diff'] = (df['biker_timezone'] - df['tour_timezone'])

  # Friends of the biker found in each response list of the tour. The column
  # order (going, maybe, not_going, invited) is kept stable on purpose.
  for response in ('going', 'maybe', 'not_going', 'invited'):
    df['num_friends_'+response] = df.apply(
        lambda x, response=response: num_friend(x.biker_id, x.tour_id, category=response), axis=1
    )

  df['is_weekend']= df['tour_day_of_week'].apply(lambda x: 1 if x in [5,6] else 0)
  df['lat_diff'] = np.abs(df['tour_latitude'] - df['biker_latitude'])
  df['age'] = df['tour_date'].dt.year- df['bornIn']
  df['friends_going_ratio'] = df.apply(friends_going_ratio, axis=1)

  # Local lookup instead of re-indexing the global table in place.
  friends_by_biker = bikers_network.set_index('biker_id')['friends']
  df['friends_with_organizer'] = df.apply(
      lambda x: 1 if x.organizer_id in friends_by_biker.loc[x.biker_id].split(' ') else 0, axis=1
  )

  drop_cols = ['biker_inform_date','organizer_id','tour_date','city','state','pincode','country','area','member_since']
  df= df.drop(drop_cols, axis=1)
  return df

In [None]:
# Training split: reload the cached merge, derive features, cache the result.
# Reading back from CSV returns the date columns as text; add_features
# re-parses them.
train_df_merge = pd.read_csv('train_df_merge.csv')
train_features = add_features(train_df_merge)
train_features.to_csv('train_features.csv',index=False)

# Test split: same steps.
test_df_merge = pd.read_csv('test_df_merge.csv')
test_features= add_features(test_df_merge)
test_features.to_csv('test_features.csv',index=False)

In [None]:
# Reload the cached feature tables so modelling can start from this cell.
train_features, test_features = (
    pd.read_csv(cached) for cached in ('train_features.csv', 'test_features.csv')
)

# Model

In [None]:
# Columns cast to pandas 'category' dtype before training.
categorical_features = [
    'invited',
    'tour_month',
    'tour_day_of_week',
    'tour_quarter',
    'language_id',
    'location_id',
    'gender',
    'month_inform',
    'day_inform',
    'quarter_inform',
    'inform_month_is_tour_month',
    'inform_quarter_is_tour_quarter',
    'friends_with_organizer',
    'is_weekend',
]
# Subset passed explicitly to LightGBM as categorical_feature for model 1.
categorical_for_selected = [
    'friends_with_organizer',
    'gender',
    'invited',
    'language_id',
    'location_id',
    'tour_day_of_week',
    'tour_month',
    'tour_quarter',
]

In [None]:
# Identifier columns are not features; the two label columns exist only in
# the training table.
drop_test = ['biker_id','tour_id']
drop_val=['like','dislike']

X = train_features.drop(drop_val+drop_test,axis=1)
y = train_features['like']
test = test_features.drop(drop_test,axis=1)

# 80/20 hold-out, stratified on the 'like' label.
X_train,X_val,y_train,y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the label encoders on the training part only, then apply the same
# mapping to the validation and test frames.
label_encoded_cols = ['language_id', 'location_id', 'gender']
encoder_dict,X_train = encoder(X_train,label_encoded_cols,df_type='train')
X_val = encoder(X_val,label_encoded_cols,df_type='test',encoders=encoder_dict)
test = encoder(test,label_encoded_cols,df_type='test',encoders=encoder_dict)

def categorical(df, categorical_features):
  """Return a copy of ``df`` with the listed columns cast to 'category' dtype.

  The input frame is left unchanged; columns not listed keep their dtype.
  """
  category_dtypes = {column: 'category' for column in categorical_features}
  return df.astype(category_dtypes)
# Apply the same categorical cast to all three frames.
X_train, X_val, test = (
    categorical(frame, categorical_features) for frame in (X_train, X_val, test)
)

## Model - 1

In [None]:
# LightGBM datasets for the hold-out split, naming the categorical columns
# explicitly.
d_train = lgb.Dataset(X_train, label=y_train,categorical_feature=categorical_for_selected)
d_val = lgb.Dataset(X_val, label=y_val,categorical_feature=categorical_for_selected)

# Changes relative to the earlier parameter dict:
# * 'metric' is a list instead of a set, so the metric order is the same on
#   every run (set iteration order for strings is not fixed between runs).
# * 'n_estimators': 200 was removed. It is an alias of 'num_iterations', which
#   is set to 5000 here and again via num_boost_round=5000 below, so the two
#   values contradicted each other. The 1000-round early-stopping patience
#   only makes sense with the 5000-round budget.
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': ['binary','auc', 'l2'],
'is_training_metric': True,
'metric_freq': 5,
'num_leaves': 100,
'learning_rate': 0.01,
'feature_fraction': 0.7,
'bagging_fraction': 0.6,
'bagging_freq': 5,
'verbose': 0,
# Training runs on CPU; the gpu_* ids are presumably unused in that mode.
'device' : 'cpu',
'gpu_platform_id' : 0,
'gpu_device_id' : 0,
'num_iterations':5000,
'min_data_in_leaf': 15,
'max_depth': 80,
'random_state':42
}

# Up to 5000 rounds, stopping once the validation metrics have not improved
# for 1000 consecutive rounds.
gbm1 = lgb.train( params, d_train, num_boost_round=5000, valid_sets=d_val, early_stopping_rounds=1000, verbose_eval=False)

In [None]:
# Rank the tours per biker with model 1. A copy is passed because
# get_order_gbm writes a 'probab' column into the frame it receives.
test_features1 = test_features.copy()
submission = get_order_gbm(clf=gbm1, test_pred=test, test_features=test_features1)
submission.to_csv(path_or_buf='CH18B067_CH18B032_1.csv',index=False)

## Model - 2

In [None]:
# LightGBM datasets for the hold-out split. No categorical_feature list is
# given here, so LightGBM's default handling applies (presumably it picks up
# the pandas 'category' columns — confirm).
d_train = lgb.Dataset(X_train, label=y_train)
d_val = lgb.Dataset(X_val, label=y_val)

# Changes relative to the earlier parameter dict:
# * 'metric' is a list instead of a set, so the metric order is the same on
#   every run (set iteration order for strings is not fixed between runs).
# * 'n_estimators': 500 was removed. It is an alias of 'num_iterations', which
#   is set to 5000 here and again via num_boost_round=5000 below, so the two
#   values contradicted each other.
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': ['binary','auc', 'l2'],
'is_training_metric': True,
'metric_freq': 5,
'num_leaves': 113,
'learning_rate': 0.04,
'feature_fraction': 0.5,
# A bagging fraction of 1 uses every row, so bagging_freq has no practical
# effect for this model.
'bagging_fraction': 1,
'bagging_freq': 5,
'verbose': 0,
# Training runs on CPU; the gpu_* ids are presumably unused in that mode.
'device' : 'cpu',
'gpu_platform_id' : 0,
'gpu_device_id' : 0,
'num_iterations':5000,
'min_data_in_leaf': 16,
'max_depth': 52,
'random_state':42 }

# Up to 5000 rounds, stopping once the validation metrics have not improved
# for 1000 consecutive rounds.
gbm2 = lgb.train( params, d_train, num_boost_round=5000,
valid_sets=d_val, early_stopping_rounds=1000, verbose_eval=False)

In [None]:
# Rank the tours per biker with model 2. A copy is passed because
# get_order_gbm writes a 'probab' column into the frame it receives.
test_features2 = test_features.copy()
submission = get_order_gbm(clf=gbm2, test_pred=test, test_features=test_features2)
submission.to_csv(path_or_buf='CH18B067_CH18B032_2.csv',index=False)

# Removing Temporary Files

In [None]:
import os

# Delete the intermediate CSV caches written earlier in the notebook.
# os.remove raises if a file is already gone, e.g. when this cell is run twice.
for temp_csv in ('train_df_merge.csv', 'test_df_merge.csv',
                 'train_features.csv', 'test_features.csv'):
  os.remove(temp_csv)