In [None]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from collections import Counter

In [None]:
# Directory holding the competition CSV files (Google Drive mount path).
folder_path = '/content/drive/MyDrive/python_data/kaggle/airbnb/data/'
# File names; later cells pick them by position in this list.
files = [
    'age_gender_bkts.csv',
    'countries.csv',
    'sessions.csv',
    'test_users.csv',
    'train_users_2.csv',
]

In [None]:
# `files` is ordered: age/gender buckets, countries, sessions, test users, train users.
age_gender, countries, sessions, test_df, train_df = (
    pd.read_csv(folder_path + file_name) for file_name in files
)

In [None]:
# Display the full countries table read from countries.csv.
countries

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610.0,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
9,US,36.966427,-95.84403,0.0,9826675.0,eng,0.0


In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# EDA

In [None]:
# Preview the first three rows of the training users table.
train_df.head(3)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US


## Session

In [None]:
# Preview the first three rows of the session log.
sessions.head(3)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0


## Country

In [None]:
# countries

In [None]:
# age_gender

## 確認target 是否有data imbalance問題
1. 6成NDF，3成US
2. Evaluation metrics：NDCG
3. data imbalance 問題

In [None]:
# Share of each destination class, in percent of all training rows.
print('目的地占比 %')
target_counts = train_df.iloc[:, -1].value_counts()  # last column is the target
target_counts / len(train_df) * 100

目的地占比 %


NDF      58.347349
US       29.222632
other     4.728954
FR        2.353233
IT        1.328174
GB        1.088774
ES        1.053638
CA        0.669006
DE        0.497070
NL        0.356991
AU        0.252517
PT        0.101663
Name: country_destination, dtype: float64

In [None]:
import matplotlib.pyplot as plt

# plt.hist() cannot be called without data; the target is categorical, so a bar
# chart of per-class counts is used to visualise the class imbalance instead.
fig, ax = plt.subplots()
train_df['country_destination'].value_counts().plot.bar(ax=ax)
ax.set(title='Destination distribution (train)',
       xlabel='country_destination',
       ylabel='number of users')
plt.show()

# Session

In [None]:
# Column dtypes and memory footprint of the session log.
sessions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10567737 entries, 0 to 10567736
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        object 
 1   action         object 
 2   action_type    object 
 3   action_detail  object 
 4   device_type    object 
 5   secs_elapsed   float64
dtypes: float64(1), object(5)
memory usage: 483.8+ MB


In [None]:
print(len(sessions))
# Missing categorical fields become the literal string 'NaN'; missing durations become 0.
session_fill_values = {'action_type': 'NaN', 'action_detail': 'NaN', 'secs_elapsed': 0}
sessions.fillna(value=session_fill_values, inplace=True)
sessions.head(3)

10567737


Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0


In [None]:
# new_session = pd.merge(sessions, train_df, left_on='user_id', right_on= 'id')[['user_id', 'action', 'action_type', 'action_detail', 'device_type', 'secs_elapsed', 'country_destination']]

In [None]:
# new_session.groupby(['action_detail'])['country_destination'].value_counts()
# pd.get_dummies(sessions)
# Distinct users in the training table, then distinct values per session column.
print(train_df['id'].nunique())
for session_col in ('user_id', 'action', 'action_type', 'action_detail'):
  print(sessions[session_col].nunique())

213451
135483
359
11
156


In [None]:
# sessions['action_detail'].apply(lambda x: x.split('_')[0])
# clean_session = sessions[['user_id', 'action_detail']].dropna().drop_duplicates()
# user_action_feature = pd.get_dummies(clean_session, columns= ['action_detail']).groupby(['user_id']).max()

In [None]:
def _most_common_value(values):
  """Return the most frequent entry of ``values`` as ranked by Counter.most_common."""
  return Counter(values).most_common(1)[0][0]


# One row per user: number of action_detail entries, the most frequent
# action_detail, and mean / std of secs_elapsed. Any NaN left by the
# aggregation is replaced with 0.
session_feature = (
    sessions
    .groupby(['user_id'])
    .agg(
        action_detail_count=('action_detail', 'count'),
        action_detail_mode=('action_detail', _most_common_value),
        secs_mean=('secs_elapsed', 'mean'),
        secs_std=('secs_elapsed', 'std'),
    )
    .fillna(0)
)

In [None]:
# Min-max scale the three numeric session features.
numeric_session_cols = ['action_detail_count', 'secs_mean', 'secs_std']
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(session_feature[numeric_session_cols])

In [None]:
# Write the scaled columns back; column order matches the order passed to the scaler.
for col_idx, col_name in enumerate(['action_detail_count', 'secs_mean', 'secs_std']):
  session_feature[col_name] = normalized_data[:, col_idx]

### 需要補missing value & normalize

In [None]:
# Check dtypes and non-null counts of the per-user session features.
session_feature.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135483 entries, 00023iyk9l to zzzlylp57e
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   action_detail_count  135483 non-null  float64
 1   action_detail_mode   135483 non-null  object 
 2   secs_mean            135483 non-null  float64
 3   secs_std             135483 non-null  float64
dtypes: float64(3), object(1)
memory usage: 5.2+ MB


In [None]:
# Preview the first three rows of the per-user session features.
session_feature.head(3)

Unnamed: 0_level_0,action_detail_count,action_detail_mode,secs_mean,secs_std
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00023iyk9l,0.014333,p3,0.023297,0.07326
0010k6l0om,0.022786,,0.009996,0.018169
001wyh0pz8,0.032709,view_search_results,0.003376,0.005259


In [None]:
# new_session.groupby(['action_detail'])['country_destination'].apply(lambda x: x.value_counts().head(3)).tail(20)

## date_first_booking 缺值 -> NDF, 沒缺 -> 沒有NDF

In [None]:
# Destination counts among users whose first-booking date is missing.
no_booking_mask = train_df['date_first_booking'].isnull()
train_df.loc[no_booking_mask, 'country_destination'].value_counts()

NDF    124543
Name: country_destination, dtype: int64

## Age 補值？

In [None]:
# Percentage of training rows without an age value.
missing_age_train = int(train_df['age'].isnull().sum())
print(round(missing_age_train * 100 / len(train_df), 2), '% of train data do not have age value')

41.22 % of train data do not have age value


In [None]:
# train_df.info()

## Test data

In [None]:
# Preview the first three rows of the test users table.
test_df.head(3)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome


In [None]:
# test_df.info()

In [None]:
# Count the non-missing date_first_booking values in the test set.
test_df['date_first_booking'].value_counts()

Series([], Name: date_first_booking, dtype: int64)

In [None]:
# Percentage of test rows without an age value.
missing_age_test = int(test_df['age'].isnull().sum())
print(round(missing_age_test * 100 / len(test_df), 2), '% of the test data do not have age')

46.5 % of the test data do not have age


# Preprocessing

In [None]:
# Preview the training table again before preprocessing.
train_df.head(3)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US


In [None]:
def get_data(train, test):
  """Build model-ready arrays from the raw train and test user tables.

  The two tables are stacked so that categorical columns are label-encoded
  with one shared vocabulary, joined with the module-level ``session_feature``
  frame on the user id, and split back into train / test parts.

  Neither ``train`` nor ``test`` is modified, so the call can be repeated.

  Args:
    train: training users; must contain ``country_destination``.
    test: test users with the same feature columns as ``train``.

  Returns:
    Tuple of
      train_cat   - label-encoded categorical features of the train rows
      train_dense - remaining (numeric session) features of the train rows
      train_y     - ``country_destination`` column of ``train`` (not encoded)
      test_cat    - label-encoded categorical features of the test rows
      test_dense  - remaining (numeric session) features of the test rows
      train_id    - ``id`` column of the train rows
      y_id        - ``id`` column of ``test``
      cat_fields  - per categorical column: largest code + 1 (vocabulary size)
      num_contns  - number of dense feature columns
  """
  train_y = train['country_destination']

  # Derive new frames instead of editing the inputs in place: the caller's
  # tables stay intact and no SettingWithCopyWarning is raised on slices.
  train_part = train.drop(columns=['country_destination']).assign(eval='train')
  test_part = test.assign(eval='test')

  df = pd.concat([train_part, test_part], axis=0)
  # date_account_created is formatted as YYYY-MM-DD; keep year and month only.
  date_parts = df['date_account_created'].str.split('-')
  df['year_account_created'] = date_parts.str[0]
  df['month_account_created'] = date_parts.str[1]
  unwanted_cols = ['date_first_booking', 'date_account_created', 'timestamp_first_active', 'age']
  df = df.drop(columns=unwanted_cols)
  # Left join: users without session rows keep their profile and get neutral
  # session values; first_affiliate_tracked has missing values of its own.
  df = pd.merge(df, session_feature, left_on='id', right_on='user_id', how='left').fillna(value={
      'first_affiliate_tracked': 'NaN',
      'action_detail_mode': 'NaN',
      'action_detail_count': 0,
      'secs_mean': 0,
      'secs_std': 0})
  df.info()  # info() prints by itself; wrapping it in print() only adds "None"

  cat_cols = [
        'year_account_created',
        'month_account_created',
        'gender',
        'signup_method',
        'signup_flow',
        'language',
        'affiliate_channel',
        'affiliate_provider',
        'first_affiliate_tracked',
        'signup_app',
        'first_device_type',
        'first_browser',
        'action_detail_mode']

  dense_df = df.drop(columns=cat_cols)  # still includes the id and eval columns
  # The encoder is refit on every column, so each column gets its own 0..n-1 codes.
  cat_df = df[cat_cols].apply(LabelEncoder().fit_transform)
  df = pd.concat([dense_df, cat_df], axis=1)

  print(cat_df.head(3))
  print(dense_df.head(3))

  # Train rows.
  train_X = df[df['eval'] == 'train']
  train_id = train_X['id']
  train_X = train_X.drop(columns=['id', 'eval'])
  train_cat, train_dense = train_X[cat_cols].to_numpy(), train_X.drop(columns=cat_cols).to_numpy()

  # Test rows; ids come from the `test` argument rather than a global frame.
  test_X = df[df['eval'] == 'test']
  y_id = test['id']
  test_X = test_X.drop(columns=['id', 'eval'])
  test_cat, test_dense = test_X[cat_cols].to_numpy(), test_X.drop(columns=cat_cols).to_numpy()

  cat_fields = [int(cat_df[col].max()) + 1 for col in cat_df.columns]
  num_contns = train_dense.shape[1]

  return train_cat, train_dense, train_y, test_cat, test_dense, train_id, y_id, cat_fields, num_contns

In [None]:
# Keep only users with a booked destination (drop the NDF class). `.copy()` makes
# the result an independent frame, so later column edits do not act on a slice.
train_df = train_df[train_df['country_destination'] != 'NDF'].copy()

In [None]:
# Build the feature arrays, ids, and embedding vocabulary sizes.
(train_cat, train_dense, train_y,
 test_cat, test_dense,
 train_id, y_id,
 cat_fields, num_contns) = get_data(train=train_df, test=test_df)
print(cat_fields)
print(num_contns)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151004 entries, 0 to 151003
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       151004 non-null  object 
 1   gender                   151004 non-null  object 
 2   signup_method            151004 non-null  object 
 3   signup_flow              151004 non-null  int64  
 4   language                 151004 non-null  object 
 5   affiliate_channel        151004 non-null  object 
 6   affiliate_provider       151004 non-null  object 
 7   first_affiliate_tracked  151004 non-null  object 
 8   signup_app               151004 non-null  object 
 9   first_device_type        151004 non-null  object 
 10  first_browser            151004 non-null  object 
 11  eval                     151004 non-null  object 
 12  year_account_created     151004 non-null  object 
 13  month_account_created    151004 non-null  object 
 14  acti

In [None]:
# Sanity-check the array shapes produced by get_data.
for arr in (train_cat, train_dense, train_y, test_cat):
  print(arr.shape)

(88908, 13)
(88908, 3)
(88908,)
(62096, 13)


In [None]:
# Fit the label encoder on the destination strings, then replace them by integer codes.
enc = LabelEncoder().fit(train_y)
enc_fitted = enc  # fit() returns the encoder itself; both names are used by later cells
train_y = enc.transform(train_y)

In [None]:
# Distinct encoded label values present in train_y.
set(train_y)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [None]:
# classes_ lists the class names in label order (index i <-> encoded label i).
# Going through set(train_y) relied on set iteration order to get that alignment.
target_names = enc.classes_
print(target_names)

['AU' 'CA' 'DE' 'ES' 'FR' 'GB' 'IT' 'NL' 'PT' 'US' 'other']


# Custom Dataset

In [None]:
class custom_dataset(Dataset):
  """Dataset over categorical codes, dense features and, optionally, labels.

  Args:
    X_cat: categorical codes; stored as a long tensor.
    X_dense: dense features; stored as a float tensor.
    y: class labels; stored as a long tensor. Only read when ``if_y`` is True.
    if_y: whether labels are provided and should be returned with each item.
  """

  def __init__(self, X_cat, X_dense, y= None, if_y= True):
    self.if_y = if_y
    self.X_cat = torch.tensor(X_cat, dtype=torch.long)
    self.X_dense = torch.tensor(X_dense, dtype=torch.float)
    if self.if_y:
      self.y = torch.tensor(y, dtype=torch.long)

  def __len__(self):
    """Number of samples (rows of ``X_cat``)."""
    return len(self.X_cat)

  def __getitem__(self, idx):
    """Return ``(X_cat, X_dense, y)`` for labelled data, else ``(X_cat, X_dense)``."""
    features = (self.X_cat[idx], self.X_dense[idx])
    if not self.if_y:
      return features
    return features + (self.y[idx],)

# Model, metric：NDCG

* Predict a multiclass probability distribution (n_class = n_countries + 1: the 10 destination countries plus `other`; NDF rows are removed before training)
* loss function: cross entropy

In [None]:
class DeepFM(nn.Module):
  """DeepFM-style network that outputs one raw logit per class.

  ``forward`` sums three parts:
    * first order  - a 1-dim embedding per categorical field plus a linear
                     layer over the dense features           -> [batch_size, 1]
    * second order - FM pairwise interactions between the k-dim field
                     embeddings                              -> [batch_size, 1]
    * deep         - an MLP over the flattened embeddings concatenated with
                     the dense features                      -> [batch_size, n_class]
  The two [batch_size, 1] terms broadcast over the class dimension.

  Args:
    cat_fields: vocabulary size of each categorical field, in column order.
    num_contns: number of dense (continuous) input features.
    k: embedding dimension of the second-order / deep embeddings.
    hidden_dims: widths of the MLP hidden layers; may be empty, in which case
      the deep part is a single linear layer.
    dropout: dropout probability used after every hidden layer.
    n_class: number of output classes.
    sparse: forwarded to ``nn.Embedding(sparse=...)``.
  """

  def __init__(self, cat_fields, num_contns, k, hidden_dims, dropout, n_class, sparse= True):
    super(DeepFM, self).__init__()
    self.cat_fields = cat_fields
    self.num_contns = num_contns
    self.num_cat = len(cat_fields)
    self.k = k
    self.hidden_dims = hidden_dims
    self.dropout = nn.Dropout(p=dropout)

    # First-order (linear) terms.
    self.fm_1st_dense = nn.Linear(num_contns, 1)
    self.fm_1st_cat = nn.ModuleList([nn.Embedding(voc_size, 1, sparse=sparse) for voc_size in cat_fields])

    # k-dim embeddings shared by the FM part and the deep part.
    self.embedding_layer = nn.ModuleList([nn.Embedding(voc_size, k, sparse=sparse) for voc_size in cat_fields])

    # Deep part: [Linear -> BatchNorm -> ReLU -> Dropout] per hidden width.
    layers = []
    input_dim = k * len(cat_fields) + num_contns
    for hidden_dim in hidden_dims:
      layers.append(nn.Linear(input_dim, hidden_dim))
      layers.append(nn.BatchNorm1d(hidden_dim))
      layers.append(nn.ReLU())
      layers.append(self.dropout)
      input_dim = hidden_dim

    # input_dim is the width of the last layer built above (or of the raw input
    # when hidden_dims is empty), so indexing hidden_dims[-1] is not needed.
    layers.append(nn.Linear(input_dim, n_class))
    self.dnn = nn.Sequential(*layers)

  def Dense_Embedding(self, X_cat):
    """Embed every categorical column.

    Args:
      X_cat: long tensor [batch_size, num_cat] of category codes.

    Returns:
      Tensor [batch_size, num_cat, k].
    """
    # unsqueeze keeps a field axis, so each lookup yields [batch_size, 1, k].
    per_field = [embed(X_cat[:, i].unsqueeze(dim=1)) for i, embed in enumerate(self.embedding_layer)]
    return torch.cat(per_field, dim=1)

  def FM(self, X):
    """Second-order FM term: 0.5 * sum_k((sum_i v_ik)^2 - sum_i v_ik^2).

    Args:
      X: tensor [batch_size, num_cat, k] of field embeddings.

    Returns:
      Tensor [batch_size, 1].
    """
    square_of_sum = torch.sum(X, dim=1) ** 2   # [batch_size, k]
    sum_of_square = torch.sum(X ** 2, dim=1)   # [batch_size, k]
    interactions = square_of_sum - sum_of_square
    return 0.5 * torch.sum(interactions, dim=1, keepdim=True)

  def forward(self, X_cat, X_dense):
    """Return raw class logits of shape [batch_size, n_class] (no softmax applied)."""
    # First order.
    X_cat_1st = [embed(X_cat[:, i].unsqueeze(dim=1)) for i, embed in enumerate(self.fm_1st_cat)]  # each [batch_size, 1, 1]
    X_cat_1st = torch.cat(X_cat_1st, dim=1)                 # [batch_size, num_cat, 1]
    X_cat_1st = torch.sum(X_cat_1st, dim=1, keepdim=False)  # [batch_size, 1]
    X_dense_1st = self.fm_1st_dense(X_dense)                # [batch_size, 1]
    y_1st = X_cat_1st + X_dense_1st

    # Second order.
    X_cat2dense = self.Dense_Embedding(X_cat)  # [batch_size, num_cat, k]
    FM_y = self.FM(X_cat2dense)                # [batch_size, 1]

    # Deep part.
    X_cat_flatten = torch.flatten(X_cat2dense, start_dim=1, end_dim=2)  # [batch_size, num_cat*k]
    X_flatten = torch.cat([X_cat_flatten, X_dense], dim=1)              # [batch_size, num_cat*k + num_contns]
    DNN_y = self.dnn(X_flatten)                                         # [batch_size, n_class]

    # The [batch_size, 1] terms broadcast over the class dimension.
    return y_1st + FM_y + DNN_y

# Training

## 參數

In [None]:
'''
1. session的field number
2. user資訊的field number：train_df_drop為去掉不必要資訊後(包括id, destination)的df
3. series.nunique()算出該field的unique數
'''

'\n1. session的field number\n2. user資訊的field number：train_df_drop為去掉不必要資訊後(包括id, destination)的df\n3. series.nunique()算出該field的unique數\n'

In [None]:
# Training hyperparameters.
batch_size = 256              # samples per DataLoader batch
lr = 1e-3                     # Adam learning rate
n_epoch = 10                  # passes over the training split
k = 5                         # embedding dimension of each categorical field
p = 0.2                       # dropout probability in the deep part
hidden_dims = [64, 64, 64]    # hidden layer widths of the deep part
n_class = len(set(train_y))   # number of distinct encoded targets
sparse = False                # forwarded to nn.Embedding(sparse=...)
threshold = 0.5               # NOTE(review): not referenced by any visible cell

In [None]:
# Loss weights per class, in label order. The two most frequent classes are
# down-weighted and every other class keeps weight 1. Keying by class name keeps
# the list aligned with target_names and always one entry per class; the earlier
# inverse-frequency and 12-entry variants were overwritten before use and are removed.
class_weight_overrides = {'US': 1/12, 'other': 1/2}
weight = [class_weight_overrides.get(name, 1) for name in target_names]
print(target_names)

['AU' 'CA' 'DE' 'ES' 'FR' 'GB' 'IT' 'NL' 'PT' 'US' 'other']


In [None]:
# Counter(train_y).items()

In [None]:
# Labelled data, split 80 / 20 into train and validation with a fixed seed.
data_dataset = custom_dataset(train_cat, train_dense, train_y)
train_size = int(0.8 * len(data_dataset))
val_size = len(data_dataset) - train_size

split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(data_dataset, [train_size, val_size], generator=split_generator)
test_dataset = custom_dataset(test_cat, test_dense, if_y=False)

# Only the training loader shuffles; validation and test keep dataset order.
loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
model = DeepFM(cat_fields=cat_fields, num_contns=num_contns, k=k, hidden_dims=hidden_dims,
               dropout=p, n_class=n_class, sparse=sparse).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The class-weight tensor must live on the same device as the logits, and must be
# floating point even if every entry of `weight` happens to be an integer.
criterion = nn.CrossEntropyLoss(weight=torch.tensor(weight, dtype=torch.float, device=device))
# criterion = nn.CrossEntropyLoss()

## Training process

In [None]:
# The loss module holds the class-weight tensor as a buffer; keep it on the same
# device as the model outputs.
criterion.to(device)
train_size = len(train_dataset)
val_size = len(val_dataset)
# Pin the label set so the report always has one row per entry of target_names,
# even when a class is missing from a split.
report_labels = list(range(len(target_names)))

for epoch in range(n_epoch):

  model.train()
  train_loss = []
  train_score = 0
  val_score = 0
  # Reset every epoch, so after the loop these hold the last epoch only.
  train_preds, train_targets = [], []
  val_preds, val_targets = [], []

  for X_cat, X_dense, y in train_loader:
    # The model lives on `device`; the batches must be moved there as well.
    X_cat, X_dense, y = X_cat.to(device), X_dense.to(device), y.to(device)

    optimizer.zero_grad()
    output = model(X_cat, X_dense)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

    pred = output.argmax(dim=1)
    train_preds += pred.tolist()
    train_targets += y.tolist()
    train_score += (pred == y).sum().item()
    train_loss.append(loss.item())

  # Validation pass: no gradient tracking, BatchNorm / Dropout in eval mode.
  model.eval()
  with torch.no_grad():
    for X_cat, X_dense, y in val_loader:
      X_cat, X_dense, y = X_cat.to(device), X_dense.to(device), y.to(device)
      output = model(X_cat, X_dense)
      pred = output.argmax(dim=1)
      val_preds += pred.tolist()
      val_targets += y.tolist()
      val_score += (pred == y).sum().item()

  print(f'train loss: {round(np.mean(train_loss), 4)}| train accuracy: {round(train_score/train_size, 2)}')
  print(f'val accuracy: {round(val_score/val_size, 2)}')


# classification_report expects (y_true, y_pred); with the arguments swapped the
# "support" column counts predictions and precision / recall trade places.
train_report = classification_report(train_targets, train_preds, labels=report_labels,
                                     target_names=target_names, zero_division=0)
val_report = classification_report(val_targets, val_preds, labels=report_labels,
                                   target_names=target_names, zero_division=0)
print(f'train loss: {round(np.mean(train_loss), 4)}| train accuracy:\n {train_report}')
print(f'val accuracy:\n {val_report}')

train loss: 2.1581| train accuracy: 0.3
val accuracy: 0.37
train loss: 2.1122| train accuracy: 0.3
val accuracy: 0.32
train loss: 2.1041| train accuracy: 0.3
val accuracy: 0.33
train loss: 2.1006| train accuracy: 0.3
val accuracy: 0.28
train loss: 2.0942| train accuracy: 0.28
val accuracy: 0.29
train loss: 2.0922| train accuracy: 0.3
val accuracy: 0.3
train loss: 2.0873| train accuracy: 0.29
val accuracy: 0.33
train loss: 2.085| train accuracy: 0.3
val accuracy: 0.31
train loss: 2.0844| train accuracy: 0.3
val accuracy: 0.25
train loss: 2.0814| train accuracy: 0.29
val accuracy: 0.31
train loss: 2.0814| train accuracy:
               precision    recall  f1-score   support

          AU       0.00      0.00      0.00         0
          CA       0.00      0.00      0.00         0
          DE       0.00      0.00      0.00         0
          ES       0.00      0.00      0.00         3
          FR       0.48      0.07      0.13     26462
          GB       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))


# Test

In [None]:
# test_X_id

In [None]:
# Top-5 predicted class indices for every test user, best first.
model.eval()
preds = []
with torch.no_grad():
  for X_cat, X_dense in test_loader:
    output = model(X_cat.to(device), X_dense.to(device))
    top5 = torch.argsort(output, dim=1, descending=True)[:, :5]
    # Bring the result back to the CPU first: .numpy() raises on a CUDA tensor.
    preds += top5.cpu().tolist()

In [None]:
# A bare comparison is evaluated and thrown away; assert so a mismatch stops the run.
assert len(preds) == len(y_id), 'number of predictions must match number of test ids'
# Map the encoded class indices back to country codes.
preds = [enc_fitted.inverse_transform(pred) for pred in preds]

In [None]:
# One row per test id, carrying its ranked list of predicted countries.
sub_df = pd.DataFrame(y_id).assign(country=preds)

In [None]:
# Inspect the first 50 rows of the submission frame.
sub_df.head(50)

Unnamed: 0,id,country
0,5uwns89zht,"[US, other, FR, IT, GB]"
1,jtl0dijy2j,"[other, US, FR, IT, GB]"
2,xx0ulgorjt,"[other, US, FR, IT, ES]"
3,6c6puo6ix0,"[other, US, FR, IT, GB]"
4,czqhjk3yfe,"[other, FR, US, IT, GB]"
5,szx28ujmhf,"[US, other, FR, IT, GB]"
6,guenkfjcbq,"[other, US, FR, IT, GB]"
7,tkpq0mlugk,"[other, US, FR, IT, GB]"
8,3xtgd5p9dn,"[other, US, FR, IT, GB]"
9,md9aj22l5a,"[other, FR, US, IT, GB]"


In [None]:
# Expand each ranked list into one row per (id, country), then write the CSV.
ans = sub_df.explode(column='country')
ans = ans.set_index('id')
ans.to_csv('submission.csv')