In [82]:
import pandas as pd
import operator
import torch
from torch.utils.data import Dataset
import numpy as np

def extract_id(row):
    """Return the part of ``row`` before the first ':' as a ``np.int32``.

    ``row`` is a string token of the form ``"<id>:<value>"``.
    """
    id_text, _sep, _rest = row.partition(":")
    return np.int32(id_text)

def extract_value(row):
    """Return the second ':'-separated field of ``row`` as a ``np.float32``.

    ``row`` is a string token of the form ``"<id>:<value>"``; an
    ``IndexError`` is raised when it contains no ':'.
    """
    fields = row.split(":")
    value_text = fields[1]
    return np.float32(value_text)

class CTR_Dataset(Dataset):
    """Torch dataset yielding ``(feature ids, feature values, label)`` triples.

    ``id_df`` must contain a ``rating`` column, which becomes the label;
    all of its other columns are stored as integer feature ids.
    ``value_df`` is converted to a float tensor as-is (presumably
    row-aligned with ``id_df`` -- TODO confirm against the caller).
    """

    def __init__(self, id_df, value_df):
        feature_ids = id_df.drop(columns=['rating'])
        labels = id_df['rating']
        self.x_id = torch.LongTensor(np.array(feature_ids))
        self.y = torch.Tensor(np.array(labels))
        self.x_value = torch.Tensor(np.array(value_df))

    def __getitem__(self, idx):
        """Return the ids, values and label stored at position ``idx``."""
        ids = self.x_id[idx]
        values = self.x_value[idx]
        label = self.y[idx]
        return ids, values, label

    def __len__(self):
        """Number of samples (rows of the id tensor)."""
        return len(self.x_id)

In [83]:
# Raw input file; the preview printed below shows its columns.
path = "train.csv"
print("Extracting feature id and feature value...")

# Read the whole CSV into memory, then show the first ten rows.
df = pd.read_csv(filepath_or_buffer=path)
print(df.head(n=10))


Extracting feature id and feature value...
             id  click      hour    C1  banner_pos   site_id site_domain  \
0  1.000009e+18      0  14102100  1005           0  1fbe01fe    f3845767   
1  1.000017e+19      0  14102100  1005           0  1fbe01fe    f3845767   
2  1.000037e+19      0  14102100  1005           0  1fbe01fe    f3845767   
3  1.000064e+19      0  14102100  1005           0  1fbe01fe    f3845767   
4  1.000068e+19      0  14102100  1005           1  fe8cc448    9166c161   
5  1.000072e+19      0  14102100  1005           0  d6137915    bb1ef334   
6  1.000072e+19      0  14102100  1005           0  8fda644b    25d4cfcd   
7  1.000092e+19      0  14102100  1005           1  e151e245    7e091613   
8  1.000095e+19      1  14102100  1005           0  1fbe01fe    f3845767   
9  1.000126e+19      0  14102100  1002           0  84c7ba46    c4e18dd6   

  site_category    app_id app_domain  ... device_type device_conn_type    C14  \
0      28905ebd  ecad2386   7801e8d9  .

In [84]:
# Move the label column (`click`) to the front of the frame.
mid = df.pop('click')
df.insert(loc=0, column='click', value=mid)
print(df.head(n=10))

# Replace the original header with positional names column_1 .. column_24:
# column_1 is the label, column_2 .. column_24 are the feature columns.
headers = ["column_" + str(position) for position in range(1, 25)]
df.columns = headers
print(df.head(n=10))

   click            id      hour    C1  banner_pos   site_id site_domain  \
0      0  1.000009e+18  14102100  1005           0  1fbe01fe    f3845767   
1      0  1.000017e+19  14102100  1005           0  1fbe01fe    f3845767   
2      0  1.000037e+19  14102100  1005           0  1fbe01fe    f3845767   
3      0  1.000064e+19  14102100  1005           0  1fbe01fe    f3845767   
4      0  1.000068e+19  14102100  1005           1  fe8cc448    9166c161   
5      0  1.000072e+19  14102100  1005           0  d6137915    bb1ef334   
6      0  1.000072e+19  14102100  1005           0  8fda644b    25d4cfcd   
7      0  1.000092e+19  14102100  1005           1  e151e245    7e091613   
8      1  1.000095e+19  14102100  1005           0  1fbe01fe    f3845767   
9      0  1.000126e+19  14102100  1002           0  84c7ba46    c4e18dd6   

  site_category    app_id app_domain  ... device_type device_conn_type    C14  \
0      28905ebd  ecad2386   7801e8d9  ...           1                2  15706   
1

In [66]:
#df = df.groupby(f"column_{2}").filter(lambda x: len(x) >= 5)

In [85]:
# Overwrite column_2 (the original `id` column after the rename) with a
# constant 0, so every row carries the same value in that column.
df['column_2'] = 0
print(df.head(10))

   column_1  column_2  column_3  column_4  column_5  column_6  column_7  \
0         0         0  14102100      1005         0  1fbe01fe  f3845767   
1         0         0  14102100      1005         0  1fbe01fe  f3845767   
2         0         0  14102100      1005         0  1fbe01fe  f3845767   
3         0         0  14102100      1005         0  1fbe01fe  f3845767   
4         0         0  14102100      1005         1  fe8cc448  9166c161   
5         0         0  14102100      1005         0  d6137915  bb1ef334   
6         0         0  14102100      1005         0  8fda644b  25d4cfcd   
7         0         0  14102100      1005         1  e151e245  7e091613   
8         1         0  14102100      1005         0  1fbe01fe  f3845767   
9         0         0  14102100      1002         0  84c7ba46  c4e18dd6   

   column_8  column_9 column_10  ... column_15 column_16 column_17 column_18  \
0  28905ebd  ecad2386  7801e8d9  ...         1         2     15706       320   
1  28905ebd  e

In [86]:
# Encode column_2 as integer category codes (one code per distinct value).
df['column_2'] = df['column_2'].astype('category').cat.codes

In [87]:
# Encode column_3 as integer category codes (one code per distinct value).
df['column_3'] = df['column_3'].astype('category').cat.codes

In [88]:
# Integer-encode the remaining feature columns, column_4 .. column_24.
for i in range(4, 25):
    col = 'column_%d' % i
    df[col] = df[col].astype('category').cat.codes

In [89]:
# Inspect the encoded column_2.
print(df['column_2'])

0           0
1           0
2           0
3           0
4           0
           ..
40428962    0
40428963    0
40428964    0
40428965    0
40428966    0
Name: column_2, Length: 40428967, dtype: int8


In [91]:
# Largest id used by column_2; the next column's ids start right after it.
print(df['column_2'].max())

0


In [92]:
# Shift column_3's codes past the ids already used by column_2 so the two
# columns occupy disjoint id ranges.
# `.cat.codes` uses the narrowest integer dtype that fits the number of
# categories (int8, int16, ...), so adding the offset in that dtype can
# silently wrap around.  Widen to int64 and use a plain Python int offset.
df['column_3'] = df['column_3'].astype('int64') + (int(df['column_2'].max()) + 1)

In [93]:
# Sanity check: the smallest column_3 id after the shift.
print(df['column_3'].min())

1


In [94]:
# Give every feature column its own contiguous id range: column_i starts
# right after the largest id of column_{i-1}, which has already been shifted.
# The codes come from `.cat.codes` and are stored in a narrow dtype
# (int8/int16/int32 depending on cardinality).  Adding the running offset in
# that dtype can silently overflow, so widen to int64 and add a Python int.
for i in range(4, 25):
    offset = int(df[f'column_{i-1}'].max()) + 1
    df[f'column_{i}'] = df[f'column_{i}'].astype('int64') + offset

In [95]:
# Sanity check: the smallest id of the last feature column.
print(df['column_24'].min())

9449386


In [96]:
# Sanity check: the largest id of the second-to-last feature column.
print(df['column_23'].max())

9449385


In [97]:
# Ids start at 0 and column_24 holds the highest range, so the total number
# of distinct feature ids is its largest id plus one.
num_features = 1 + df['column_24'].max()
print(num_features)

9449446


In [98]:
print("Remapping feature ids according to the frequency...")
# Row counts per feature id for column_2 and column_3, concatenated in id
# order (groupby sorts by key), so list position k holds the count of id k.
freq_list = [
    count
    for name in ('column_2', 'column_3')
    for count in df.groupby(name).size().to_list()
]



Remapping feature ids according to the frequency...


In [99]:
# Append the per-id row counts of column_4 .. column_24, keeping list
# position k equal to the count of feature id k.
# `extend` grows the list in place; the previous `freq_list = freq_list + ...`
# copied the whole (multi-million element) list on every iteration.
for i in range(4, 25):
    freq_list.extend(df.groupby(f'column_{i}').size().to_list())

In [100]:
# Rank the feature ids by frequency, most frequent first.  The ascending sort
# is stable and is then reversed, so ties end up in descending id order.
freq_list_sorted = sorted(range(len(freq_list)), key=freq_list.__getitem__)
freq_list_sorted.reverse()
# Old feature id -> new id, where the new id is the frequency rank.
freq_map = dict(zip(freq_list_sorted, range(len(freq_list_sorted))))

In [101]:
# Translate the old ids into their frequency-rank ids.
# column_2 is part of freq_list and therefore of freq_map, so it has to be
# remapped too.  Leaving it untouched is only correct while its single old
# id (0) happens to receive rank 0.
df['column_2'] = df['column_2'].map(freq_map)
df['column_3'] = df['column_3'].map(freq_map)

In [102]:
# Translate the old ids of column_4 .. column_24 into frequency-rank ids.
for i in range(4, 25):
    col = 'column_' + str(i)
    df[col] = df[col].map(freq_map)

In [103]:
from sklearn.model_selection import train_test_split
import pickle
import os



def train_val_test_split(data_df, test_ratio=0.1, val_ratio=0.2, seed=12):
    """Randomly split ``data_df`` into train, validation and test parts.

    ``test_ratio`` and ``val_ratio`` are both fractions of the full data
    set.  ``seed`` is passed as ``random_state`` to both underlying splits.

    Returns the tuple ``(train, val, test)``.
    """
    remainder, test = train_test_split(data_df, test_size=test_ratio, random_state=seed)
    # The validation part is cut from what is left after removing the test
    # part, so its share is rescaled relative to that remainder.
    val_share = val_ratio / (1 - test_ratio)
    train, val = train_test_split(remainder, test_size=val_share, random_state=seed)
    return train, val, test

print("Splitting...")
# 10 % test and 20 % validation (both of the full frame); the rest is train.
train_df, val_df, test_df = train_val_test_split(
    df,
    test_ratio=0.1,
    val_ratio=0.2,
    seed=12,
)

Splitting...


In [104]:
class Avazu_Dataset(Dataset):
    """Torch dataset over a frame whose ``column_1`` is the label.

    All other columns are stored as integer feature ids.  Each item is
    ``(ids, values, label)`` where ``values`` is a float32 tensor of ones
    with the same shape as ``ids``.
    """

    def __init__(self, data_df):
        features = data_df.drop(columns=['column_1'])
        labels = data_df['column_1']
        self.x = torch.LongTensor(np.array(features))
        self.y = torch.Tensor(np.array(labels))

    def __getitem__(self, idx):
        """Return the ids, an all-ones value tensor and the label at ``idx``."""
        ids = self.x[idx]
        values = torch.ones_like(ids, dtype=torch.float32)
        return ids, values, self.y[idx]

    def __len__(self):
        """Number of samples (rows of the id tensor)."""
        return len(self.x)


In [105]:
print("The dataset has been preprocessed.")
# Bundle the three splits (train, validation, test) with the feature count.
# pickle stores Avazu_Dataset by reference, so whoever loads this file needs
# a class of that name to be importable/defined.
result = [Avazu_Dataset(part) for part in (train_df, val_df, test_df)]
result.append(num_features)
with open("dataset.pickle", mode='wb') as handle:
    pickle.dump(result, handle, protocol=4)
    print("File saved.")



The dataset has been preprocessed.
File saved.
