In [4]:
import pandas as pd
import numpy as np

train = pd.read_csv('data/tmp_train_data.csv', index_col=0)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 854126 to 1627
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   brand chain       800000 non-null  int64  
 1   category dept     800000 non-null  int64  
 2   category          800000 non-null  int64  
 3   company           800000 non-null  int64  
 4   brand             800000 non-null  int64  
 5   date              800000 non-null  object 
 6   productsize       800000 non-null  float64
 7   productmeasure    789349 non-null  object 
 8   purchasequantity  800000 non-null  int64  
 9   purchaseamount    800000 non-null  float64
 10  repeater          800000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 73.2+ MB


In [6]:
train.head()

Unnamed: 0,brand chain,category dept,category,company,brand,date,productsize,productmeasure,purchasequantity,purchaseamount,repeater
854126,17,27,2702,101780010,14717,2012-12-11,4.0,LB,0,0.0,0
431388,18,63,6320,102113020,10786,2013-04-14,18.0,CT,1,2.59,1
911409,4,21,2117,102800020,56490,2012-11-07,54.0,OZ,1,1.51,0
828030,95,63,6315,104138343,10091,2012-03-09,64.0,OZ,1,3.69,0
266514,15,8,811,103620030,1698,2012-11-30,24.0,OZ,3,6.58,0


In [13]:
train = train.loc[:,  ~train.columns.isin(['date', 'productmeasure'])]

In [16]:
train.head()
X = train.loc[:, ~train.columns.isin(['repeater'])]
y = train.repeater

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [49]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(640000, 8) (640000,)
(160000, 8) (160000,)


In [50]:
import torch

X_train = torch.FloatTensor(X_train.values)
X_test = torch.FloatTensor(X_test.values)
y_train = torch.LongTensor(y_train.values)
y_test = torch.LongTensor(y_test.values)

In [51]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

torch.Size([640000, 8]) torch.Size([640000])
torch.Size([160000, 8]) torch.Size([160000])


In [65]:
import torch
import pandas as pd
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    def __init__(self, file_path):
        df = pd.read_csv(file_path, index_col=0)
        df = df.loc[:,  ~df.columns.isin(['date', 'productmeasure'])]
        self.x1 = df.iloc[:, 0].values
        self.x2 = df.iloc[:, 1].values
        self.x3 = df.iloc[:, 2].values
        self.x4 = df.iloc[:, 3].values
        self.x5 = df.iloc[:, 4].values
        self.x6 = df.iloc[:, 5].values
        self.x7 = df.iloc[:, 6].values
        self.x8 = df.iloc[:, 7].values

        self.y = df.iloc[:, 8].values
        self.length = len(df)

    def __getitem__(self, index):
        x = torch.FloatTensor([self.x1[index], self.x2[index], self.x3[index], self.x4[index], self.x5[index], self.x6[index], self.x7[index]])
        y = torch.FloatTensor([self.y[index]])
        return x, y

    def __len__(self):
        return self.length


class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(7, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.layer(x)
        return x


train_dataset = CustomDataset("./data/tmp_train_data.csv")
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CustomModel().to(device)
criterion = nn.BCELoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch : {epoch+1:4d}, Model : {list(model.parameters())}, Cost : {cost:.3f}")


# with torch.no_grad():
#     model.eval()
#     inputs = torch.FloatTensor(
#         [[89, 92, 75], [75, 64, 50], [38, 58, 63], [33, 42, 39], [23, 15, 32]]
#     ).to(device)
#     outputs = model(inputs)

#     print("---------")
#     print(outputs)
#     print(outputs >= torch.FloatTensor([0.5]).to(device))

In [68]:
for epoch in range(10):
    cost = 0.0

    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cost += loss

    cost = cost / len(train_dataloader)

    print(f"Epoch : {epoch+1:4d}, Model : {list(model.parameters())}, Cost : {cost:.3f}")

Epoch :    1, Model : [Parameter containing:
tensor([[-0.0909, -0.1467, -0.0008, -0.2041, -0.2779,  0.0452,  0.0464]],
       requires_grad=True), Parameter containing:
tensor([-0.3430], requires_grad=True)], Cost : 14.808
Epoch :    2, Model : [Parameter containing:
tensor([[-0.0909, -0.1467, -0.0008, -0.2041, -0.2779,  0.0452,  0.0464]],
       requires_grad=True), Parameter containing:
tensor([-0.3430], requires_grad=True)], Cost : 14.808
Epoch :    3, Model : [Parameter containing:
tensor([[-0.0909, -0.1467, -0.0008, -0.2041, -0.2779,  0.0452,  0.0464]],
       requires_grad=True), Parameter containing:
tensor([-0.3430], requires_grad=True)], Cost : 14.808
Epoch :    4, Model : [Parameter containing:
tensor([[-0.0909, -0.1467, -0.0008, -0.2041, -0.2779,  0.0452,  0.0464]],
       requires_grad=True), Parameter containing:
tensor([-0.3430], requires_grad=True)], Cost : 14.808
Epoch :    5, Model : [Parameter containing:
tensor([[-0.0909, -0.1467, -0.0008, -0.2041, -0.2779,  0.0452, 

In [10]:
if 'brand' in 'brand   123':
    print('TRue')
else:
    print('false')

TRue


In [11]:
(server_round, {None: str(None)})

NameError: name 'server_round' is not defined

In [59]:
inputs = torch.FloatTensor(
        [[89, 92, 75], [75, 64, 50], [38, 58, 63], [33, 42, 39], [23, 15, 32]]
    )
inputs

tensor([[89., 92., 75.],
        [75., 64., 50.],
        [38., 58., 63.],
        [33., 42., 39.],
        [23., 15., 32.]])

In [63]:
iter(test_dataloader)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x1c474047e80>

In [70]:
test_data = pd.read_csv("./data/tmp_test_data.csv", index_col=0)
test_data = test_data.drop(['repeater', 'date', 'productmeasure'], axis=1)
test_data.head()

Unnamed: 0,brand chain,category dept,category,company,brand,productsize,purchasequantity,purchaseamount
312419,15,6,610,1061126969,15041,8.4,1,2.19
41973,4,33,3303,102840020,10241,10.0,2,7.78
564105,4,7,708,107232373,5104,12.0,1,5.99
713300,18,29,2906,107066272,12673,4.0,1,0.99
634232,95,36,3631,104900040,1414,144.0,1,6.99


In [72]:
test_data

Unnamed: 0,brand chain,category dept,category,company,brand,productsize,purchasequantity,purchaseamount
312419,15,6,610,1061126969,15041,8.40,1,2.19
41973,4,33,3303,102840020,10241,10.00,2,7.78
564105,4,7,708,107232373,5104,12.00,1,5.99
713300,18,29,2906,107066272,12673,4.00,1,0.99
634232,95,36,3631,104900040,1414,144.00,1,6.99
...,...,...,...,...,...,...,...,...
236542,18,33,3308,102840020,3336,9.75,1,0.00
990761,18,18,1835,104460040,9709,16.60,1,11.30
477136,20,0,0,10000,0,0.00,1,2.99
972923,20,9,902,103010030,47346,16.00,1,0.00


In [76]:
ids = pd.read_csv("./data/testHistory.csv").id 
df = pd.DataFrame({"id": ids,})

In [77]:
df

Unnamed: 0,id
0,12262064
1,12277270
2,12332190
3,12524696
4,13074629
...,...
151479,4843417324
151480,4847628950
151481,4847787712
151482,4853460972


In [1]:
import pandas as pd
ids = pd.read_csv("./data/testHistory.csv").id 
df = pd.DataFrame(data={"id": ids, "repeatProbability": ''})

In [2]:
df

Unnamed: 0,id,repeatProbability
0,12262064,
1,12277270,
2,12332190,
3,12524696,
4,13074629,
...,...,...
151479,4843417324,
151480,4847628950,
151481,4847787712,
151482,4853460972,


In [4]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('./data/tmp_train_data.csv', index_col=0)
# Get train/test indices
train_index, test_index = train_test_split(df.index.values, test_size=0.1, shuffle=False)
print(train_index, test_index)

[854126 431388 911409 ... 319131 430685 148216] [304446 530899 246692 ... 663869 119828   1627]


In [5]:
len(train_index)

720000

In [6]:
len(test_index)

80000