In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [2]:
# Seed NumPy's and PyTorch's global random number generators with one shared
# constant; SEED is also passed to train_test_split further down.
SEED = 28
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fba4e08b5d0>

In [3]:
train_data = pd.read_csv('train.csv')

In [4]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
0,0,40.10891,-83.09286,8336,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0
1,1,39.86542,-84.0628,18403,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1
2,2,39.10266,-84.52468,14022,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0
3,3,39.10148,-84.52341,11051,0,0,209049.99746,0.0,95.340075,103267.727546,1
4,4,41.06213,-81.53784,3243,0,3,8669.269507,0.0,399.421926,177532.206618,1


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160001 entries, 0 to 160000
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  160001 non-null  int64  
 1   latitude    160001 non-null  float64
 2   longitude   160001 non-null  float64
 3   company     160001 non-null  int64  
 4   is_local    160001 non-null  int64  
 5   type        160001 non-null  int64  
 6   fin_1       160001 non-null  float64
 7   fin_2       160001 non-null  float64
 8   fin_3       160001 non-null  float64
 9   fin_4       160001 non-null  float64
 10  target      160001 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 13.4 MB


In [6]:
train_data.describe()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
count,160001.0,160001.0,160001.0,160001.0,160001.0,160001.0,160001.0,160001.0,160001.0,160001.0,160001.0
mean,80000.0,35.954746,-93.058963,9582.472441,0.157868,2.313473,19753.432114,23521.645197,532.059032,98648.62959,0.595109
std,46188.454548,7.561538,23.227451,5648.91769,0.364618,1.408469,109273.198383,30622.532195,502.762427,71474.973849,0.490873
min,0.0,1.0,-124.422706,1.0,0.0,0.0,-551041.674432,0.0,1.0,0.0,0.0
25%,40000.0,33.65494,-117.2615,4798.0,0.0,1.0,-51898.897414,0.0,225.118134,34551.47416,0.0
50%,80000.0,37.33353,-88.218217,10064.0,0.0,3.0,16241.785927,8382.814808,391.468199,84909.879747,1.0
75%,120000.0,40.745879,-80.76744,14049.0,0.0,3.0,92478.316992,40686.069965,671.4802,156391.223512,1.0
max,160000.0,49.00058,1.0,19997.0,1.0,4.0,557212.884648,205666.441401,21637.539085,249974.729832,1.0


In [7]:
train_data.isnull().sum()

Unnamed: 0    0
latitude      0
longitude     0
company       0
is_local      0
type          0
fin_1         0
fin_2         0
fin_3         0
fin_4         0
target        0
dtype: int64

In [8]:
# Rows per company id: count the non-null values in each group and keep the
# first remaining column, giving a Series indexed by company id.
companies = train_data.groupby('company').count().iloc[:, 0]

In [9]:
companies

company
1        4302
5         159
6          21
7          83
11         33
         ... 
19971      10
19974      45
19979      22
19985      11
19997       6
Name: Unnamed: 0, Length: 3686, dtype: int64

In [10]:
companies[companies >= 10]

company
1        4302
5         159
6          21
7          83
11         33
         ... 
19960      14
19971      10
19974      45
19979      22
19985      11
Name: Unnamed: 0, Length: 1483, dtype: int64

In [11]:
# Companies with more than 10 rows keep their own label. The extra -1 entry is
# the shared placeholder that companies with 10 rows or fewer are mapped to
# before label encoding.
companies_list_to_encode = list(companies[companies > 10].index) + [-1]

In [12]:
len(companies_list_to_encode)

1385

In [13]:
le = LabelEncoder()

In [14]:
le.fit(companies_list_to_encode)

LabelEncoder()

In [15]:
companies_to_replace = list(companies[companies <= 10].index)

In [16]:
# Collapse every rare company id into the shared -1 bucket.
# The result is assigned back to the frame on purpose: calling
# `replace(..., inplace=True)` on `train_data.company` mutates an intermediate
# Series, which pandas deprecates and which leaves the frame untouched when
# Copy-on-Write is enabled.
train_data['company'] = train_data['company'].mask(
    train_data['company'].isin(companies_to_replace), -1)

In [17]:
train_data.company.nunique()

1385

In [18]:
train_data.company = le.transform(train_data.company)

In [19]:
train_data.company.nunique()

1385

In [20]:
clustering_data = train_data[['latitude', 'longitude']]

In [21]:
clustering_data.head()

Unnamed: 0,latitude,longitude
0,40.10891,-83.09286
1,39.86542,-84.0628
2,39.10266,-84.52468
3,39.10148,-84.52341
4,41.06213,-81.53784


In [22]:
# Pin the clustering seed and the number of restarts explicitly. Cluster ids
# are used as embedding indices, so they must come out the same on every run,
# independent of how much of the global NumPy RNG was consumed beforehand and
# of the scikit-learn version's default for n_init.
cluster_picker = KMeans(n_clusters=300, random_state=SEED, n_init=10)

In [23]:
clusters = cluster_picker.fit_predict(clustering_data)

In [24]:
clusters

array([ 11, 273,  70, ..., 180, 180,  97], dtype=int32)

In [25]:
train_data.drop(['latitude', 'longitude'], axis=1, inplace=True)

In [26]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
0,0,561,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0
1,1,1273,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1
2,2,973,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0
3,3,757,0,0,209049.99746,0.0,95.340075,103267.727546,1
4,4,229,0,3,8669.269507,0.0,399.421926,177532.206618,1


In [27]:
train_data['geo_cluster'] = clusters

In [28]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target,geo_cluster
0,0,561,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0,11
1,1,1273,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1,273
2,2,973,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0,70
3,3,757,0,0,209049.99746,0.0,95.340075,103267.727546,1,70
4,4,229,0,3,8669.269507,0.0,399.421926,177532.206618,1,267


In [29]:
train_data.type.nunique()

5

In [30]:
train_data.drop(columns=['Unnamed: 0'], inplace=True)

In [31]:
train_data.head()

Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target,geo_cluster
0,561,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0,11
1,1273,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1,273
2,973,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0,70
3,757,0,0,209049.99746,0.0,95.340075,103267.727546,1,70
4,229,0,3,8669.269507,0.0,399.421926,177532.206618,1,267


In [32]:
n_companies = len(companies_list_to_encode)
n_types = train_data.type.nunique()
features = ['fin_1', 'fin_2', 'fin_3', 'fin_4', 'is_local']

In [33]:
X, y = train_data.drop('target', axis=1), train_data.target

In [34]:
X.head()

Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,geo_cluster
0,561,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,11
1,1273,1,0,-1766.845055,14985.64018,477.494992,168836.215743,273
2,973,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,70
3,757,0,0,209049.99746,0.0,95.340075,103267.727546,70
4,229,0,3,8669.269507,0.0,399.421926,177532.206618,267


In [35]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.1, random_state=SEED)

In [36]:
test_data = pd.read_csv('test.csv')

In [37]:
companies_to_replace = set(test_data.company) - set(companies_list_to_encode)

In [38]:
# Map every company id that the encoder was not fitted on to the -1 bucket.
# `isin` accepts the set built in the previous cell directly, and assigning
# the result back avoids the chained `replace(..., inplace=True)` call, which
# mutates an intermediate Series and is a no-op under pandas Copy-on-Write.
test_data['company'] = test_data['company'].mask(
    test_data['company'].isin(companies_to_replace), -1)

In [39]:
test_data.company = le.transform(test_data.company)

In [40]:
clustering_data = test_data[['latitude', 'longitude']]

In [41]:
clusters = cluster_picker.predict(clustering_data)

In [42]:
test_data.drop(['latitude', 'longitude'], axis=1, inplace=True)

In [43]:
test_data['geo_cluster'] = clusters

In [44]:
test_data.drop(columns=['Unnamed: 0'], inplace=True)

In [45]:
test_data.type.nunique()

5

In [46]:
test_data.head()

Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target,geo_cluster
0,0,1,1,113033.389907,0.0,270.906219,31222.780176,0,97
1,767,0,0,-87239.590275,73759.38751,759.194862,237587.544996,1,77
2,0,1,4,-122084.49862,15528.109943,1039.655934,29612.346982,0,103
3,0,0,1,87355.127256,0.0,214.594205,163526.475818,1,83
4,795,0,3,33014.437946,0.0,379.819724,67499.397999,1,294


In [47]:
test_X, test_y = test_data.drop(columns=['target']), test_data.target

In [48]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature row with its target value.

    Args:
        x: 2-D table of features (a pandas DataFrame or anything
            ``np.asarray`` can convert); one sample per row.
        y: 1-D collection of targets with the same number of entries as ``x``
            has rows.

    Raises:
        ValueError: if ``x`` and ``y`` do not contain the same number of samples.

    ``__getitem__`` returns ``(row_values, target)`` where ``row_values`` is a
    NumPy array holding one row of ``x``. Access is positional, so the pandas
    index labels of ``x``/``y`` are ignored.
    """

    def __init__(self, x, y):
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same number of samples, '
                f'got {len(x)} and {len(y)}')
        self.x = x
        self.y = y
        self.n_samples = x.shape[0]
        # Convert to NumPy once up front. Building a pandas Series per sample
        # via `.iloc[index]` costs far more than plain array indexing and
        # would otherwise be paid on every single __getitem__ call.
        self._x_values = np.asarray(x)
        self._y_values = np.asarray(y)

    def __getitem__(self, index):
        return self._x_values[index], self._y_values[index]

    def __len__(self):
        return self.n_samples

In [49]:
def init_weights(m):
    """Kaiming-uniform initialise an ``nn.Linear`` layer; ignore anything else.

    Intended for ``module.apply(init_weights)``. ``apply`` visits every
    sub-module, including containers, activations, batch-norm and embedding
    layers, which either have no ``weight``/``bias`` or have a weight that is
    not a 2-D matrix, so only ``nn.Linear`` instances are touched.

    Args:
        m: any ``nn.Module``.
    """
    if not isinstance(m, nn.Linear):
        return
    # `kaiming_uniform_` is the in-place initialiser; the name without the
    # trailing underscore is deprecated.
    torch.nn.init.kaiming_uniform_(m.weight)
    # Linear layers built with bias=False expose `bias` as None.
    if m.bias is not None:
        m.bias.data.fill_(0.01)

class NeuralNetwork(nn.Module):
    """Four-branch binary classifier over rows of the feature matrix.

    The input is a 2-D tensor with one sample per row and the columns
    company id (0), is_local (1), type id (2), fin_1..fin_4 (3-6) and
    geo-cluster id (7). The three id columns are looked up in embedding tables,
    the five remaining columns go through a small MLP, and the four branch
    outputs are concatenated and passed to a head that ends in a sigmoid.

    Args:
        num_companies: number of rows in the company embedding table. When
            omitted, the notebook-level ``n_companies`` is used.
        num_types: number of rows in the type embedding table. When omitted,
            the notebook-level ``n_types`` is used.
        num_geo_clusters: number of rows in the geo-cluster embedding table
            (default 300).

    ``forward`` returns a ``(batch, 1)`` tensor of sigmoid outputs.
    Sub-module names and layer order are part of the checkpoint format
    (state_dict keys) and must not be changed.
    """

    # Column positions inside the input matrix.
    COMPANY_COL = 0
    TYPE_COL = 2
    GEO_COL = 7
    NUMERIC_COLS = [1, 3, 4, 5, 6]

    def __init__(self, num_companies=None, num_types=None, num_geo_clusters=300):
        super().__init__()
        # Fall back to the notebook globals so `NeuralNetwork()` keeps working.
        if num_companies is None:
            num_companies = n_companies
        if num_types is None:
            num_types = n_types

        self.features_branch = nn.Sequential(
            nn.Linear(len(self.NUMERIC_COLS), 32),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU())
        self.company_branch = nn.Sequential(
            nn.Embedding(num_companies, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            )
        self.type_branch = nn.Sequential(
            nn.Embedding(num_types, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
        )
        self.geo_branch = nn.Sequential(
            nn.Embedding(num_geo_clusters, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU())
        # Width of the concatenated branch outputs:
        # features (32) + company (32) + type (16) + geo (32).
        merged_width = 32 + 32 + 16 + 32
        self.main_branch = nn.Sequential(
            nn.Linear(merged_width, 64),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid())

    def forward(self, x):
        # Numeric columns are cast to float32; id columns to long, as required
        # by nn.Embedding lookups.
        out1 = self.features_branch(x[:, self.NUMERIC_COLS].to(torch.float32))
        out2 = self.company_branch(x[:, self.COMPANY_COL].to(torch.long))
        out3 = self.type_branch(x[:, self.TYPE_COL].to(torch.long))
        out4 = self.geo_branch(x[:, self.GEO_COL].to(torch.long))
        out = torch.cat((out1, out2, out3, out4), 1)
        out = self.main_branch(out)
        return out

In [50]:
model = NeuralNetwork()

In [51]:
# BCELoss takes probabilities, which is what the model's final Sigmoid emits.
criterion = nn.BCELoss()
# Adam over every parameter of the model, learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [52]:
train_dataset = CustomDataset(train_X, train_y)

In [53]:
batch_size = 128

In [54]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [55]:
valid_dataset = CustomDataset(valid_X, valid_y)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)

In [56]:
n_epochs = 10
# One accuracy slot per epoch, starting at zero. The buffer must not be
# pre-filled with np.arange: any slot that training never overwrites (for
# example after an interrupted run) would then hold a value >= 1 and win the
# later argmax, selecting an epoch whose checkpoint was never written.
epoch_accuracies = np.zeros(n_epochs, dtype=np.float32)

In [57]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch yielded by ``dataloader``.

    Puts ``model`` in training mode, then for each ``(x, y)`` batch reshapes
    the targets to a float32 column, computes ``loss_fn(model(x), y)``,
    back-propagates and takes one ``optimizer`` step. The current batch loss
    is printed on every 100th step. Returns nothing.
    """
    total_steps = len(dataloader)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader, start=1):
        # Targets arrive as a flat vector; the loss is computed against a
        # (batch, 1) float32 column.
        targets = targets.reshape(targets.size(0), 1).to(torch.float32)
        batch_loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, back-propagate, update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print(f'Step {step} / {total_steps}, loss = {batch_loss.item():.4f}')

In [58]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for x, y in dataloader:
            y = y.reshape(y.shape[0], 1).to(torch.float32)
            
            pred = model(x)
            test_loss += loss_fn(pred, y).item()
#             print(pred)
#             print(y)
            correct += (pred.round() == y).sum().item()
        
    test_loss /= num_batches
    correct /= size
    print(f'Test Error: \n Accuracy: {(100*correct):>0.1f}%, \
        Avg loss: {test_loss:>8f} \n')
    
    return correct

In [59]:
def save_checkpoint(state, filename):
    """Print a notice, then serialise ``state`` to ``filename`` with ``torch.save``."""
    print("Saving checkpoint...")
    torch.save(obj=state, f=filename)

In [60]:
def load_checkpoint(checkpoint, target_model=None, target_optimizer=None):
    """Restore model weights and optimizer state from a checkpoint mapping.

    Args:
        checkpoint: mapping with a ``'state_dict'`` entry (model weights) and
            an ``'optimizer'`` entry (optimizer state).
        target_model: module to load the weights into. When omitted, the
            notebook-level ``model`` is used.
        target_optimizer: optimizer to load the state into. When omitted, the
            notebook-level ``optimizer`` is used.
    """
    # Defaulting to the notebook globals keeps `load_checkpoint(ckpt)` working
    # while allowing a checkpoint to be restored into any other instance.
    net = model if target_model is None else target_model
    opt = optimizer if target_optimizer is None else target_optimizer
    net.load_state_dict(checkpoint['state_dict'])
    opt.load_state_dict(checkpoint['optimizer'])

In [61]:
# Per epoch: one training pass, one evaluation on the validation loader, then
# record the accuracy and write the model/optimizer state to '<epoch>.pth.tar'
# (zero-based epoch index) so any epoch can be restored afterwards.
for epoch in range(n_epochs):
    print(f'Epoch {epoch+1}\n--------------------------')
    train(train_dataloader, model, criterion, optimizer)
    accuracy = test(valid_dataloader, model, criterion)
    epoch_accuracies[epoch] = accuracy
    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint, f'{epoch}.pth.tar')
print("Done!")

Epoch 1
--------------------------
Step 100 / 1125, loss = 0.4636
Step 200 / 1125, loss = 0.5119
Step 300 / 1125, loss = 0.4832
Step 400 / 1125, loss = 0.4197
Step 500 / 1125, loss = 0.4236
Step 600 / 1125, loss = 0.4369
Step 700 / 1125, loss = 0.4644
Step 800 / 1125, loss = 0.3712
Step 900 / 1125, loss = 0.3788
Step 1000 / 1125, loss = 0.4007
Step 1100 / 1125, loss = 0.3913
Test Error: 
 Accuracy: 77.7%,         Avg loss: 0.435247 

Saving checkpoint...
Epoch 2
--------------------------
Step 100 / 1125, loss = 0.4192
Step 200 / 1125, loss = 0.4256
Step 300 / 1125, loss = 0.4964
Step 400 / 1125, loss = 0.4562
Step 500 / 1125, loss = 0.4060
Step 600 / 1125, loss = 0.3689
Step 700 / 1125, loss = 0.4657
Step 800 / 1125, loss = 0.4115
Step 900 / 1125, loss = 0.4195
Step 1000 / 1125, loss = 0.3601
Step 1100 / 1125, loss = 0.4399
Test Error: 
 Accuracy: 77.8%,         Avg loss: 0.436959 

Saving checkpoint...
Epoch 3
--------------------------
Step 100 / 1125, loss = 0.4279
Step 200 / 1125,

In [62]:
# Zero-based index of the epoch with the highest recorded accuracy; this is
# also the stem of that epoch's checkpoint file name.
best_model = epoch_accuracies.argmax()

In [63]:
# Read the selected epoch's checkpoint file and restore it into the model and
# optimizer.
load_checkpoint(torch.load(f'{best_model}.pth.tar'))

In [64]:
test_dataset = CustomDataset(test_X, test_y)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [65]:
test(test_dataloader, model, criterion)

Test Error: 
 Accuracy: 77.2%,         Avg loss: 0.441616 



0.7723

In [72]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

237377