In [1]:
import pandas as pd
import numpy as np
import csv
import joblib 
import torch
from torch import nn
import sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [2]:
# load data

train_df = pd.read_csv("tabular-playground-series-aug-2022//train.csv")
test_df = pd.read_csv("tabular-playground-series-aug-2022//test.csv")

ID = test_df['id']

# concat train and test data for imputation
all_df = pd.concat([train_df[test_df.columns], test_df], ignore_index=1)

In [3]:
float_columns = test_df.dtypes[test_df.dtypes == 'float64'].index.tolist()
float_columns.remove('measurement_17')
float_columns.remove('loading')
int_columns = test_df.dtypes[test_df.dtypes == 'int64'].index.tolist()
print(float_columns, int_columns)

['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'] ['id', 'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2']


In [4]:
# drop id and not numerical columns
#all_df.drop(['id'], axis=1, inplace=True)
all_df = all_df.drop(['product_code', 'attribute_0', 'attribute_1'], axis=1)
columns = all_df.columns

In [5]:
all_df['m_3_missing'] = all_df.measurement_3.isna()
all_df['m_5_missing'] = all_df.measurement_5.isna()

# imputation

imp = KNNImputer(n_neighbors=10)
std_scaler = StandardScaler()
for i in all_df.columns:
  
  all_df[[i]] = imp.fit_transform(all_df[[i]])


# use imputed data to calculate "attribute_2" * "attribute_3", stdev, avg from "measurement_0" to "measurement_16" of each row as extra features
a = all_df['attribute_2']
b = all_df['attribute_3']
area = a*b
all_df['area'] = area

stdev = np.std(all_df[float_columns], axis=1)
all_df['stdev'] = stdev

avg = np.average(all_df[float_columns], axis=1)
all_df['avg'] = avg

In [6]:
all_df = all_df.drop(int_columns, axis=1)
all_df = all_df.drop(float_columns, axis=1)

In [7]:
col_to_scale = ['loading', 'avg', 'stdev', 'area', 'measurement_17']

# standardscaling
all_df[col_to_scale] = std_scaler.fit_transform(all_df[col_to_scale])

In [8]:
full_data = all_df.copy()

In [9]:
# split x, y

x_train = full_data.iloc[:train_df.shape[0], :]
y_train = train_df['failure'].astype('float')
x_test = full_data.iloc[train_df.shape[0]:, :]

In [10]:
device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
print(device)

cuda


In [11]:
class train_ds(Dataset):
  def __init__(self, x_train, y_train):
    
    self.X_train = torch.from_numpy(x_train.values.astype(np.float32))
    self.y_train = torch.from_numpy(y_train.values.astype(np.float32))

  def __len__(self):
    return len(self.y_train)

  def __getitem__(self, idx):
    return self.X_train[idx], self.y_train[idx]

In [12]:
class test_ds(Dataset):
  def __init__(self, x_test):
    
    self.X_test = torch.from_numpy(x_test.values.astype(np.float32))
  

  def __len__(self):
    return len(self.X_test)

  def __getitem__(self, idx):
    return self.X_test[idx]

In [13]:
train_data = train_ds(x_train, y_train)
test_data = test_ds(x_test)

In [14]:
train_dl = DataLoader(train_data, batch_size=1000, shuffle=True)
test_dl = DataLoader(test_data, batch_size=1000)

In [15]:
col_num = x_train.shape[1]

In [16]:
torch.manual_seed(32)

class LinearModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.layer1 = nn.Linear(in_features=col_num, out_features=8)
    self.layer2 = nn.Linear(in_features=16, out_features=8)
    self.layer3 = nn.Linear(in_features=8, out_features=8)
    self.layer4 = nn.Linear(in_features=8, out_features=1)

    self.relu = nn.ReLU()
    self.batch_norm1 = nn.BatchNorm1d(16)
    self.batch_norm2 = nn.BatchNorm1d(31)
    self.dropout = nn.Dropout(p=0.2)


  def forward(self, x: torch.tensor):
      #x = self.dropout(x)
      x = self.relu(self.layer1(x))
      x = self.dropout(x)
      x = self.layer3(x)
      x = self.layer4(x)

      return x

model = LinearModel()

In [17]:
model.to(device)

LinearModel(
  (layer1): Linear(in_features=7, out_features=8, bias=True)
  (layer2): Linear(in_features=16, out_features=8, bias=True)
  (layer3): Linear(in_features=8, out_features=8, bias=True)
  (layer4): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU()
  (batch_norm1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(31, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [18]:
loss_fn = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

In [19]:
torch.manual_seed(32)

epochs = 152
for epoch in range(epochs+1):
  model.train()
  loss_value = []
  for x_train, y_train in train_dl:
    
    x_train = x_train.to(device)
    y_train = y_train.to(device)

    y_pred = model(x_train)

    loss = loss_fn(y_pred, y_train.unsqueeze(1))

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    loss_value.append(loss)

    joblib.dump(model, f'my_model_{epoch}')

  pred = model(x_train)
  print(f"epoch:{epoch}, avgloss:{sum(loss_value) / len(loss_value)}")

epoch:0, avgloss:0.1462567150592804
epoch:1, avgloss:0.13057450950145721
epoch:2, avgloss:0.11915864050388336
epoch:3, avgloss:0.11016526073217392
epoch:4, avgloss:0.10323435068130493
epoch:5, avgloss:0.0977059155702591
epoch:6, avgloss:0.09400185197591782
epoch:7, avgloss:0.09155102074146271
epoch:8, avgloss:0.0897195115685463
epoch:9, avgloss:0.08798620849847794
epoch:10, avgloss:0.08725390583276749
epoch:11, avgloss:0.08657116442918777
epoch:12, avgloss:0.08608577400445938
epoch:13, avgloss:0.085738405585289
epoch:14, avgloss:0.08516920357942581
epoch:15, avgloss:0.08484123647212982
epoch:16, avgloss:0.08491479605436325
epoch:17, avgloss:0.08442335575819016
epoch:18, avgloss:0.08427899330854416
epoch:19, avgloss:0.08428271859884262
epoch:20, avgloss:0.08414992690086365
epoch:21, avgloss:0.08376385271549225
epoch:22, avgloss:0.08355890959501266
epoch:23, avgloss:0.08360477536916733
epoch:24, avgloss:0.08364243805408478
epoch:25, avgloss:0.08355807512998581
epoch:26, avgloss:0.0834824

In [20]:
# save model
joblib.dump(model, 'my_model')

['my_model']