In [1]:
import pandas as pd
import numpy as np
import csv
import joblib 
import torch
from torch import nn
import sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [2]:
# Read the train and test tables from the competition folder.
train_df = pd.read_csv("tabular-playground-series-aug-2022//train.csv")
test_df = pd.read_csv("tabular-playground-series-aug-2022//test.csv")

# The test ids are set aside for the submission file written at the end.
ID = test_df['id']

# Stack train (restricted to the columns test also has) on top of test so
# both parts go through the same imputation.
shared_columns = test_df.columns
all_df = pd.concat([train_df[shared_columns], test_df], ignore_index=True)

In [3]:
# Group the test-table columns by dtype.
column_dtypes = test_df.dtypes

# float64 columns, minus the two that are handled separately further down.
float_columns = column_dtypes[column_dtypes == 'float64'].index.tolist()
for excluded in ('measurement_17', 'loading'):
    float_columns.remove(excluded)

# int64 columns (this list includes 'id').
int_columns = column_dtypes[column_dtypes == 'int64'].index.tolist()

print(float_columns, int_columns)

['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'] ['id', 'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2']


In [4]:
# Discard the non-numeric columns. 'id' is NOT dropped here; it is still
# part of the frame at this point.
non_numeric_columns = ['product_code', 'attribute_0', 'attribute_1']
all_df = all_df.drop(columns=non_numeric_columns)
columns = all_df.columns

In [5]:
# Flag rows where measurement_3 / measurement_5 are missing, before anything
# is filled in.
all_df['m_3_missing'] = all_df['measurement_3'].isna()
all_df['m_5_missing'] = all_df['measurement_5'].isna()

# Imputation: every column is fitted and filled on its own.
# NOTE(review): with one column per fit the imputer has no other feature to
# measure neighbour distance on - confirm that this per-column use is intended.
imp = KNNImputer(n_neighbors=10)
std_scaler = StandardScaler()  # created here, applied in a later cell
for column_name in all_df.columns:
    single_column = all_df[[column_name]]
    all_df[[column_name]] = imp.fit_transform(single_column)

# Extra features derived from the imputed values:
#   area  = attribute_2 * attribute_3
#   stdev = per-row standard deviation over the columns in float_columns
#   avg   = per-row average over the columns in float_columns
all_df['area'] = all_df['attribute_2'] * all_df['attribute_3']

measurements = all_df[float_columns]
all_df['stdev'] = np.std(measurements, axis=1)
all_df['avg'] = np.average(measurements, axis=1)

In [6]:
# Remove the raw integer and float feature columns in a single call.
all_df = all_df.drop(columns=int_columns + float_columns)

In [7]:
# Columns that are standardised.
col_to_scale = ['loading', 'avg', 'stdev', 'area', 'measurement_17']

# Fit the scaler on these columns and write the scaled values back in place.
scaled_values = std_scaler.fit_transform(all_df[col_to_scale])
all_df[col_to_scale] = scaled_values

In [8]:
# Independent (deep) copy of the processed frame; the train/test split is taken from it.
full_data = all_df.copy(deep=True)

In [9]:
# The first len(train_df) rows of full_data are the train part, the rest is test
# (train was placed first when the two tables were concatenated).
n_train_rows = len(train_df)

x_train = full_data.iloc[:n_train_rows]
x_test = full_data.iloc[n_train_rows:]

# Target column, cast to float.
y_train = train_df['failure'].astype('float')

In [12]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [13]:
class test_ds(Dataset):
    """Dataset that serves the rows of a feature table as float32 tensors.

    Only features are stored; items carry no target value.
    """

    def __init__(self, x_test):
        """Convert ``x_test.values`` to float32 and keep it as ``self.X_test``."""
        features = x_test.values.astype(np.float32)
        self.X_test = torch.from_numpy(features)

    def __len__(self):
        """Return the number of rows held."""
        return self.X_test.shape[0]

    def __getitem__(self, idx):
        """Return the feature row(s) selected by ``idx``."""
        return self.X_test[idx]

In [14]:
# Wrap the test features in the dataset class defined above.
test_data = test_ds(x_test=x_test)

In [15]:
# Serve the test rows in their original order, 1000 rows per batch.
test_dl = DataLoader(dataset=test_data, batch_size=1000, shuffle=False)

In [16]:
# Number of feature columns; used as the input width of the network.
_, col_num = x_train.shape

# Seed torch's RNG before the model is instantiated.
torch.manual_seed(32)

class LinearModel(nn.Module):
    """Small feed-forward network that emits one raw output value per input row.

    The forward pass is ``layer1 -> ReLU -> dropout -> layer3 -> layer4``.

    ``layer2``, ``batch_norm1`` and ``batch_norm2`` are created but never
    called in ``forward``. They are kept so that the set of sub-modules and
    the order in which parameters are initialised stay exactly as before.
    NOTE(review): presumably a saved model depends on this layout - confirm
    before removing them.

    Args:
        in_features: Width of the input rows. When omitted, the module-level
            ``col_num`` is used, which is what the class did originally.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: fall back to the notebook global.
            in_features = col_num

        self.layer1 = nn.Linear(in_features=in_features, out_features=8)
        self.layer2 = nn.Linear(in_features=16, out_features=8)  # unused in forward
        self.layer3 = nn.Linear(in_features=8, out_features=8)
        self.layer4 = nn.Linear(in_features=8, out_features=1)

        self.relu = nn.ReLU()
        self.batch_norm1 = nn.BatchNorm1d(16)  # unused in forward
        self.batch_norm2 = nn.BatchNorm1d(31)  # unused in forward
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network on ``x`` and return the output of ``layer4``.

        No activation is applied after ``layer4``, so the values are raw
        (unbounded) outputs.
        """
        x = self.relu(self.layer1(x))
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

# Instantiate the network; `model` is reassigned by the joblib.load call in the following cell.
model = LinearModel()

In [17]:
# NOTE: joblib.load unpickles arbitrary Python objects - only load 'my_model'
# when the file comes from a trusted source.
# The restored model is moved to `device` so that its parameters live on the
# same device as the batches that are sent to it during inference.
model = joblib.load('my_model').to(device)

In [18]:
# Collect the model's outputs for every test batch.
# `test_loss` actually holds prediction arrays (one numpy array per batch);
# the name is kept because the following cell reads it.
test_loss = []

model.eval()  # switch the model to evaluation mode
with torch.inference_mode():  # no gradient tracking needed for prediction
    for batch in test_dl:
        # The loop variable is deliberately not called `x_test`: reusing that
        # name would overwrite the `x_test` DataFrame and make the earlier
        # dataset-building cell fail when it is re-run.
        batch_pred = model(batch.to(device))
        test_loss.append(batch_pred.cpu().numpy())

In [19]:
# Flatten the per-batch arrays into a single list, keeping the first value of every row.
pred = [row[0] for batch_values in test_loss for row in batch_values]

In [20]:
# Pair every test id with its prediction and write the submission file.
if len(pred) != len(ID):
    raise ValueError(
        f"expected one prediction per id, got {len(pred)} predictions for {len(ID)} ids"
    )
write_list = list(zip(ID, pred))

# The context manager closes the file, so every row is flushed to disk as
# soon as the cell finishes (the handle used to be left open).
with open("109612041.csv", mode='w', newline='') as submission_file:
    csv_writer = csv.writer(submission_file)
    csv_writer.writerow(['id', 'failure'])
    csv_writer.writerows(write_list)