In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from typing import Optional, Tuple
from torch import nn
from torch.nn import functional as F
from pathlib import Path
from IPython.display import Audio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def build_income_df(test=False) -> Tuple[pd.DataFrame, Optional[dict], Optional[set]]:
    """Load the income data as a discretised DataFrame.

    Reads ``../data/processed/test_final.csv`` when ``test`` is true, otherwise
    ``../data/processed/train_final.csv`` (whose ``income>50K`` column is cast
    to bool and renamed to ``Prediction``).  Afterwards:

    * ``education.num``, ``hours.per.week``, ``fnlwgt`` and ``age`` are split
      into at most two quantile bins with ``pd.qcut``;
    * ``capital.gain`` and ``capital.loss`` are cast to bool;
    * every ``"?"`` entry is replaced by the most frequent value of its column
      that is not itself ``"?"``.  A column made up only of ``"?"`` is left
      untouched because there is nothing to impute from.

    Args:
        test: Load the test split instead of the training split.

    Returns:
        ``(df, attributes, labels)``.  For the test split ``attributes`` and
        ``labels`` are ``None``.  For the training split ``attributes`` maps
        every column other than ``Prediction``/``ID`` to the list of its unique
        values and ``labels`` is the set of unique ``Prediction`` values.
    """
    if test:
        df = pd.read_csv("../data/processed/test_final.csv")
    else:
        df = pd.read_csv("../data/processed/train_final.csv")
        df = df.astype({"income>50K": "bool"})
        df = df.rename(columns={"income>50K": "Prediction"})
    for numeric_column_name in ("education.num", "hours.per.week", "fnlwgt", "age"):
        df[numeric_column_name] = pd.qcut(df[numeric_column_name], q=2, duplicates="drop")
    for numeric_column_name in ("capital.gain", "capital.loss"):
        df[numeric_column_name] = df[numeric_column_name].astype("bool")
    for column in df.columns:
        counts = df[column].value_counts()  # sorted, most frequent first
        if counts.empty:
            continue
        new = counts.index[0]
        if new == "?":
            if len(counts) < 2:
                # Only "?" present: no real value to substitute.
                continue
            new = counts.index[1]
        # Assign the result back instead of calling replace(inplace=True) on
        # the selected column: the in-place form mutates a temporary and is a
        # no-op when pandas copy-on-write is active.
        df[column] = df[column].replace("?", new)
    if test:
        return df, None, None
    attribute_keys = set(df.columns) - {"Prediction", "ID"}
    attributes = {key: list(df[key].unique()) for key in attribute_keys}
    return (df, attributes, set(df["Prediction"].unique()))


def build_income() -> Tuple[np.ndarray, np.ndarray]:
    """Load the training CSV as a numeric feature matrix and label vector.

    The ``income>50K`` column is cast to bool and renamed to ``Prediction``.
    Columns whose dtype is ``object`` are converted to pandas categoricals and
    represented by their integer category codes; all other columns are used
    as-is.  The columns are stacked side by side, and the last one is split
    off as the label.

    Returns:
        ``(X, y)`` where ``X`` is every column but the last and ``y`` is the
        last column of the stacked array.
    """
    frame = (
        pd.read_csv(f"../data/processed/train_final.csv")
        .astype({"income>50K": "bool"})
        .rename(columns={"income>50K": "Prediction"})
    )

    # Decide which columns to convert before touching the frame.
    text_columns = [name for name, kind in zip(frame.columns, frame.dtypes) if kind == "object"]
    for name in text_columns:
        frame[name] = frame[name].astype("category")

    encoded = []
    for name, kind in zip(frame.columns, frame.dtypes):
        column = frame[name]
        if kind == "category":
            encoded.append(column.cat.codes)
        else:
            encoded.append(column)

    matrix = np.stack(encoded, 1)
    features = matrix[:, :-1]
    labels = matrix[:, -1]
    return features, labels


def build_income_test():
    """Load the test CSV as a single numeric array.

    Columns whose dtype is ``object`` are converted to pandas categoricals and
    replaced by their integer category codes; every other column is kept
    unchanged.  All columns (in file order) are stacked along axis 1.

    NOTE(review): the category codes are derived from the test file alone, so
    they only line up with the training encoding if both files contain the
    same set of values per column — confirm.
    """
    frame = pd.read_csv(f"../data/processed/test_final.csv")

    object_columns = [name for name, kind in zip(frame.columns, frame.dtypes) if kind == "object"]
    for name in object_columns:
        frame[name] = frame[name].astype("category")

    def as_numeric(column):
        # Categoricals contribute their codes, anything else its raw values.
        return column.cat.codes if column.dtype == "category" else column

    return np.stack([as_numeric(frame[name]) for name in frame.columns], axis=1)


def build_holdout():
    """Randomly split the training data into a training part and a holdout part.

    A random permutation of the row indices decides the split: the first
    quarter (rounded down) of the permuted rows forms the holdout set and the
    remaining rows form the training set.

    Returns:
        ``(X_train, y_train, X_holdout, y_holdout)``.
    """
    X, y = build_income()
    indices = np.random.permutation(X.shape[0])
    holdout = len(indices) // 4
    # Index through the permutation; slicing X/y directly would ignore the
    # shuffle and always hold out the first rows of the file.
    holdout_idx, train_idx = indices[:holdout], indices[holdout:]
    return X[train_idx], y[train_idx], X[holdout_idx], y[holdout_idx]


In [3]:
# Load the training set: X is every column but the last, y is the last column.
X, y = build_income()

In [4]:
# Fit a StandardScaler on the training features and scale them.
# `sc` is kept around because the test features are scaled with it later.
# NOTE: X is overwritten, so re-running this cell alone fits on already
# scaled data; re-run the loading cell first.
sc = StandardScaler()
X = sc.fit_transform(X)

In [5]:
class dataset(Dataset):
    """In-memory dataset pairing each feature row with its label.

    Both inputs are converted to float32 tensors when the dataset is built;
    item ``idx`` is the tuple ``(x[idx], y[idx])``.
    """

    def __init__(self, x, y):
        features = torch.tensor(x, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.float32)
        self.x, self.y = features, labels
        # Number of samples = size of the leading feature dimension.
        self.length = features.shape[0]

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        sample, target = self.x[idx], self.y[idx]
        return sample, target

In [6]:
# Wrap the scaled features and labels, then batch them (64 rows per batch).
# NOTE(review): shuffle=False feeds the batches in the same order every epoch;
# mini-batch SGD is usually trained with shuffle=True — confirm this is intended.
trainset = dataset(X,y)
trainloader = DataLoader(trainset,batch_size=64,shuffle=False)

In [7]:
# Widths of the two hidden layers; read by Net when it is constructed.
shapes = [16, 16]


class Net(nn.Module):
    """Binary classifier: two ReLU hidden layers and one sigmoid output unit.

    Args:
        input_shape: Number of input features per sample.
    """

    def __init__(self, input_shape):
        super().__init__()
        first_width, second_width = shapes[0], shapes[1]
        self.fc1 = nn.Linear(input_shape, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, 1)

    def forward(self, x):
        """Map a batch of feature rows to one probability per row."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

In [8]:
# Training hyper-parameters; lr, epochs and momentum are also written to
# desc.txt at the end of the notebook.
epochs = 500
lr = 0.05
model = Net(input_shape=X.shape[1])  # one input per feature column of X
momentum = 0.9
optimizer = torch.optim.SGD(model.parameters(),lr=lr, momentum=momentum)
# BCELoss takes probabilities, which Net.forward produces with its final sigmoid.
loss_fn = nn.BCELoss()

In [9]:
losses = []  # loss of the last mini-batch of every epoch, as plain floats
accur = []   # accuracy on the full training set after every epoch

# Build the evaluation tensor once; it does not change during training.
X_eval = torch.tensor(X, dtype=torch.float32)

for i in range(epochs):
    for x_train, y_train in trainloader:
        # forward pass and loss for this mini-batch
        output = model(x_train)
        loss = loss_fn(output, y_train.reshape(-1, 1))

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Measure accuracy once per epoch rather than once per mini-batch, and
    # without recording an autograd graph for the full-dataset forward pass.
    # The value now reflects the weights after the epoch's final update.
    with torch.no_grad():
        predicted = model(X_eval)
    acc = (predicted.reshape(-1).numpy().round() == y).mean()

    # .item() stores a float instead of a tensor that keeps its graph alive.
    losses.append(loss.item())
    accur.append(acc)
    if i % 100 == 0:
        print(f"Accuracy at epoch {i}: {acc}")

Accuracy at epoch 0: 0.8336
Accuracy at epoch 100: 0.85752
Accuracy at epoch 200: 0.85956
Accuracy at epoch 300: 0.85824
Accuracy at epoch 400: 0.85984


In [10]:
# Scale the test features with the statistics fitted on the training set.
# Use transform, not fit_transform: re-fitting here would scale the test data
# by its own mean/variance and overwrite the fitted training statistics in `sc`.
# The first column is dropped ([:, 1:]); presumably it is the row ID — confirm.
test_X = torch.tensor(sc.transform(build_income_test()[:, 1:]), dtype=torch.float32)

In [11]:
# Sanity check: (rows, features). The feature count has to match the
# input_shape the model was built with.
test_X.shape

torch.Size([23842, 14])

In [12]:
model.eval()
with torch.no_grad():
    # One batched forward pass instead of one model call per test row.
    predictions = model(test_X).round().int().reshape(-1).tolist()

# CSV lines for the submission: a header, then "<1-based row number>,<0 or 1>".
results = ["ID,Prediction"] + [
    f"{row_id},{label}" for row_id, label in enumerate(predictions, start=1)
]

In [14]:
# Directory that receives every artefact of this run.
parent = Path("..") / "reports" / "nn6"
# parents=True also creates ../reports when it does not exist yet; without it
# mkdir raises FileNotFoundError on a fresh checkout.
parent.mkdir(parents=True, exist_ok=True)

In [15]:
# Write the prediction lines, newline-separated, to results.csv.
results_path = parent / "results.csv"
csv_text = "\n".join(results)
with results_path.open("w") as out_file:
    out_file.write(csv_text)

In [16]:
# Save the per-epoch accuracy history as a one-column CSV.
accuracy_path = parent / "accuracy.csv"
s = pd.DataFrame({"acc": accur})
s.to_csv(accuracy_path)

In [17]:
# Record the hyper-parameters of this run next to its results.
description = f"{shapes=}\n{lr=}\n{epochs=}\n{momentum=}"
with (parent / "desc.txt").open("w") as desc_file:
    desc_file.write(description)

In [18]:
# Audible signal that the notebook has finished running.
# NOTE(review): presumably expects sound.wav in the notebook's working
# directory — confirm the file is shipped with the notebook.
Audio("sound.wav", autoplay=True)