In [121]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
import torch
from torch import nn
import torch.utils.data as data

In [3]:
df = pd.read_csv("../exampinateion/modified_train.csv")
df

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native_country,income_bigger_than_50K
0,0,67,0,366425,1,16,2,0,1,0,0,99999,0,60,0,1
1,1,17,0,244602,0,8,1,1,2,0,0,0,0,15,0,0
2,2,31,0,174201,1,13,0,0,0,0,0,0,0,40,0,1
3,3,58,1,110199,0,4,0,2,0,0,0,0,0,40,0,0
4,4,25,1,149248,1,10,1,1,1,1,0,0,0,40,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40722,43952,52,0,68982,1,13,0,0,0,0,0,0,0,50,0,1
40723,43953,19,0,116562,0,9,1,1,2,0,1,0,0,40,0,0
40724,43954,30,0,197947,1,10,2,2,1,0,0,0,0,58,0,0
40725,43955,46,0,97883,1,13,1,2,1,0,1,0,0,35,0,0


In [4]:
y = df["education"]
x = df[["race","workclass", 
        "fnlwgt", "marital_status", "occupation", "relationship", "age"]]

In [5]:
model = LR().fit(x, y)


In [24]:
probilities = model.predict_proba(x)[:,1]

In [50]:

def sample(num):
    u = np.random.rand()
    
    return 1 if u < num else 0
        


In [51]:
sample(0.5)

0

In [54]:
def repeat(times = 1, possiblity=0.5):
    
    temp = []
    
    def sample(num):
        u = np.random.rand()

        return 1 if u < num else 0

    for i in range(times):
        a = sample(possiblity)
        temp.append(a)
    
    return temp
    

In [75]:
treatments = [repeat(10, i) for i in probilities]
np.shape(treatments)

(40727, 10)

In [92]:
def build(orginal_x, probilities, times=1, save=False):
    treatments = [repeat(times, i) for i in probilities]
    data = np.hstack((x.to_numpy(),treatments))
    new = pd.DataFrame(data, columns=list(x.columns)
                         + ["p"+str(i) for i in range(0,times)])
    
    if save:
        new.to_csv("new X.csv")
        print("Successful saved!")
    
    return new

In [93]:
new_x = build(x, probilities, 20)
new_x.head()

Unnamed: 0,race,workclass,fnlwgt,marital_status,occupation,relationship,age,p0,p1,p2,...,p10,p11,p12,p13,p14,p15,p16,p17,p18,p19
0,0,0,366425,2,0,1,67,1,1,0,...,1,1,0,1,1,0,1,1,0,0
1,0,0,244602,1,1,2,17,0,1,0,...,1,1,1,1,1,1,1,0,1,0
2,0,0,174201,0,0,0,31,0,0,0,...,0,0,0,0,1,0,1,1,0,0
3,0,1,110199,0,2,0,58,0,0,0,...,0,0,0,0,1,1,1,1,1,1
4,1,1,149248,1,1,1,25,0,1,1,...,0,1,0,0,1,1,1,1,0,0


In [140]:
VALID_RATIO = 0.9

n_train_examples = int(len(new_x) * VALID_RATIO)
n_valid_examples = len(new_x) - n_train_examples
train_data, valid_data = data.random_split(new_x.to_numpy(),
                                           [n_train_examples, n_valid_examples])
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 36654
Number of validation examples: 4073


In [141]:
batch_size = 64

train_iterator = data.DataLoader(train_data,
                                 shuffle=True,
                                 batch_size=batch_size)

valid_iterator = data.DataLoader(valid_data,
                                 batch_size=batch_size)



In [142]:
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.input_fc = nn.Linear(input_dim, 100)
        self.hidden_fc = nn.Linear(100, 10)
        self.output_fc = nn.Linear(10, output_dim)

    def forward(self, x):

        # x = [batch size, height, width]

        batch_size = x.shape[0]

        x = x.view(batch_size, -1)

        # x = [batch size, height * width]

        h_1 = F.relu(self.input_fc(x))

        # h_1 = [batch size, 250]

        h_2 = F.relu(self.hidden_fc(h_1))

        # h_2 = [batch size, 100]

        y_pred = self.output_fc(h_2)

        # y_pred = [batch size, output dim]

        return y_pred, h_2

In [143]:
input_dim = np.prod(np.shape(new_x))
output_dim = 1



model = MLP(input_dim, output_dim)

In [144]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,964,021 trainable parameters


In [145]:
import torch.optim as optim
from tqdm.notebook import trange, tqdm
import time

optimizer = optim.Adam(model.parameters())

In [146]:
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [147]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [148]:
def train(model, iterator, optimizer, criterion, device):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for (x, y) in tqdm(iterator, desc="Training", leave=False):

        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        y_pred, _ = model(x)

        loss = criterion(y_pred, y)

        acc = calculate_accuracy(y_pred, y)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [149]:
def evaluate(model, iterator, criterion, device):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

            x = x.to(device)
            y = y.to(device)

            y_pred, _ = model(x)

            loss = criterion(y_pred, y)

            acc = calculate_accuracy(y_pred, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [150]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [151]:
EPOCHS = 10

best_valid_loss = float('inf')

for epoch in trange(EPOCHS):

    start_time = time.monotonic()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  0%|          | 0/10 [00:00<?, ?it/s]

Training:   0%|          | 0/573 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)