In [1]:
import pandas as pd
import torch
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch.nn as nn
import numpy as np
import time
import wandb
import warnings

warnings.filterwarnings('ignore')


In [2]:
all_df = pd.read_excel(r'..\static\excel\all.xlsx')

In [3]:
parameters = {
    "regression": 1,
    "time": str(datetime.now()).replace(" ", "_"),
    "model_name": 'linear',
    "learning_rate": 1e-3,
    "epochs": 1000,
    "batch_size": 1024,
    "wandb": False
}

In [4]:
def init_wandb():
    # start a new wandb run to track this script
    parameters["wandb"] = True
    wandb.init(
        # set the wandb project where this run will be logged
        project="static",

        # track hyperparameters and run metadata
        config={
            "learning_rate": 0.001,
            "architecture": "linearRegression",
            "dataset": "economic",
            "epochs": parameters['epochs'],
        }
    )

init_wandb()

[34m[1mwandb[0m: Currently logged in as: [33mxunhaoz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
def preprocess(df):
    df = df.fillna(0)
    df['REL'] = df['REL'].apply(lambda x: 1 if x == 1 else 0)
    df['MALE'] = df['SEX'].apply(lambda x: 1 if x == 1 else 0)
    df['FEMALE'] = df['SEX'].apply(lambda x: 1 if x == 2 else 0)
    df['AGE'] = df['AGE'] / 100
    df['EDU01'] = df['EDU'].apply(lambda x: 1 if x == 1 else 0)
    df['EDU02'] = df['EDU'].apply(lambda x: 1 if x == 2 else 0)
    df['EDU03'] = df['EDU'].apply(lambda x: 1 if x == 3 else 0)
    df['EDU04'] = df['EDU'].apply(lambda x: 1 if x == 4 else 0)
    df['EDU05'] = df['EDU'].apply(lambda x: 1 if x == 5 else 0)
    df['EDU06'] = df['EDU'].apply(lambda x: 1 if x == 6 else 0)
    df['EDU07'] = df['EDU'].apply(lambda x: 1 if x == 7 else 0)
    df['EDU08'] = df['EDU'].apply(lambda x: 1 if x == 8 else 0)
    df['EDU09'] = df['EDU'].apply(lambda x: 1 if x == 9 else 0)
    df['EDU10'] = df['EDU'].apply(lambda x: 1 if x == 10 else 0)
    df['IND00'] = df['IND'].apply(lambda x: 1 if x == 0 else 0)
    df['IND01'] = df['IND'].apply(lambda x: 1 if x == 1 else 0)
    df['IND02'] = df['IND'].apply(lambda x: 1 if x == 2 else 0)
    df['IND03'] = df['IND'].apply(lambda x: 1 if x == 3 else 0)
    df['IND05'] = df['IND'].apply(lambda x: 1 if x == 5 else 0)
    df['IND08'] = df['IND'].apply(lambda x: 1 if x == 8 else 0)
    df['IND35'] = df['IND'].apply(lambda x: 1 if x == 35 else 0)
    df['IND36'] = df['IND'].apply(lambda x: 1 if x == 36 else 0)
    df['IND41'] = df['IND'].apply(lambda x: 1 if x == 41 else 0)
    df['IND45'] = df['IND'].apply(lambda x: 1 if x == 45 else 0)
    df['IND55'] = df['IND'].apply(lambda x: 1 if x == 55 else 0)
    df['IND49'] = df['IND'].apply(lambda x: 1 if x == 49 else 0)
    df['IND58'] = df['IND'].apply(lambda x: 1 if x == 58 else 0)
    df['IND64'] = df['IND'].apply(lambda x: 1 if x == 64 else 0)
    df['IND67'] = df['IND'].apply(lambda x: 1 if x == 67 else 0)
    df['IND69'] = df['IND'].apply(lambda x: 1 if x == 69 else 0)
    df['IND77'] = df['IND'].apply(lambda x: 1 if x == 77 else 0)
    df['IND85'] = df['IND'].apply(lambda x: 1 if x == 85 else 0)
    df['IND86'] = df['IND'].apply(lambda x: 1 if x == 86 else 0)
    df['IND90'] = df['IND'].apply(lambda x: 1 if x == 90 else 0)
    df['IND94'] = df['IND'].apply(lambda x: 1 if x == 94 else 0)
    df['IND83'] = df['IND'].apply(lambda x: 1 if x == 83 else 0)
    df['OCC00'] = df['OCC'].apply(lambda x: 1 if x == 0 else 0)
    df['OCC01'] = df['OCC'].apply(lambda x: 1 if x == 1 else 0)
    df['OCC02'] = df['OCC'].apply(lambda x: 1 if x == 2 else 0)
    df['OCC03'] = df['OCC'].apply(lambda x: 1 if x == 3 else 0)
    df['OCC04'] = df['OCC'].apply(lambda x: 1 if x == 4 else 0)
    df['OCC05'] = df['OCC'].apply(lambda x: 1 if x == 5 else 0)
    df['OCC61'] = df['OCC'].apply(lambda x: 1 if x == 61 else 0)
    df['OCC62'] = df['OCC'].apply(lambda x: 1 if x == 62 else 0)
    df['OCC63'] = df['OCC'].apply(lambda x: 1 if x == 63 else 0)
    df['OCC07'] = df['OCC'].apply(lambda x: 1 if x == 7 else 0)
    df['OCC08'] = df['OCC'].apply(lambda x: 1 if x == 8 else 0)
    df['OCC09'] = df['OCC'].apply(lambda x: 1 if x == 9 else 0)
    df['OCC10'] = df['OCC'].apply(lambda x: 1 if x == 10 else 0)
    df['WKCLASS01'] = df['WKCLASS'].apply(lambda x: 1 if x == 1 else 0)
    df['WKCLASS02'] = df['WKCLASS'].apply(lambda x: 1 if x == 2 else 0)
    df['WKCLASS03'] = df['WKCLASS'].apply(lambda x: 1 if x == 3 else 0)
    df['WKCLASS04'] = df['WKCLASS'].apply(lambda x: 1 if x == 4 else 0)
    df['WKCLASS05'] = df['WKCLASS'].apply(lambda x: 1 if x == 5 else 0)
    df['WKCLASS06'] = df['WKCLASS'].apply(lambda x: 1 if x == 6 else 0)
    df['WKCLASS07'] = df['WKCLASS'].apply(lambda x: 1 if x == 7 else 0)
    df['WKCLASS08'] = df['WKCLASS'].apply(lambda x: 1 if x == 8 else 0)
    df['WKCLASS09'] = df['WKCLASS'].apply(lambda x: 1 if x == 9 else 0)
    df['WORKPLACE02'] = df['WORKPLACE'].apply(lambda x: 1 if x == 2 else 0)
    df['WORKPLACE03'] = df['WORKPLACE'].apply(lambda x: 1 if x == 3 else 0)
    df['WORKPLACE04'] = df['WORKPLACE'].apply(lambda x: 1 if x == 4 else 0)
    df['WORKPLACE05'] = df['WORKPLACE'].apply(lambda x: 1 if x == 5 else 0)
    df['WORKPLACE07'] = df['WORKPLACE'].apply(lambda x: 1 if x == 7 else 0)
    df['WORKPLACE08'] = df['WORKPLACE'].apply(lambda x: 1 if x == 8 else 0)
    df['WORKPLACE09'] = df['WORKPLACE'].apply(lambda x: 1 if x == 9 else 0)
    df['WORKPLACE10'] = df['WORKPLACE'].apply(lambda x: 1 if x == 10 else 0)
    df['WORKPLACE13'] = df['WORKPLACE'].apply(lambda x: 1 if x == 13 else 0)
    df['WORKPLACE14'] = df['WORKPLACE'].apply(lambda x: 1 if x == 14 else 0)
    df['WORKPLACE15'] = df['WORKPLACE'].apply(lambda x: 1 if x == 15 else 0)
    df['WORKPLACE16'] = df['WORKPLACE'].apply(lambda x: 1 if x == 16 else 0)
    df['WORKPLACE17'] = df['WORKPLACE'].apply(lambda x: 1 if x == 17 else 0)
    df['WORKPLACE18'] = df['WORKPLACE'].apply(lambda x: 1 if x == 18 else 0)
    df['WORKPLACE20'] = df['WORKPLACE'].apply(lambda x: 1 if x == 20 else 0)
    df['WORKPLACE63'] = df['WORKPLACE'].apply(lambda x: 1 if x == 63 else 0)
    df['WORKPLACE64'] = df['WORKPLACE'].apply(lambda x: 1 if x == 64 else 0)
    df['WORKPLACE65'] = df['WORKPLACE'].apply(lambda x: 1 if x == 65 else 0)
    df['WORKPLACE66'] = df['WORKPLACE'].apply(lambda x: 1 if x == 66 else 0)
    df['WORKPLACE67'] = df['WORKPLACE'].apply(lambda x: 1 if x == 67 else 0)
    df['WORKPLACE68'] = df['WORKPLACE'].apply(lambda x: 1 if x == 68 else 0)
    df['MRG'] = df['MRG'].apply(lambda x: 0 if x in [91, 93, 94, 95, 96, 97] else 1)
    df['PT'] = df['PT'] - 1

    df = df.drop(
        columns=['YEAR', 'ID', 'PERSON', 'SEX', 'PROV', 'EDU', 'F_EDU', 'DPT', 'IND', 'OCC', 'IND2', 'OCC2', 'ECON',
                 'OUTPATIENT', 'INPATIENT', 'HEALTH_INS', 'HI_PAYER', 'HI_FEE', 'INSURE_ID1', 'INSURE_MONTH1',
                 'INSURE_ID2', 'INSURE_MONTH2', 'DEPENDENTS', 'BIRTH_Y', 'BIRTH_MONTH', 'SRN', 'ROC', 'INC_OLD',
                 'NHICLASS', 'WHOPAY', 'DEPEND', 'SI1CLASS', 'SI1MONTH', 'SI2CLASS', 'SI2MONTH',
                 'WKCLASS', 'WORK', 'WORKPLACE', 'EQUIV']
    )
    return df


preprocessed_df = preprocess(all_df)

In [6]:
class DataDataset(Dataset):
    def __init__(self, condition_list, salary_list):
        self.condition_list = condition_list
        self.salary_list = salary_list

    def __len__(self):
        return len(self.condition_list)

    def __getitem__(self, index):
        return self.condition_list[index], self.salary_list[index]

In [7]:
class LinearRegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(82, 512),
            nn.ReLU(),
            nn.Linear(512, 4096),
            nn.ReLU(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, 1),
        )

    def forward(self, x):
        return self.linear(x)

In [8]:
train, val = train_test_split(preprocessed_df, train_size=0.8)
train_condition_list = []
train_salary_list = []
val_condition_list = []
val_salary_list = []

for id, row in tqdm(train.iterrows()):
    train_salary_list.append(torch.from_numpy(np.array([row['ITM40'] / 100000])).float())
    train_condition_list.append(torch.from_numpy(row.drop(index=['ITM40']).values).float())

for id, row in tqdm(val.iterrows()):
    val_salary_list.append(torch.from_numpy(np.array([row['ITM40'] / 100000])).float())
    val_condition_list.append(torch.from_numpy(row.drop(index=['ITM40']).values).float())

train_dataset = DataDataset(train_condition_list, train_salary_list)
val_dataset = DataDataset(val_condition_list, val_salary_list)

train_dataloader = DataLoader(train_dataset, batch_size=parameters['batch_size'])
val_dataloader = DataLoader(val_dataset, batch_size=parameters['batch_size'])

232711it [01:01, 3795.43it/s]
58178it [00:15, 3647.49it/s]


In [9]:
def train(model, data_loader, device, loss_fct, optimizer):
    step_count = train_loss = 0

    for batch_idx, (inputs, labels) in enumerate(data_loader):
        model.train()
        optimizer.zero_grad()
        inputs = inputs.to(device)
        labels = labels.to(device)
        logits = model(inputs)
        loss = loss_fct(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        step_count += 1

    train_loss = train_loss / step_count

    return train_loss

In [10]:
def evaluate(model, data_loader, device, loss_fct):
    step_count = val_loss = 0
    model.eval()
    with torch.no_grad():
        for batch_idx, (inputs, labels) in enumerate(data_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            loss = loss_fct(logits, labels)
            val_loss += loss.item()
            step_count += 1

        val_loss = val_loss / step_count

    return val_loss

In [11]:
def save_checkpoint(save_path, model):
    if save_path is None:
        return
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to ==> {save_path}')

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LinearRegressionModel().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=parameters['learning_rate'], momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(), lr=parameters['learning_rate'], betas=(0.9, 0.999), eps=1e-9)
loss_fct = torch.nn.MSELoss()

In [None]:
bestValidLoss = 100
for epoch in range(parameters['epochs']):
    st_time = time.time()
    train_loss = train(model, train_dataloader, device, loss_fct, optimizer)
    val_loss = evaluate(model, val_dataloader, device, loss_fct)

    if val_loss + train_loss < bestValidLoss:
        bestValidLoss = val_loss + train_loss
        save_checkpoint(f'../static/models/CasesModel-Epoch{epoch:03d}.pt', model)

    print('[epoch %d] cost time: %.4f s' % (epoch + 1, time.time() - st_time))
    print('         loss')
    print(f'train | {train_loss: .4f}')
    print(f'val   | {val_loss: .4f}\n')
    if parameters["wandb"]:
        wandb.log({"train_loss": train_loss, "val_loss":val_loss})

Model saved to ==> ../static/models/CasesModel-Epoch000.pt
[epoch 1] cost time: 12.2342 s
         loss
train |  20.2842
val   |  13.9041

Model saved to ==> ../static/models/CasesModel-Epoch001.pt
[epoch 2] cost time: 9.6698 s
         loss
train |  15.9133
val   |  13.4039

Model saved to ==> ../static/models/CasesModel-Epoch002.pt
[epoch 3] cost time: 9.8335 s
         loss
train |  15.5317
val   |  13.1439

Model saved to ==> ../static/models/CasesModel-Epoch003.pt
[epoch 4] cost time: 9.8259 s
         loss
train |  15.3186
val   |  13.0202

Model saved to ==> ../static/models/CasesModel-Epoch004.pt
[epoch 5] cost time: 10.2607 s
         loss
train |  15.1766
val   |  12.9640

Model saved to ==> ../static/models/CasesModel-Epoch005.pt
[epoch 6] cost time: 9.7763 s
         loss
train |  15.0657
val   |  12.9217

Model saved to ==> ../static/models/CasesModel-Epoch006.pt
[epoch 7] cost time: 10.0005 s
         loss
train |  14.9677
val   |  12.8695

Model saved to ==> ../static/mo