First, we preprocess the data: the string-valued columns in the original data are replaced with integer codes so that they are easier to process.

In [None]:
from tkinter.messagebox import RETRY
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import style
import seaborn as sns
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
import graphviz as gr
from linearmodels.iv import IV2SLS
# Show at most 5 columns when pandas prints a DataFrame.
pd.set_option("display.max_columns", 5)
# Use matplotlib's "fivethirtyeight" style sheet for plots.
style.use("fivethirtyeight")
# Prefix prepended to the data file paths built in get_dataset().
relative_path = ".."

def get_dataset():
    """Load the income training CSV, integer-encode its categorical columns, and save it.

    Steps:
      1. Read ``train.csv`` and drop every row that has a missing value.
      2. Rename the hyphenated / symbol-bearing columns to identifier-style names.
      3. Replace each categorical column with a coarse integer code; categories
         that share a code are merged into one group.
      4. Write the encoded frame to ``modified_train.csv`` and return it.

    Columns encoded through a dictionary yield NaN for any category that is not
    listed in that dictionary (``Series.map`` behaviour).

    Returns:
        The encoded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(relative_path + "/data/income_data/train.csv").dropna(axis=0)
    frame = frame.rename(columns={
        'educational-num': 'educational_num',
        "income_>50K": "income_bigger_than_50K",
        'marital-status': 'marital_status',
        'native-country': 'native_country',
    })

    # Shorten the two long race labels so they match the keys used below.
    frame["race"] = (
        frame["race"]
        .replace("Amer-Indian-Eskimo", "Indian")
        .replace("Asian-Pac-Islander", "Asian")
    )

    # column name -> {category label: integer code}
    code_tables = {
        "occupation": {
            "Exec-managerial": 0,
            "Craft-repair": 1,
            "Protective-serv": 2,
            "Prof-specialty": 2,
            "Sales": 3,
            "Other-service": 4,
            "Transport-moving": 5,
            "Machine-op-inspct": 5,
            "Handlers-cleaners": 5,
            "Farming-fishing": 5,
            "Tech-support": 5,
            "Priv-house-serv": 5,
            "Armed-Forces": 5,
            "Adm-clerical": 6,
        },
        "race": {
            "White": 0,
            "Black": 1,
            "Asian": 2,
            "Indian": 2,
            "Other": 2,
        },
        "gender": {
            "Male": 0,
            "Female": 1,
        },
        "marital_status": {
            "Married-civ-spouse": 0,
            "Never-married": 1,
            "Divorced": 2,
            "Widowed": 2,
            "Separated": 2,
            "Married-spouse-absent": 2,
            "Married-AF-spouse": 2,
        },
        "workclass": {
            "Private": 0,
            "State-gov": 1,
            "Federal-gov": 1,
            "Local-gov": 1,
            "Self-emp-inc": 1,
            "Without-pay": 1,
            "Self-emp-not-inc": 2,
        },
        "education": {
            "Doctorate": 1,
            "Bachelors": 1,
            "Some-college": 1,
            "Masters": 1,
            "12th": 0,
            "7th-8th": 0,
            "HS-grad": 0,
            "9th": 0,
            "10th": 0,
            "11th": 0,
            "Preschool": 0,
            "5th-6th": 0,
            "Prof-school": 0,
            "Assoc-voc": 0,
            "Assoc-acdm": 0,
            "1st-4th": 0,
        },
    }
    for column, codes in code_tables.items():
        frame[column] = frame[column].map(codes)

    # Binary flag: 0 for the United States, 1 for every other value.
    frame["native_country"] = frame["native_country"].map(
        lambda country: 0 if country == "United-States" else 1
    )

    # "Husband" -> 0, "Not-in-family" -> 1, every other value -> 2.
    relationship_codes = {"Husband": 0, "Not-in-family": 1}
    frame["relationship"] = frame["relationship"].map(
        lambda rel: relationship_codes.get(rel, 2)
    )

    frame.to_csv(relative_path + "/data/income_data/modified_train.csv")
    return frame

Modify the data set

In [None]:
from torch.utils.data import Dataset

class imcomedataset(Dataset):
    """Torch dataset that serves (features, outcome, prob) triples.

    Args:
        train_data: Array-like of features; copied into a float32 tensor.
        train_outcome: Array-like of outcomes; copied into a float32 tensor.
        prob: Object exposing ``to_numpy()`` (e.g. a pandas Series or
            DataFrame); converted with ``torch.from_numpy`` so its dtype is
            kept unchanged.
    """

    def __init__(self, train_data, train_outcome, prob):
        features = torch.tensor(train_data, dtype=torch.float32)
        outcomes = torch.tensor(train_outcome, dtype=torch.float32)
        prob_values = prob.to_numpy()

        self.x_train = features
        self.y_train = outcomes
        self.prob = torch.from_numpy(prob_values)

    def __len__(self):
        """Return the number of samples, taken from the outcome tensor."""
        return len(self.y_train)

    def __getitem__(self, idx):
        """Return the ``(features, outcome, prob)`` entries stored at ``idx``."""
        sample = (self.x_train[idx], self.y_train[idx], self.prob[idx])
        return sample

Then we modify the training model.

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import Module

class MLP(nn.Module):
    """Multi-layer perceptron with two hidden ReLU layers and a sigmoid output.

    Args:
        input_dim: Number of features per sample after flattening.
        output_dim: Number of output units.
        hidden_dim: Width of both hidden layers. Defaults to 10, the value
            that used to be hard-coded.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=10):
        super().__init__()

        self.input_fc = nn.Linear(input_dim, hidden_dim)
        self.hidden_fc = nn.Linear(hidden_dim, hidden_dim)
        self.output_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened into one feature axis.

        Returns:
            Tuple ``(y_pred, h3)`` where ``h3`` is the raw output of the last
            linear layer, shape ``[batch size, output_dim]``, and ``y_pred``
            is ``sigmoid(h3)``.
        """
        batch_size = x.shape[0]

        # reshape (unlike view) also accepts non-contiguous inputs such as
        # transposed tensors.
        x = x.reshape(batch_size, -1)
        # x = [batch size, input_dim]

        h_1 = F.relu(self.input_fc(x))
        # h_1 = [batch size, hidden_dim]

        h_2 = F.relu(self.hidden_fc(h_1))
        # h_2 = [batch size, hidden_dim]

        h3 = self.output_fc(h_2)
        # h3 = [batch size, output_dim]

        y_pred = torch.sigmoid(h3)
        return y_pred, h3

Modify the Loss

In [None]:
from torch.nn import Module

class CustomLoss(Module):
    """Mean squared error between ``y`` and ``predict_y`` scaled element-wise by ``p``."""

    def __init__(self):
        super().__init__()

    def forward(self, predict_y, y, p):
        """Return ``mean((y - predict_y * p) ** 2)`` as a scalar tensor.

        Args:
            predict_y: Model predictions.
            y: Observed outcomes.
            p: Per-element factor multiplied into the predictions.
        """
        scaled_prediction = predict_y * p
        residual = y - scaled_prediction
        return residual.pow(2).mean()

First Stage

In [None]:
from sklearn.linear_model import LogisticRegression
from data_processor import get_xp
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def repeat_sample(times=1, possiblity=0.5):
    """Draw ``times`` independent 0/1 samples that are 1 with probability ``possiblity``.

    All uniform numbers are drawn in a single vectorised ``np.random.rand``
    call instead of one Python-level call per sample.

    Args:
        times: Number of samples to draw; zero or negative yields ``[]``.
        possiblity: Probability that an individual sample equals 1.

    Returns:
        A list of ``times`` Python ints, each 0 or 1.
    """
    # range() silently produced nothing for non-positive counts; keep that
    # contract, since np.random.rand rejects negative sizes.
    if times <= 0:
        return []

    uniform_draws = np.random.rand(times)
    return (uniform_draws < possiblity).astype(int).tolist()


def build(x, probilities, times=1, save=False):
    """Append sampled 0/1 treatment columns to ``x``.

    For every probability in ``probilities``, ``times`` samples are drawn with
    ``repeat_sample`` and stacked to the right of the values of ``x`` as
    columns ``p0`` ... ``p{times-1}``.

    Args:
        x: DataFrame of features (its ``to_numpy()`` and ``columns`` are used).
        probilities: Iterable with one probability per row of ``x``.
        times: Number of samples, and therefore new columns, per row.
        save: When true, also write the result to a CSV file.

    Returns:
        A new DataFrame holding the original columns followed by the samples.
    """
    sampled_rows = []
    for probability in probilities:
        sampled_rows.append(repeat_sample(times, probability))

    stacked = np.hstack((x.to_numpy(), sampled_rows))
    treatment_names = ["p" + str(k) for k in range(0, times)]
    new = pd.DataFrame(stacked, columns=list(x.columns) + treatment_names)

    if not save:
        return new

    new.to_csv("../data/income_data/new X.csv")
    print("Successful saved!")
    return new


def solve_stage_one(ml_method="LR", save=False, times=1000):
    """Run the first stage: fit a classifier and sample treatments from its probabilities.

    The chosen classifier is fitted on the data returned by ``get_xp()``, a
    classification report on that same data is printed, and the predicted
    probability of the second class (column 1 of ``predict_proba``) is used
    to sample ``times`` 0/1 treatments per row via ``build``.

    Args:
        ml_method: Case-insensitive classifier key, one of ``"LR"``
            (logistic regression), ``"ADA"`` (AdaBoost), ``"FOREST"``
            (random forest) or ``"MLP"`` (multi-layer perceptron).
        save: Forwarded to ``build``; when true the result is written to CSV.
        times: Number of samples drawn per row. Defaults to 1000, the value
            that used to be hard-coded.

    Returns:
        The DataFrame produced by ``build``: the features plus sampled columns.

    Raises:
        KeyError: If ``ml_method`` is not one of the supported keys.
    """
    # Map keys to classes, not instances, so only the requested estimator
    # is ever constructed.
    estimators = {
        "LR": LogisticRegression,
        "ADA": AdaBoostClassifier,
        "FOREST": RandomForestClassifier,
        "MLP": MLPClassifier,
    }

    # Validate before loading any data so a typo fails fast.
    key = ml_method.upper()
    if key not in estimators:
        raise KeyError(
            f"unknown ml_method {ml_method!r}; expected one of {sorted(estimators)}"
        )

    x, y = get_xp()

    clf = estimators[key]().fit(x, y)
    pred = clf.predict(x)
    probilities = clf.predict_proba(x)[:, 1]
    print(classification_report(y, pred))

    new_x = build(x, probilities, times, save)

    return new_x

# Run the first stage with the random-forest classifier, saving the sampled
# treatments to CSV, and print the resulting DataFrame.
print(solve_stage_one("FOREST", save=True))

Second Stage