In [20]:

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
from tensorflow.keras import optimizers

import pandas as pd
import numpy as np


In [21]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [22]:
def get_categorical(df):

    categorical = []
    names = list(df.columns)
    for name in names:
        if df.dtypes[name] == 'object':
            categorical.append(name)
    return categorical


def transform_nomm(df,nomm):
    obj_df = df
    oe_style = OneHotEncoder()
    for col in nomm:
        oe_results = oe_style.fit_transform(df[[col]])
        obj_df = dff.join(pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_))
    return obj_df


def get_numerical(df):

    numerical = []
    names = list(df.columns)
    for name in names:
        if df.dtypes[name] != 'object':
            numerical.append(name)
    return numerical


def transform_ordd(dff,ordd):
    
    ord_enc = OrdinalEncoder()
    for col in ordd:
        dff[col] = ord_enc.fit_transform(dff[[col]])
    return 0



In [23]:
nomm = ['marital-status','occupation','relationship','workclass','race','gender','native-country']
ordd = ['education']

dff = pd.get_dummies(df, columns=nomm)
tt = transform_ordd(dff,ordd)
cols = []
for i in list(dff.columns):
    if  i != 'income':
        cols.append(i)

dff = shuffle(dff)
dff['income'] = dff['income'].map({'<=50K':0,'>50K':1}).astype(int)

dff.head()

Unnamed: 0,age,fnlwgt,education,educational-num,capital-gain,capital-loss,hours-per-week,income,marital-status_Divorced,marital-status_Married-AF-spouse,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
10376,38,104727,12.0,14,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
45334,54,123011,15.0,10,0,0,40,1,0,0,...,0,0,0,0,0,0,0,1,0,0
34183,67,63552,5.0,4,0,0,35,0,0,0,...,0,0,0,0,0,0,0,1,0,0
22095,34,265807,15.0,10,0,0,45,0,0,0,...,0,0,0,0,0,0,0,1,0,0
31408,51,338836,15.0,10,0,0,40,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [24]:
df_1 = dff.iloc[:int(len(dff)*0.2),:]
df_2 = dff.iloc[int(len(dff)*0.2)+1:,:]
print("Shape of new dataframes - {} , {}".format(df_1.shape, df_2.shape))

Shape of new dataframes - (9768, 94) , (39073, 94)


In [37]:

df_2.to_csv('train.csv')
df_1.to_csv('test.csv')
dff.to_csv('adult_processed.csv')


len(df_2), len(df_1)

(39073, 9768)

## Pysyft + Torch Federated Compatabile Data Preporeccsing


In [26]:
from syft.frameworks.torch.fl import dataloader,FederatedDataLoader, FederatedDataset

import torch
import syft as sy  

In [38]:
class FeatureDataset(Dataset):
    
    def __init__(self,file_name):
        
        file_out = pd.read_csv(file_name)
        cols = []
        for i in list(file_out.columns):
            if  i != 'income':
                cols.append(i)
        
        x = file_out[cols].copy().values
        y = file_out[['income']].copy().values
        
        sc = StandardScaler()
        x_train = sc.fit_transform(x)
        y_train = y
        
        self.X_train = torch.tensor(x_train, dtype= torch.float32)
        self.Y_train = torch.tensor(y_train)
        
#         self.Y_train.type(torch.LongTensor)    

    def __len__(self):
        return len(self.Y_train)
    
    def __getitem__(self, idx):
        return self.X_train[idx],self.Y_train[idx]
    
    

In [40]:
feature_set1 = FeatureDataset('train.csv')
feature_set2 = FeatureDataset('test.csv')

len(feature_set1),len(feature_set2)

(39073, 9768)

In [29]:
hook = sy.TorchHook(torch)  # attach the pytorch hook
joe = sy.VirtualWorker(hook, id="taj")  #  remote worker joe
jane = sy.VirtualWorker(hook, id="ammar")  #  remote worker  jane




In [44]:
federated_train_loader = sy.FederatedDataLoader(feature_set1.federate((joe, jane)), batch_size=64, shuffle=True) # the federate() method splits the data within the workers
test_loader = torch.utils.data.DataLoader(feature_set2, batch_size=64, shuffle=True)

In [53]:
print('Note that patch size is 64, so 611*64 = 39104 which is the training data size, same applies to test data.')

print(f'\n{federated_train_loader}\n train patch size: {len(federated_train_loader)}\n\n{test_loader}\n test patch size: {len(test_loader)}')


Note that patch size is 64, so 611*64 = 39104 which is the training data size, same applies to test data.

<syft.frameworks.torch.fl.dataloader.FederatedDataLoader object at 0x7fa465f40400>
 train patch size: 611

<torch.utils.data.dataloader.DataLoader object at 0x7fa465f40390>
 test patch size: 153


In [54]:
cols

['age',
 'fnlwgt',
 'education',
 'educational-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'marital-status_Divorced',
 'marital-status_Married-AF-spouse',
 'marital-status_Married-civ-spouse',
 'marital-status_Married-spouse-absent',
 'marital-status_Never-married',
 'marital-status_Separated',
 'marital-status_Widowed',
 'occupation_?',
 'occupation_Adm-clerical',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Handlers-cleaners',
 'occupation_Machine-op-inspct',
 'occupation_Other-service',
 'occupation_Priv-house-serv',
 'occupation_Prof-specialty',
 'occupation_Protective-serv',
 'occupation_Sales',
 'occupation_Tech-support',
 'occupation_Transport-moving',
 'relationship_Husband',
 'relationship_Not-in-family',
 'relationship_Other-relative',
 'relationship_Own-child',
 'relationship_Unmarried',
 'relationship_Wife',
 'workclass_?',
 'workclass_Federal-gov',
 'workclass_Local-gov',
