# Installing necessary libraries

In [1]:
# Optional setup step, left disabled: uncomment to upgrade TensorFlow from inside the notebook.
#!pip install --upgrade tensorflow

# Importing libraries

In [1]:
import os
import pandas as pd
import numpy as np
import keras
from tqdm.notebook import tqdm
from time import sleep

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Loading data

In [2]:
# Location of the .dta input file, relative to the notebook's working directory.
# Built with os.path.join so the separators are right on every OS; the previous
# raw string hard-coded backslashes and therefore only resolved on Windows.
file_path_1 = os.path.join("..", "Extracted_files", "HHV1_PERV1_merged.dta")

In [3]:
# Load the Stata (.dta) file at file_path_1 into a DataFrame.
df = pd.read_stata(file_path_1)

In [5]:
# Report the dimensions of the loaded frame, then preview its first rows.
# NOTE(review): the label says "HHs" but the count is the number of rows in df - confirm the row unit.
n_rows, n_cols = df.shape
print(f"Number of HHs surveyed = {n_rows}. \nNumber of variables = {n_cols}")
display(df.head())

Number of HHs surveyed = 418159. 
Number of variables = 181


Unnamed: 0,hhvar1,hhvar2,hhvar3,hhvar4,hhvar5,hhvar6,hhvar7,hhvar8,hhvar9,hhvar10,...,pvar134,pvar135,pvar136,pvar137,pvar138,pvar139,perid,_merge,weights,usual_status_code
0,FVH7,104,Q1,V1,1,2,4,21,1,14,...,0,0,2,4,246798,4,Q1V1110002110101,both,308,94.0
1,FVH7,104,Q1,V1,1,2,4,21,1,14,...,0,0,2,4,246798,4,Q1V1110002110102,both,308,93.0
2,FVH7,104,Q1,V1,1,2,4,21,1,14,...,0,6500,2,4,246798,4,Q1V1110002110103,both,308,11.0
3,FVH7,104,Q1,V1,1,2,4,21,1,14,...,0,0,2,4,246798,4,Q1V1110002110104,both,308,21.0
4,FVH7,104,Q1,V1,1,2,4,21,1,14,...,0,0,2,4,246798,4,Q1V1110002110105,both,308,91.0


# Documentation

What determines status of employment?

# Changing formats

In [6]:
#Columns that are parsed as numbers
numeric_columns = [
    'hhvar21', 'hhvar30',
    'pvar20', 'pvar24', 'pvar29',
]

#Columns that are stored with the pandas 'category' dtype
categorical_columns = [
    'hhvar5', 'hhvar6', 'hhvar7', 'hhvar23', 'hhvar24',
    'pvar18', 'pvar19', 'pvar21', 'pvar22', 'pvar23',
    'pvar26', 'pvar28', 'pvar30',
]

#Convert each group in place on df
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
df[categorical_columns] = df[categorical_columns].astype('category')

# Adding columns

In [7]:
#Binary target: 1 when usual_status_code is at most 51, otherwise 0.
#Rows with a missing status code compare False and therefore get 0.
EMPLOYED_MAX_CODE = 51
is_employed = df['usual_status_code'] <= EMPLOYED_MAX_CODE
df["dummy_employed"] = np.where(is_employed, 1, 0)

  df["dummy_employed"] = np.where(df['usual_status_code']<=51,1,0)


# Filtering

In [8]:
#Keep only the model inputs plus the target column
filtered_columns = [*numeric_columns, *categorical_columns, "dummy_employed"]
df_filtered = df[filtered_columns]

#Keep only rows whose age (pvar20) lies in the closed interval [12, 59]
age_in_range = df_filtered["pvar20"].between(12, 59)
df_filtered = df_filtered.loc[age_in_range]

In [9]:
# Show the (rows, columns) count after filtering and preview the first rows.
print(df_filtered.shape)
df_filtered.head()


(296501, 19)


Unnamed: 0,hhvar21,hhvar30,pvar20,pvar24,pvar29,hhvar5,hhvar6,hhvar7,hhvar23,hhvar24,pvar18,pvar19,pvar21,pvar22,pvar23,pvar26,pvar28,pvar30,dummy_employed
2,7,8733,40,10,,1,2,4,1,2,3,male,2,8,1,2,,,1
3,7,8733,35,10,,1,2,4,1,2,4,female,2,8,1,2,,,1
4,7,8733,15,9,,1,2,4,1,2,6,male,1,7,1,6,,,0
5,7,8733,12,6,,1,2,4,1,2,6,male,1,6,1,6,,,0
6,7,8733,16,12,,1,2,4,1,2,6,female,1,10,1,2,,,1


# Treating missing values

Three columns had missing values:
1. pvar28: Field of training
2. pvar29: Duration of training
3. pvar30: Type of training

In [10]:
#Values that count as "missing" when scanning a column
empty_list = ['', ' ', np.nan]


def count_missing_values(frame, empty_values):
    """Count placeholder/missing entries per column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are scanned.
    empty_values : list
        Values that are treated as missing (matched with ``Series.isin``).

    Returns
    -------
    list of dict
        One ``{column: count}`` entry for every column that holds at least
        one of ``empty_values``; columns without any are omitted.
    """
    counts = []
    for col in frame.columns:
        empty_count = frame[col].isin(empty_values).sum()
        if empty_count > 0:
            #A plain literal: the name `dict` is no longer rebound at notebook scope
            counts.append({col: empty_count})
    return counts


#Checking for missing values before substitution
missing_values = count_missing_values(df_filtered, empty_list)
print("Missing values count before substitution = {}".format(missing_values))

#Treating missing values
#pvar28/pvar30 are categorical, so the empty-string category is renamed instead of
#calling replace(), whose category-changing behaviour on categoricals is deprecated.
for col in ['pvar28', 'pvar30']:
    df_filtered[col] = df_filtered[col].cat.rename_categories({'': 'no training'})
df_filtered['pvar29'] = df_filtered['pvar29'].fillna(0)

#Checking for missing values after substitution
missing_values = count_missing_values(df_filtered, empty_list)
print("Missing values count after substitution = {}".format(missing_values))

Missing values count before substitution = [{'pvar29': np.int64(283470)}, {'pvar28': np.int64(283470)}, {'pvar30': np.int64(283470)}]
Missing values count before substitution = []


  df_filtered[['pvar28', 'pvar30']] = df_filtered[['pvar28', 'pvar30']].replace('', 'no training')


In [11]:
# Preview the frame after the missing-value substitutions.
df_filtered.head()

Unnamed: 0,hhvar21,hhvar30,pvar20,pvar24,pvar29,hhvar5,hhvar6,hhvar7,hhvar23,hhvar24,pvar18,pvar19,pvar21,pvar22,pvar23,pvar26,pvar28,pvar30,dummy_employed
2,7,8733,40,10,0.0,1,2,4,1,2,3,male,2,8,1,2,no training,no training,1
3,7,8733,35,10,0.0,1,2,4,1,2,4,female,2,8,1,2,no training,no training,1
4,7,8733,15,9,0.0,1,2,4,1,2,6,male,1,7,1,6,no training,no training,0
5,7,8733,12,6,0.0,1,2,4,1,2,6,male,1,6,1,6,no training,no training,0
6,7,8733,16,12,0.0,1,2,4,1,2,6,female,1,10,1,2,no training,no training,1


# Encoding categorical columns

In [12]:
# LabelEncoder instance for turning categorical values into integer codes.
label_encoder = LabelEncoder()

In [13]:
#Fit a separate encoder per column and keep it, so every integer code can later be
#mapped back with label_encoders[col].inverse_transform(...) or inspected through
#label_encoders[col].classes_. Re-fitting one shared encoder discarded each column's
#mapping as soon as the next column was fitted.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    df_filtered[col] = label_encoder.fit_transform(df_filtered[col])
    label_encoders[col] = label_encoder

# Normalizing numerical values

In [14]:
#Min-max scale the numeric columns, keeping the original row index
#NOTE(review): the scaler is fitted on all rows before the train/test split below.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_filtered[numeric_columns])
temp_scaled = pd.DataFrame(scaled_values, columns=numeric_columns, index=df_filtered.index)

#Re-attach the encoded categorical columns and the target by row index
df_scaled = pd.merge(
    temp_scaled,
    df_filtered[categorical_columns + ['dummy_employed']],
    how='outer',
    left_index=True,
    right_index=True,
)

In [15]:
# Preview the scaled numeric columns alongside the encoded categoricals and the target.
df_scaled.head()

Unnamed: 0,hhvar21,hhvar30,pvar20,pvar24,pvar29,hhvar5,hhvar6,hhvar7,hhvar23,hhvar24,pvar18,pvar19,pvar21,pvar22,pvar23,pvar26,pvar28,pvar30,dummy_employed
2,0.24,0.053664,0.595745,0.4,0.0,0,1,3,0,1,2,1,1,7,0,1,22,3,1
3,0.24,0.053664,0.489362,0.4,0.0,0,1,3,0,1,3,0,1,7,0,1,22,3,1
4,0.24,0.053664,0.06383,0.36,0.0,0,1,3,0,1,5,1,0,6,0,5,22,3,0
5,0.24,0.053664,0.0,0.24,0.0,0,1,3,0,1,5,1,0,5,0,5,22,3,0
6,0.24,0.053664,0.085106,0.48,0.0,0,1,3,0,1,5,0,0,8,0,1,22,3,1


# Splitting the data

In [16]:
#Predictor variables: numeric columns first, then the encoded categorical ones
feature_columns = numeric_columns + categorical_columns
X = df_scaled[feature_columns]

#Target variable, expanded to one indicator column per class (one-hot),
#which is the layout the softmax / categorical_crossentropy models below are fed
y = pd.get_dummies(df_scaled['dummy_employed'])

In [31]:
#Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Training

In [49]:
def nn_model (x_training,y_training,num_hidden_layers = 3,units = 15,epochs = 2,verbose = 1):
    """Build, compile and fit a fully connected softmax classifier.

    Parameters
    ----------
    x_training : 2-D table of features (DataFrame or array)
        One row per sample; the number of columns sets the input width.
    y_training : 2-D table of one-hot targets
        One column per class; the number of columns sets the output width.
    num_hidden_layers : int, default 3
        Number of hidden ReLU layers, named ``layer_0`` .. ``layer_{n-1}``.
    units : int, default 15
        Width of every hidden layer.
    epochs : int, default 2
        Number of training epochs passed to ``fit``.
    verbose : int, default 1
        Verbosity level passed to ``fit``.

    Returns
    -------
    The fitted Sequential model.
    """

    #Instantiating a sequential NN
    model = Sequential()

    #Setting up input layer (shape[1] works for DataFrames and arrays alike)
    model.add(Input(shape = (x_training.shape[1],)))

    #Setting up hidden layers
    for i in range(num_hidden_layers):
        model.add(Dense(units,activation = 'relu',name = f'layer_{i}'))

    #Setting up the output layer: one softmax unit per target column
    model.add(Dense(y_training.shape[1],activation='softmax'))

    #Compiling the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    #Fitting the model
    model.fit(x_training,y_training, epochs = epochs,verbose = verbose)

    return model

In [45]:
import tensorflow as tf

#Seed NumPy's and TensorFlow's global random generators with one shared value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [51]:
# Train a network on the training split using nn_model's default settings.
a = nn_model(X_train,y_train)

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.7683 - loss: 0.4752
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.8261 - loss: 0.3886


In [47]:
# Call fit again on the existing model `a` for 2 more epochs; no validation data is passed.
a.fit(X_train,y_train,
          #validation_data=(X_test,y_test), 
          epochs=2,verbose=1)

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.7697 - loss: 0.4721
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8304 - loss: 0.3729


<keras.src.callbacks.history.History at 0x1f30b019940>

In [48]:
# Same call repeated: another 2 epochs on the existing model `a`, again without validation data.
a.fit(X_train,y_train,
          #validation_data=(X_test,y_test), 
          epochs=2,verbose=1)

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.8404 - loss: 0.3530
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 4ms/step - accuracy: 0.8437 - loss: 0.3460


<keras.src.callbacks.history.History at 0x1f30b035d30>

In [32]:
#Hand-built network: input -> 10 -> 15 -> 10 ReLU units -> softmax output
n_features = len(X.columns)
n_classes = y.shape[1]

model = Sequential([
    Input(shape = (n_features,)),
    Dense(10, activation='relu', name = "layer_1"),
    Dense(15, activation='relu', name = "layer_2"),
    Dense(10, activation='relu', name = "layer_3"),
    Dense(n_classes, activation='softmax'),
])

#Compiling the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Fit the hand-built `model` on the training split for 2 epochs; no validation data is passed.
model.fit(X_train,y_train,
          #validation_data=(X_test,y_test), 
          epochs=2,verbose=1)

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step - accuracy: 0.8346 - loss: 0.3665
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.8387 - loss: 0.3563


<keras.src.callbacks.history.History at 0x1f2fffc9c70>

In [56]:
# evaluate returns the loss followed by the compiled metrics; index 1 is the accuracy on the test split.
benchmark_accuracy = a.evaluate(X_test,y_test)[1]

[1m1854/1854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8359 - loss: 0.3646


In [57]:
# Display the test-split accuracy of model `a`.
benchmark_accuracy

0.83651202917099

# Feature importance

In [58]:
def feature_importance (x_training,x_testing,y_training,y_testing,model = nn_model,random_state = None):
    """Score each feature by the accuracy lost when it is shuffled before training.

    A benchmark model is trained on the untouched training data. Then, for every
    column, a fresh model is trained on a copy of the training data in which only
    that column has been randomly permuted. Both are evaluated on the unmodified
    test data. Because every candidate is retrained from scratch, each score also
    carries run-to-run training variance, so small or negative values can occur.

    Parameters
    ----------
    x_training, x_testing : pandas.DataFrame
        Feature tables for training and evaluation (same columns).
    y_training, y_testing
        Targets matching the feature tables, in whatever form ``model`` expects.
    model : callable, default nn_model
        Called as ``model(x, y)``; must return a fitted object whose
        ``evaluate(x, y)`` result has the accuracy at index 1.
    random_state : int or None, default None
        Seed for the column permutations. ``None`` keeps using NumPy's global
        random state, exactly as before; an integer makes the shuffles repeatable
        independently of that global state.

    Returns
    -------
    dict
        ``{column: benchmark_accuracy - accuracy_with_column_shuffled}``.
    """

    #Running the benchmark model
    benchmark_model = model(x_training,y_training)
    benchmark_accuracy = benchmark_model.evaluate(x_testing,y_testing)[1]

    #Dedicated generator only when a seed is requested
    rng = None if random_state is None else np.random.default_rng(random_state)

    #Initializing empty dictionary for storing importance scores
    importance_dict = {}

    #Iterating through columns; tqdm advances the progress bar once per column
    for col in tqdm(x_training.columns):
        print(f"\n \n=====Computing for column {col}=====\n")

        x_training_copy = x_training.copy()

        #Shuffle a plain ndarray so the values are assigned by position,
        #never re-aligned on the index
        column_values = x_training_copy[col].to_numpy()
        if rng is None:
            shuffled_values = np.random.permutation(column_values)
        else:
            shuffled_values = rng.permutation(column_values)
        x_training_copy.loc[:,col] = shuffled_values

        #Training the model after shuffling
        new_model = model(x_training_copy,y_training)

        #Extracting accuracy score
        new_accuracy = new_model.evaluate(x_testing,y_testing)[1]
        importance_score = benchmark_accuracy - new_accuracy

        #Storing accuracy score
        importance_dict[col] = importance_score

        #Print progress
        print(f"Shuffling of column {col} complete. Drop in accuracy = {importance_score}")

    return importance_dict


In [59]:
# Run the importance analysis with the default nn_model trainer; the cell displays the returned dict.
feature_importance(x_training=X_train, y_training=y_train,x_testing=X_test,y_testing=y_test)

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.7855 - loss: 0.4691
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.8301 - loss: 0.3758
[1m1854/1854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8397 - loss: 0.3568


  0%|          | 0/18 [00:00<?, ?it/s]


 
=====Computing for column hhvar21=====

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.7854 - loss: 0.4529
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8343 - loss: 0.3672
[1m1854/1854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8428 - loss: 0.3524
Shuffling of column hhvar21 complete. Drop in accuracy = -0.0029172897338867188

 
=====Computing for column hhvar30=====

Epoch 1/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.7727 - loss: 0.4879
Epoch 2/2
[1m7413/7413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.8256 - loss: 0.3918
[1m1854/1854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8338 - loss: 0.3716
Shuffling of column hhvar30 complete. Drop in accuracy = 0.00622248649597168

 
=====Computing for column pvar20=====

Epoch

{'hhvar21': -0.0029172897338867188,
 'hhvar30': 0.00622248649597168,
 'pvar20': 0.01180422306060791,
 'pvar24': 0.002225935459136963,
 'pvar29': 0.0048059821128845215,
 'hhvar5': 0.006003260612487793,
 'hhvar6': -0.0011972784996032715,
 'hhvar7': 0.004873454570770264,
 'hhvar23': -0.0025125741958618164,
 'hhvar24': 0.009375929832458496,
 'pvar18': -0.000455319881439209,
 'pvar19': 0.05119645595550537,
 'pvar21': 0.0019392967224121094,
 'pvar22': 0.003541290760040283,
 'pvar23': 0.001045525074005127,
 'pvar26': 0.026424527168273926,
 'pvar28': 0.0023776888847351074,
 'pvar30': 0.00409775972366333}