# TSTR (Train on Synthetic, Test on Real): CTGAN, Dataset D

In [1]:
#import libraries
import warnings
#silence every warning for the rest of the session; the filter is installed before
#numpy/pandas are imported, so warnings raised while importing them are hidden as well
#NOTE(review): this also hides deprecation warnings raised by later cells - confirm that is intended
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define the directory holding the evaluation functions and remember the current directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#temporarily switch to the functions directory so that `utility_evaluation` can be imported
#(presumably relies on the working directory being on the import path, as in a notebook kernel)
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the utility-evaluation helpers used by the rest of the notebook
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always go back to the original directory, even when the import fails, so the
    #relative data/result paths used by later cells keep resolving
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#read the synthetic (CTGAN) dataset that is used as training data
synthetic_csv_path = HOME_PATH + 'SYNTHETIC DATASETS/CTGAN/D_ContraceptiveMethod_Synthetic_CTGAN.csv'
categorical_columns = [
    'wife_education',
    'husband_education',
    'wife_religion',
    'wife_working',
    'husband_occupation',
    'standard_of_living_index',
    'media_exposure',
    'contraceptive_method_used',
]
#cast all categorical columns (the target included) to the category dtype in a single call
train_data = pd.read_csv(synthetic_csv_path).astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,wife_age,wife_education,husband_education,children,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,21,2,4,3,1,1,3,3,0,3
1,19,3,3,9,1,1,1,4,1,1
2,14,4,4,2,1,1,1,2,0,3
3,26,4,4,1,1,1,1,4,0,3
4,30,3,4,0,1,1,2,2,0,2
...,...,...,...,...,...,...,...,...,...,...
1173,20,1,3,3,0,1,3,2,0,1
1174,16,1,4,4,1,0,3,2,0,3
1175,40,4,2,1,0,1,1,4,0,3
1176,34,3,4,3,1,1,3,2,0,3


In [4]:
#read the real test dataset and give it the same categorical dtypes as the training data
real_test_csv_path = HOME_PATH + 'REAL DATASETS/TEST DATASETS/D_ContraceptiveMethod_Real_Test.csv'
test_data = pd.read_csv(real_test_csv_path).astype({name: 'category' for name in categorical_columns})
test_data

Unnamed: 0,wife_age,wife_education,husband_education,children,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,29,3,3,4,1,1,3,4,0,3
1,35,3,4,5,1,0,2,2,0,1
2,23,4,4,1,0,1,1,4,0,2
3,30,2,3,3,1,1,3,1,0,1
4,22,2,3,1,1,1,3,3,0,1
...,...,...,...,...,...,...,...,...,...,...
290,38,4,4,3,1,0,1,4,0,2
291,25,2,2,1,1,1,3,3,0,3
292,25,3,1,4,1,1,3,1,0,2
293,24,4,3,3,1,1,3,3,0,3


In [5]:
target = 'contraceptive_method_used'
#quick look at the shape and the per-class row counts of each split,
#with a divider line printed between the two reports
split_reports = [('Train data', train_data), ('Test data', test_data)]
for position, (title, frame) in enumerate(split_reports):
    if position > 0:
        print('#####################################')
    print(title)
    print(frame.shape)
    print(frame.groupby(target).size())

Train data
(1178, 10)
contraceptive_method_used
1    468
2    347
3    363
dtype: int64
#####################################
Test data
(295, 10)
contraceptive_method_used
1    130
2     71
3     94
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'contraceptive_method_used'

#categorical FEATURES (target excluded) mapped to the category values handed to DataPreProcessor.
#They live under their own name so that the earlier `categorical_columns` list (which includes
#the target and is reused when loading the test data) is not overwritten by this cell.
#NOTE(review): the rows displayed when the datasets are loaded contain the value 4 for
#wife_education, husband_education and standard_of_living_index, which is outside the
#[0, 1, 2, 3] declared here - confirm how DataPreProcessor treats values that are not listed.
feature_categories = {
    'wife_education': np.array([0, 1, 2, 3]),
    'husband_education': np.array([0, 1, 2, 3]),
    'wife_religion': np.array([0, 1]),
    'wife_working': np.array([0, 1]),
    'husband_occupation': np.array([0, 1, 2, 3]),
    'standard_of_living_index': np.array([0, 1, 2, 3]),
    'media_exposure': np.array([0, 1]),
}
feature_categorical_columns = list(feature_categories)
categories = list(feature_categories.values())

#separate the features from the target first, so the numeric-column detection below
#can never pick up the target, whatever dtype it currently has
train_features = train_data.drop(columns=[target])
numerical_columns = train_features.select_dtypes(include=['int64','float64']).columns.tolist()

data_preprocessor = DataPreProcessor(feature_categorical_columns, numerical_columns, categories)
x_train = data_preprocessor.preprocess_train_data(train_features)
y_train = train_data[target]

x_train.shape, y_train.shape

((1178, 24), (1178,))

## 3. Pre-process test data

In [7]:
#run the test features through the same pre-processor object that handled the training data
test_features = test_data.drop(columns=[target])
x_test = data_preprocessor.preprocess_test_data(test_features)
#the target column is kept aside as the ground truth for evaluation
y_test = test_data[target]
x_test.shape, y_test.shape

((295, 24), (295,))

## 4. Create a dataset to save the results

In [8]:
#empty table that collects one row of metrics per evaluated model
metric_columns = ['model', 'accuracy', 'precision', 'recall', 'f1']
results = pd.DataFrame(columns=metric_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
#run the 'RF' model through train_evaluate_model (synthetic training split, real test split)
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the rows
#(assumes train_evaluate_model returns a DataFrame, as its displayed output suggests)
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.4136,0.4118,0.4136,0.412


## 5. Train and evaluate KNeighbors Classifier

In [10]:
#run the 'KNN' model through train_evaluate_model (synthetic training split, real test split)
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the rows
#(assumes train_evaluate_model returns a DataFrame, as its displayed output suggests)
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.3729,0.3597,0.3729,0.3595


## 6. Train and evaluate Decision Tree Classifier

In [11]:
#run the 'DT' model through train_evaluate_model (synthetic training split, real test split)
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the rows
#(assumes train_evaluate_model returns a DataFrame, as its displayed output suggests)
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.3424,0.3542,0.3424,0.3464


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
#run the 'SVM' model through train_evaluate_model (synthetic training split, real test split)
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the rows
#(assumes train_evaluate_model returns a DataFrame, as its displayed output suggests)
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.3864,0.3751,0.3864,0.3779


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
#run the 'MLP' model through train_evaluate_model (synthetic training split, real test split)
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the rows
#(assumes train_evaluate_model returns a DataFrame, as its displayed output suggests)
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 1.10912507
Iteration 2, loss = 1.08131333
Iteration 3, loss = 1.07191198
Iteration 4, loss = 1.06171623
Iteration 5, loss = 1.05322127
Iteration 6, loss = 1.04426663
Iteration 7, loss = 1.03801883
Iteration 8, loss = 1.03150799
Iteration 9, loss = 1.02430214
Iteration 10, loss = 1.01848418
Iteration 11, loss = 1.01253083
Iteration 12, loss = 1.00425890
Iteration 13, loss = 0.99679328
Iteration 14, loss = 0.98982118
Iteration 15, loss = 0.98213786
Iteration 16, loss = 0.97426146
Iteration 17, loss = 0.96488959
Iteration 18, loss = 0.95887326
Iteration 19, loss = 0.94895504
Iteration 20, loss = 0.93984391
Iteration 21, loss = 0.93139084
Iteration 22, loss = 0.92212424
Iteration 23, loss = 0.91162346
Iteration 24, loss = 0.90590098
Iteration 25, loss = 0.89523965
Iteration 26, loss = 0.88428192
Iteration 27, loss = 0.87696075
Iteration 28, loss = 0.86591739
Iteration 29, loss = 0.85538267
Iteration 30, loss = 0.84223706
Iteration 31, loss = 0.83845381
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.3932,0.3994,0.3932,0.3953


## 9. Save results file

In [14]:
#persist the collected metrics; the output folder is created first so the save cannot
#fail (and lose the results of the whole run) when RESULTS/ does not exist yet
results_path = 'RESULTS/models_results_ctgan.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results.to_csv(results_path, index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.4136,0.4118,0.4136,0.412
1,KNN,0.3729,0.3597,0.3729,0.3595
2,DT,0.3424,0.3542,0.3424,0.3464
3,SVM,0.3864,0.3751,0.3864,0.3779
4,MLP,0.3932,0.3994,0.3932,0.3953
