# TSTR (Train on Synthetic, Test on Real) - GM Dataset D

In [1]:
#import libraries
import warnings
#ignore every warning from here on; the filter is installed before numpy/pandas are imported,
#so warnings emitted by those imports and by all later cells are hidden as well
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#temporarily switch to the functions directory so that utility_evaluation can be imported from it
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the helpers used to pre-process the data and to train/evaluate the classifiers
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always go back to the notebook directory, even when the import fails; otherwise a re-run of
    #this cell would record the functions directory as ACTUAL_DIR and later relative paths would break
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#columns treated as categorical; the target is listed too so that it is cast along with the features
categorical_columns = [
    'wife_education',
    'husband_education',
    'wife_religion',
    'wife_working',
    'husband_occupation',
    'standard_of_living_index',
    'media_exposure',
    'contraceptive_method_used',
]
#read the synthetic dataset (used as training data) and cast the categorical columns in a single step
train_data = pd.read_csv(
    HOME_PATH + 'SYNTHETIC DATASETS/GM/D_ContraceptiveMethod_Synthetic_GM.csv'
).astype(dict.fromkeys(categorical_columns, 'category'))
train_data

Unnamed: 0,wife_age,wife_education,husband_education,children,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,45,4,4,1,0,1,1,3,0,1
1,13,2,4,0,1,0,3,4,0,3
2,22,3,3,1,1,1,3,2,0,3
3,33,4,4,2,0,1,2,4,0,1
4,30,3,4,1,1,1,3,1,0,3
...,...,...,...,...,...,...,...,...,...,...
1173,43,4,2,2,0,1,2,2,0,1
1174,17,2,3,1,1,1,3,2,0,1
1175,36,4,4,7,1,0,1,3,0,2
1176,27,3,4,2,0,0,3,4,0,3


In [4]:
#read the real test split and cast the same categorical columns as for the training data
test_data = pd.read_csv(
    HOME_PATH + 'REAL DATASETS/TEST DATASETS/D_ContraceptiveMethod_Real_Test.csv'
).astype(dict.fromkeys(categorical_columns, 'category'))
test_data

Unnamed: 0,wife_age,wife_education,husband_education,children,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,29,3,3,4,1,1,3,4,0,3
1,35,3,4,5,1,0,2,2,0,1
2,23,4,4,1,0,1,1,4,0,2
3,30,2,3,3,1,1,3,1,0,1
4,22,2,3,1,1,1,3,3,0,1
...,...,...,...,...,...,...,...,...,...,...
290,38,4,4,3,1,0,1,4,0,2
291,25,2,2,1,1,1,3,3,0,3
292,25,3,1,4,1,1,3,1,0,2
293,24,4,3,3,1,1,3,3,0,3


In [5]:
target = 'contraceptive_method_used'
#shape and per-class row counts of each split, one item per line
print('Train data', train_data.shape, train_data.groupby(target).size(), sep='\n')
print('#####################################')
print('Test data', test_data.shape, test_data.groupby(target).size(), sep='\n')

Train data
(1178, 10)
contraceptive_method_used
1    466
2    267
3    445
dtype: int64
#####################################
Test data
(295, 10)
contraceptive_method_used
1    130
2     71
3     94
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'contraceptive_method_used'
#categorical feature columns (the target is left out here)
categorical_columns = [
    'wife_education',
    'husband_education',
    'wife_religion',
    'wife_working',
    'husband_occupation',
    'standard_of_living_index',
    'media_exposure',
]
#every column that is still numeric after the categorical cast
numerical_columns = list(train_data.select_dtypes(include=['int64','float64']).columns)
#one array of allowed values per categorical column, in the same order as categorical_columns:
#4, 4, 2, 2, 4, 4 and 2 values respectively, each starting at 0
#NOTE(review): the displayed rows contain the value 4 in some of these columns while the arrays
#stop at 3 - presumably DataPreProcessor re-encodes the values first; confirm against its source
categories = [np.arange(size) for size in (4, 4, 2, 2, 4, 4, 2)]

data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)
#features are all columns except the target; the target column is kept as the label vector
x_train = data_preprocessor.preprocess_train_data(train_data.drop(columns=[target], errors='ignore'))
y_train = train_data[target]

x_train.shape, y_train.shape

((1178, 24), (1178,))

## 3. Pre-process test data

In [7]:
#apply the pre-processor created for the training data to the test features (all columns but the target)
x_test = data_preprocessor.preprocess_test_data(test_data.drop(columns=[target], errors='ignore'))
y_test = test_data[target]
x_test.shape, y_test.shape

((295, 24), (295,))

## 4. Create a dataset to save the results

In [8]:
#empty table that collects one row of metrics per evaluated classifier
results = pd.DataFrame(
    columns=[
        'model',
        'accuracy',
        'precision',
        'recall',
        'f1',
    ]
)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
#fit the Random Forest on the training split and score it on the test split
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
#an earlier 'RF' row is dropped first so that re-running this cell does not duplicate it
results = pd.concat([results[results['model'] != 'RF'], rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.4339,0.4401,0.4339,0.4321


## 5. Train and evaluate KNeighbors Classifier

In [10]:
#fit the KNeighbors classifier on the training split and score it on the test split
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
#an earlier 'KNN' row is dropped first so that re-running this cell does not duplicate it
results = pd.concat([results[results['model'] != 'KNN'], knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.4576,0.4581,0.4576,0.4536


## 6. Train and evaluate Decision Tree Classifier

In [11]:
#fit the Decision Tree on the training split and score it on the test split
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
#an earlier 'DT' row is dropped first so that re-running this cell does not duplicate it
results = pd.concat([results[results['model'] != 'DT'], dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.3763,0.3796,0.3763,0.3757


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
#fit the Support Vector Machine on the training split and score it on the test split
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
#an earlier 'SVM' row is dropped first so that re-running this cell does not duplicate it
results = pd.concat([results[results['model'] != 'SVM'], svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.3797,0.3681,0.3797,0.3674


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
#fit the Multilayer Perceptron on the training split and score it on the test split
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
#an earlier 'MLP' row is dropped first so that re-running this cell does not duplicate it
results = pd.concat([results[results['model'] != 'MLP'], mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 1.08136545
Iteration 2, loss = 1.05490502
Iteration 3, loss = 1.03708563
Iteration 4, loss = 1.02239355
Iteration 5, loss = 1.01088182
Iteration 6, loss = 1.00040271
Iteration 7, loss = 0.99281919
Iteration 8, loss = 0.98761361
Iteration 9, loss = 0.97957533
Iteration 10, loss = 0.97522872
Iteration 11, loss = 0.96803039
Iteration 12, loss = 0.96186536
Iteration 13, loss = 0.95727739
Iteration 14, loss = 0.95130678
Iteration 15, loss = 0.94582347
Iteration 16, loss = 0.94075382
Iteration 17, loss = 0.93452686
Iteration 18, loss = 0.93039136
Iteration 19, loss = 0.92280501
Iteration 20, loss = 0.91637700
Iteration 21, loss = 0.91025849
Iteration 22, loss = 0.90668468
Iteration 23, loss = 0.89940202
Iteration 24, loss = 0.89027354
Iteration 25, loss = 0.88648999
Iteration 26, loss = 0.88580416
Iteration 27, loss = 0.87684987
Iteration 28, loss = 0.86595807
Iteration 29, loss = 0.86435746
Iteration 30, loss = 0.85127183
Iteration 31, loss = 0.84467657
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.4068,0.4194,0.4068,0.4114


## 9. Save results file

In [14]:
#create the output folder when it is missing; to_csv does not create directories on its own
os.makedirs('RESULTS', exist_ok=True)
#write the collected metrics (one row per model) without the index column
results.to_csv('RESULTS/models_results_gm.csv', index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.4339,0.4401,0.4339,0.4321
1,KNN,0.4576,0.4581,0.4576,0.4536
2,DT,0.3763,0.3796,0.3763,0.3757
3,SVM,0.3797,0.3681,0.3797,0.3674
4,MLP,0.4068,0.4194,0.4068,0.4114
