# TRTR Dataset D

In [1]:
#import libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
print('Libraries imported!!')

Libraries imported!!


In [2]:
#define directory of functions and actual directory
HOME_PATH = '' #home path of the project
FUNCTIONS_DIR = 'EVALUATION FUNCTIONS/UTILITY'
ACTUAL_DIR = os.getcwd()

#change directory to functions directory so the local module can be imported
os.chdir(HOME_PATH + FUNCTIONS_DIR)
try:
    #import the helper class and function from the utility_evaluation module
    from utility_evaluation import DataPreProcessor
    from utility_evaluation import train_evaluate_model
finally:
    #always go back to the original directory, even when an import fails,
    #so the relative data paths used in the following cells keep resolving
    os.chdir(ACTUAL_DIR)
print('Functions imported!!')

Functions imported!!


## 1. Read data

In [3]:
#load the real training dataset
train_csv_path = HOME_PATH + 'REAL DATASETS/TRAIN DATASETS/D_ContraceptiveMethod_Real_Train.csv'
train_data = pd.read_csv(train_csv_path)
#columns cast to the pandas 'category' dtype (the last entry is later used as the target)
categorical_columns = [
    'wife_education',
    'husband_education',
    'wife_religion',
    'wife_working',
    'husband_occupation',
    'standard_of_living_index',
    'media_exposure',
    'contraceptive_method_used',
]
train_data = train_data.astype({name: 'category' for name in categorical_columns})
train_data

Unnamed: 0,wife_age,wife_education,husband_education,children,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,36,2,4,11,1,1,1,4,1,3
1,48,1,3,4,1,1,3,3,1,1
2,38,1,2,8,1,1,3,3,1,1
3,27,2,2,5,1,1,3,1,0,3
4,26,3,2,4,1,1,3,4,0,3
...,...,...,...,...,...,...,...,...,...,...
1173,34,2,4,6,1,0,3,4,0,1
1174,23,4,4,1,1,0,3,4,0,2
1175,29,3,4,3,1,1,3,2,0,3
1176,37,4,4,5,0,0,2,4,0,3


In [4]:
#load the real test dataset
test_csv_path = HOME_PATH + 'REAL DATASETS/TEST DATASETS/D_ContraceptiveMethod_Real_Test.csv'
test_data = pd.read_csv(test_csv_path)
#cast the columns listed in categorical_columns to the 'category' dtype, as done for the training data
test_data = test_data.astype(dict.fromkeys(categorical_columns, 'category'))
test_data

Unnamed: 0,wife_age,wife_education,husband_education,children,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,29,3,3,4,1,1,3,4,0,3
1,35,3,4,5,1,0,2,2,0,1
2,23,4,4,1,0,1,1,4,0,2
3,30,2,3,3,1,1,3,1,0,1
4,22,2,3,1,1,1,3,3,0,1
...,...,...,...,...,...,...,...,...,...,...
290,38,4,4,3,1,0,1,4,0,2
291,25,2,2,1,1,1,3,3,0,3
292,25,3,1,4,1,1,3,1,0,2
293,24,4,3,3,1,1,3,3,0,3


In [5]:
target = 'contraceptive_method_used'
#print the shape and the number of rows per target class for each split
for position, (title, frame) in enumerate([('Train data', train_data), ('Test data', test_data)]):
    if position > 0:
        #separator line between the two reports
        print('#####################################')
    print(title)
    print(frame.shape)
    print(frame.groupby(target).size())

Train data
(1178, 10)
contraceptive_method_used
1    499
2    262
3    417
dtype: int64
#####################################
Test data
(295, 10)
contraceptive_method_used
1    130
2     71
3     94
dtype: int64


## 2. Pre-process training data

In [6]:
target = 'contraceptive_method_used'
#feature columns treated as categorical (the target is deliberately left out here)
categorical_columns = [
    'wife_education',
    'husband_education',
    'wife_religion',
    'wife_working',
    'husband_occupation',
    'standard_of_living_index',
    'media_exposure',
]
#every column still stored as int64/float64 is handed over as numerical
numerical_columns = train_data.select_dtypes(include=['int64','float64']).columns.tolist()
#one array of category values per categorical column, in the same order as categorical_columns
#NOTE(review): the displayed raw data shows codes such as 1-4 for the education columns while
#these arrays start at 0 - confirm how DataPreProcessor maps the raw values
category_sizes = [4, 4, 2, 2, 4, 4, 2]
categories = [np.arange(size) for size in category_sizes]

data_preprocessor = DataPreProcessor(categorical_columns, numerical_columns, categories)
train_features = train_data.drop(columns=target)
x_train = data_preprocessor.preprocess_train_data(train_features)
y_train = train_data[target]

x_train.shape, y_train.shape

((1178, 24), (1178,))

## 3. Pre-process test data

In [7]:
#run the test features through the same DataPreProcessor instance used for the training features
test_features = test_data.drop(columns=target)
x_test = data_preprocessor.preprocess_test_data(test_features)
y_test = test_data[target]
x_test.shape, y_test.shape

((295, 24), (295,))

## 4. Create a DataFrame to collect the results

In [8]:
#empty frame with one column per reported metric; rows are added by the model cells below
metric_columns = ['model','accuracy','precision','recall','f1']
results = pd.DataFrame(columns = metric_columns)
results

Unnamed: 0,model,accuracy,precision,recall,f1


## 4. Train and evaluate Random Forest Classifier

In [9]:
#run the imported train_evaluate_model helper for the 'RF' model key
rf_results = train_evaluate_model('RF', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
#(assumes rf_results is a DataFrame, as its displayed output suggests)
results = pd.concat([results, rf_results], ignore_index=True)
rf_results

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.5288,0.5226,0.5288,0.5232


## 5. Train and evaluate KNeighbors Classifier

In [10]:
#run the imported train_evaluate_model helper for the 'KNN' model key
knn_results = train_evaluate_model('KNN', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
#(assumes knn_results is a DataFrame, as its displayed output suggests)
results = pd.concat([results, knn_results], ignore_index=True)
knn_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.5186,0.524,0.5186,0.5173


## 6. Train and evaluate Decision Tree Classifier

In [11]:
#run the imported train_evaluate_model helper for the 'DT' model key
dt_results = train_evaluate_model('DT', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
#(assumes dt_results is a DataFrame, as its displayed output suggests)
results = pd.concat([results, dt_results], ignore_index=True)
dt_results

Unnamed: 0,model,accuracy,precision,recall,f1
0,DT,0.5051,0.5022,0.5051,0.5031


## 7. Train and evaluate Support Vector Machines Classifier

In [12]:
#run the imported train_evaluate_model helper for the 'SVM' model key
svm_results = train_evaluate_model('SVM', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
#(assumes svm_results is a DataFrame, as its displayed output suggests)
results = pd.concat([results, svm_results], ignore_index=True)
svm_results

[LibSVM]

Unnamed: 0,model,accuracy,precision,recall,f1
0,SVM,0.3763,0.3499,0.3763,0.3568


## 8. Train and evaluate Multilayer Perceptron Classifier

In [13]:
#run the imported train_evaluate_model helper for the 'MLP' model key
mlp_results = train_evaluate_model('MLP', x_train, y_train, x_test, y_test)
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
#(assumes mlp_results is a DataFrame, as its displayed output suggests)
results = pd.concat([results, mlp_results], ignore_index=True)
mlp_results

Iteration 1, loss = 1.07357797
Iteration 2, loss = 1.03344273
Iteration 3, loss = 1.00372110
Iteration 4, loss = 0.97884543
Iteration 5, loss = 0.95352749
Iteration 6, loss = 0.93481026
Iteration 7, loss = 0.91859182
Iteration 8, loss = 0.90458880
Iteration 9, loss = 0.89249298
Iteration 10, loss = 0.88135028
Iteration 11, loss = 0.87626128
Iteration 12, loss = 0.86359562
Iteration 13, loss = 0.85727672
Iteration 14, loss = 0.85049755
Iteration 15, loss = 0.84176044
Iteration 16, loss = 0.83660931
Iteration 17, loss = 0.82904798
Iteration 18, loss = 0.82105654
Iteration 19, loss = 0.81464651
Iteration 20, loss = 0.80737258
Iteration 21, loss = 0.80366191
Iteration 22, loss = 0.79911730
Iteration 23, loss = 0.79152012
Iteration 24, loss = 0.78192791
Iteration 25, loss = 0.77822910
Iteration 26, loss = 0.77242582
Iteration 27, loss = 0.76419193
Iteration 28, loss = 0.75817664
Iteration 29, loss = 0.75301350
Iteration 30, loss = 0.75140847
Iteration 31, loss = 0.74483144
Iteration 32, los

Unnamed: 0,model,accuracy,precision,recall,f1
0,MLP,0.5322,0.522,0.5322,0.5241


## 9. Save results file

In [14]:
#create the output folder when it is missing so to_csv does not fail on a fresh checkout
os.makedirs('RESULTS', exist_ok=True)
#write the collected metrics without the row index
results.to_csv('RESULTS/models_results_real.csv', index=False)
results

Unnamed: 0,model,accuracy,precision,recall,f1
0,RF,0.5288,0.5226,0.5288,0.5232
1,KNN,0.5186,0.524,0.5186,0.5173
2,DT,0.5051,0.5022,0.5051,0.5031
3,SVM,0.3763,0.3499,0.3763,0.3568
4,MLP,0.5322,0.522,0.5322,0.5241
