# DATASET GENERATOR

## Install Dependencies

In [None]:
%pip install numpy
%pip install pandas

## Import Necessary Libraries

In [1]:
import os
import numpy as np
import pandas as pd

## Load Dataset

In [6]:
# Variable
fileName = 'data_balita.csv'

# Do not change
filePath = os.path.join(os.getcwd(), 'data', 'source', fileName)
data = pd.read_csv(filePath)
data.head()

Unnamed: 0,Umur (bulan),Jenis Kelamin,Tinggi Badan (cm),Status Gizi
0,0,laki-laki,44.591973,stunted
1,0,laki-laki,56.705203,tinggi
2,0,laki-laki,46.863358,normal
3,0,laki-laki,47.508026,normal
4,0,laki-laki,42.743494,severely stunted


## Dataset Count

In [None]:
# Do not change
data.shape

(120999, 4)

## Check For NULL Value

In [None]:
# Do not change
data.isnull().sum()

Umur (bulan)         0
Jenis Kelamin        0
Tinggi Badan (cm)    0
Status Gizi          0
dtype: int64

## Group Records Based On Labels Then Shuffle

In [None]:
# Do not change
severelyStuntedData = data[data['Status Gizi'] == 'severely stunted']
stuntedData = data[data['Status Gizi'] == 'stunted']
normalData = data[data['Status Gizi'] == 'normal']
tinggiData = data[data['Status Gizi'] == 'tinggi']

severelyStuntedData = severelyStuntedData.sample(frac=1, random_state=42).reset_index(drop=True)
stuntedData = stuntedData.sample(frac=1, random_state=42).reset_index(drop=True)
normalData = normalData.sample(frac=1, random_state=42).reset_index(drop=True)
tinggiData = tinggiData.sample(frac=1, random_state=42).reset_index(drop=True)

## Split Each Group To Create Training, Validation, And Test Sets

In [None]:
# Variable
dataSplit = 0.99

# Do not change
columns = ['Umur (bulan)', 'Jenis Kelamin', 'Tinggi Badan (cm)', 'Status Gizi']
trainData = pd.DataFrame(columns=columns)
valtestData = pd.DataFrame(columns=columns)

splitIndex = int(severelyStuntedData.shape[0] * dataSplit)
trainSplit = severelyStuntedData[:splitIndex]
valtestSplit = severelyStuntedData[splitIndex:]
trainData = pd.concat([trainData, trainSplit], ignore_index=True)
valtestData = pd.concat([valtestData, valtestSplit], ignore_index=True)

splitIndex = int(stuntedData.shape[0] * dataSplit)
trainSplit = stuntedData[:splitIndex]
valtestSplit = stuntedData[splitIndex:]
trainData = pd.concat([trainData, trainSplit], ignore_index=True)
valtestData = pd.concat([valtestData, valtestSplit], ignore_index=True)

splitIndex = int(normalData.shape[0] * dataSplit)
trainSplit = normalData[:splitIndex]
valtestSplit = normalData[splitIndex:]
trainData = pd.concat([trainData, trainSplit], ignore_index=True)
valtestData = pd.concat([valtestData, valtestSplit], ignore_index=True)

splitIndex = int(tinggiData.shape[0] * dataSplit)
trainSplit = tinggiData[:splitIndex]
valtestSplit = tinggiData[splitIndex:]
trainData = pd.concat([trainData, trainSplit], ignore_index=True)
valtestData = pd.concat([valtestData, valtestSplit], ignore_index=True)

print(trainData.shape,valtestData.shape)

(119787, 4) (1212, 4)


  trainData = pd.concat([trainData, trainSplit], ignore_index=True)
  valtestData = pd.concat([valtestData, valtestSplit], ignore_index=True)


## Check Trainingset Distribution And Prune To Get Equally Distributed Labels

In [39]:
# Do not change
statusCount = trainData['Status Gizi'].value_counts()
print(statusCount)

finalTrain = pd.DataFrame(columns=columns)

minCount = statusCount.min()
for status, count in statusCount.items():
    prunedRecords = trainData[trainData['Status Gizi'] == status][minCount:]
    keepedRecords = trainData[trainData['Status Gizi'] == status][:minCount]
    valtestData = pd.concat([valtestData, prunedRecords], ignore_index=True)
    finalTrain = pd.concat([finalTrain, keepedRecords], ignore_index=True)

statusCount = finalTrain['Status Gizi'].value_counts()
print(statusCount)
print(finalTrain.shape,valtestData.shape)

Status Gizi
normal              67077
severely stunted    19670
tinggi              19364
stunted             13676
Name: count, dtype: int64
Status Gizi
normal              13676
severely stunted    13676
tinggi              13676
stunted             13676
Name: count, dtype: int64
(54704, 4) (66295, 4)


  finalTrain = pd.concat([finalTrain, keepedRecords], ignore_index=True)


## Split ValtestData Into Validationset and Testset

In [40]:
# Variable
split = 0.99
# Do not change
splitIndex = int(valtestData.shape[0] * split)
valtestData = valtestData.sample(frac=1, random_state=50).reset_index(drop=True)

finalVal = valtestData.iloc[:splitIndex]
finalTest = valtestData.iloc[splitIndex:]

print(finalTrain.shape,finalVal.shape,finalTest.shape)

(54704, 4) (65632, 4) (663, 4)


## Save The Datasets

In [41]:
# Variable
folderName = "StuntingClassificationDataset"

# Do not change
outputDir = os.path.join(os.getcwd(), 'data', folderName)
os.makedirs(outputDir, exist_ok=True)

finalTrain.to_csv(os.path.join(outputDir, 'train.csv'), index=False)
finalVal.to_csv(os.path.join(outputDir, 'val.csv'), index=False)
finalTest.to_csv(os.path.join(outputDir, 'test.csv'), index=False)