In [135]:
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold, train_test_split


In [99]:
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]

In [101]:
data = pd.read_csv('./adult/adult.data', names=columns,na_values=' ?')

In [103]:
print('Data shape:', data.shape)

Data shape: (32561, 15)


In [105]:
X = data.drop('income', axis=1)
y = data['income']

stratified k-fold

In [108]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [110]:
train_val_index, test_index = next(skf.split(X, y))

In [112]:
X_train_val, X_test = X.iloc[train_val_index], X.iloc[test_index]
y_train_val, y_test = y.iloc[train_val_index], y.iloc[test_index]

In [114]:
inner_skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [116]:
train_index, val_index = next(inner_skf.split(X_train_val, y_train_val))

In [118]:
X_train, X_val = X.iloc[train_index], X.iloc[val_index]
y_train, y_val = y.iloc[train_index], y.iloc[val_index]

In [120]:
# Verify the split
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (19536, 14)
y_train shape: (19536,)
X_val shape: (6512, 14)
y_val shape: (6512,)
X_test shape: (6513, 14)
y_test shape: (6513,)


In [122]:
# Verify class distribution
print("\nClass distribution:")
print("Train set distribution:\n", y_train.value_counts(normalize=True))
print("Validation set distribution:\n", y_val.value_counts(normalize=True))
print("Test set distribution:\n", y_test.value_counts(normalize=True))


Class distribution:
Train set distribution:
 income
 <=50K    0.761722
 >50K     0.238278
Name: proportion, dtype: float64
Validation set distribution:
 income
 <=50K    0.75645
 >50K     0.24355
Name: proportion, dtype: float64
Test set distribution:
 income
 <=50K    0.759097
 >50K     0.240903
Name: proportion, dtype: float64


train_test_split

In [124]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

In [127]:
# Verify the split
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Verify class distribution
print("\nClass distribution:")
print("Train set distribution:\n", y_train.value_counts(normalize=True))
print("Validation set distribution:\n", y_val.value_counts(normalize=True))
print("Test set distribution:\n", y_test.value_counts(normalize=True))

X_train shape: (19536, 14)
y_train shape: (19536,)
X_val shape: (6512, 14)
y_val shape: (6512,)
X_test shape: (6513, 14)
y_test shape: (6513,)

Class distribution:
Train set distribution:
 income
 <=50K    0.759163
 >50K     0.240837
Name: proportion, dtype: float64
Validation set distribution:
 income
 <=50K    0.759214
 >50K     0.240786
Name: proportion, dtype: float64
Test set distribution:
 income
 <=50K    0.759251
 >50K     0.240749
Name: proportion, dtype: float64


In [129]:
train_data = X_train.copy()
train_data['income'] = y_train
val_data = X_val.copy()
val_data['income'] = y_val
test_data = X_test.copy()
test_data['income'] = y_test

In [133]:
test_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
9009,31,Self-emp-inc,117963,Doctorate,16,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
25134,20,,201490,Some-college,10,Never-married,,Own-child,White,Male,0,0,40,United-States,<=50K
16682,78,Self-emp-inc,385242,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,9386,0,45,United-States,>50K
27044,32,Private,164197,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States,>50K
3302,61,Private,92691,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,3,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11814,42,Self-emp-not-inc,32185,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K
27934,34,Private,180714,Some-college,10,Married-civ-spouse,Transport-moving,Husband,Black,Male,0,2179,40,United-States,<=50K
6890,39,Private,202027,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,>50K
32529,29,Private,125976,HS-grad,9,Separated,Sales,Unmarried,White,Female,0,0,35,United-States,<=50K


In [137]:
os.makedirs("splitted_data", exist_ok=True)
train_data.to_csv("./splitted_data/train_data.csv", index=False)
val_data.to_csv("./splitted_data/validation_data.csv", index=False)
test_data.to_csv("./splitted_data/test_data.csv", index=False)