In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the raw census data from the notebook's working directory.
data = pd.read_csv("adult.csv")

# Only the last expression of a cell is rendered automatically, so the summary
# statistics must be displayed explicitly or they are computed and discarded.
display(data.describe())
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Select the numeric columns by dtype and cast exactly those to float, so the
# list of columns is not maintained twice (once by dtype, once by hand).
# `include="number"` keeps non-numeric, non-object dtypes (e.g. category) out
# of the cast if this cell is ever re-run after such columns were added.
numerical_column = data.select_dtypes(include="number").columns
data = data.astype({column: "float" for column in numerical_column})

In [4]:
# Show the frame after the float cast (pandas elides the middle rows of a long frame).
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
32557,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
32558,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
32559,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K


In [5]:
 # Helper column for stratified splitting: bucket education-num into
 # right-closed intervals (0, 3], (3, 6], ... and label the buckets 1 to 7.
 data["education-num_add"] = pd.cut(
     data["education-num"],
     bins=[0.0, 3.0, 6.0, 9.0, 12.0, 15.0, 18.0, float("inf")],
     labels=list(range(1, 8)),
 )

In [6]:
# Spot-check the first rows, including the new education-num_add bucket column.
data.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary,education-num_add
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,5
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,5
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,3


In [7]:
# Placeholders for the stratified train/test frames; they are assigned by the
# loop that consumes `split`.
strat_train_set = None
strat_test_set = None

# A single 80/20 stratified shuffle is enough: the consuming loop overwrites
# the train/test frames on every iteration, so with n_splits=10 the first nine
# splits were generated only to be thrown away.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [8]:
# StratifiedShuffleSplit yields positional row indices, so rows are selected
# with .iloc; label-based .loc only gives the same rows while `data` still has
# its default 0..n-1 index. The frames are reassigned on each iteration, so the
# last generated split is the one that is kept. The education-num_add helper
# column is only the stratification key and is dropped from both outputs.
for train_index, test_index in split.split(data, data["education-num_add"]):
    strat_train_set = data.iloc[train_index].drop(columns=["education-num_add"])
    strat_test_set = data.iloc[test_index].drop(columns=["education-num_add"])

In [9]:
import yaml
def read_yaml_file(file_path:str)->dict:
    """
    Read a YAML file and return its parsed contents.

    Parameters
    ----------
    file_path : str
        Path of the YAML file to read.

    Returns
    -------
    dict
        The document as parsed by ``yaml.safe_load``.

    Raises
    ------
    OSError
        If the file cannot be opened (for example ``FileNotFoundError``).
    yaml.YAMLError
        If the content is not valid YAML.
    """
    # Errors are deliberately left to propagate: printing them and falling
    # through returned None, which only moved the failure to the caller's first
    # ``result[...]`` lookup and hid the real cause.
    with open(file_path, "rb") as yaml_file:
        return yaml.safe_load(yaml_file)

# NOTE(review): machine-specific absolute path; this cell only runs where this
# exact directory layout exists. Consider a path relative to the project root.
schema_info = read_yaml_file(
    r"C:\Users\acer\Adult-Census-Income-Prediction\Adult-Census-Income-Prediction\config\schema.yaml"
)

# Column names in the order they are declared under "columns" in the schema.
schema_columns = [column_name for column_name in schema_info["columns"]]
schema_columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'country',
 'salary']

In [10]:
# The schema's "domain_value" mapping: column name -> list of values for that column.
schema_domain_values = schema_info["domain_value"]
schema_domain_values

{'marital-status': ['Never-married',
  'Married-civ-spouse',
  'Divorced',
  'Married-spouse-absent',
  'Separated',
  'Married-AF-spouse',
  'Widowed'],
 'sex': ['Male', 'Female']}

In [11]:
# Flatten the schema's marital-status and sex value lists into a single list,
# marital-status values first.
tmp_list = [
    *schema_info["domain_value"]["marital-status"],
    *schema_info["domain_value"]["sex"],
]
tmp_list

['Never-married',
 'Married-civ-spouse',
 'Divorced',
 'Married-spouse-absent',
 'Separated',
 'Married-AF-spouse',
 'Widowed',
 'Male',
 'Female']

In [12]:
# Distinct marital-status values in the data, most frequent first.
# NOTE(review): the values printed here carry a leading space, unlike the
# schema's domain values; strip them (or load the CSV with
# skipinitialspace=True) before comparing the two.
data["marital-status"].value_counts().index.tolist()

[' Married-civ-spouse',
 ' Never-married',
 ' Divorced',
 ' Separated',
 ' Widowed',
 ' Married-spouse-absent',
 ' Married-AF-spouse']

In [13]:
# Column labels currently on the frame (this includes the education-num_add helper column).
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'country', 'salary',
       'education-num_add'],
      dtype='object')

In [14]:
# Observed (marital-status, sex) combinations as tuples, most frequent first.
data[["marital-status", "sex"]].value_counts().index.tolist()

[(' Married-civ-spouse', ' Male'),
 (' Never-married', ' Male'),
 (' Never-married', ' Female'),
 (' Divorced', ' Female'),
 (' Divorced', ' Male'),
 (' Married-civ-spouse', ' Female'),
 (' Widowed', ' Female'),
 (' Separated', ' Female'),
 (' Separated', ' Male'),
 (' Married-spouse-absent', ' Male'),
 (' Married-spouse-absent', ' Female'),
 (' Widowed', ' Male'),
 (' Married-AF-spouse', ' Female'),
 (' Married-AF-spouse', ' Male')]