In [1]:
import os
import pandas as pd
import numpy as np
import re


In [2]:
data_root_path = os.path.join('artifacts', 'data')
files = os.listdir(data_root_path)
print(files)

['adult.data', 'adult.names', 'adult.test', 'Index', 'old.adult.names']


In [3]:
train_data_path = os.path.join(data_root_path, 'adult.data')
test_data_path =  os.path.join(data_root_path, 'adult.test')

In [4]:
train_data_path = os.path.join(data_root_path, 'adult.data')
test_data_path =  os.path.join(data_root_path, 'adult.test')
text_file_path =  os.path.join(data_root_path, 'adult.names')

In [5]:
with open(text_file_path) as text:
    cols = []
    for line in text:
        sre = re.match(r'(?P<colname>[a-z\-]+):.*\.', line)
        if sre:
            cols.append(sre.group('colname'))
    cols.append('label')

In [6]:
options = {'header': None, 'names': cols, 'skipinitialspace': True}
income_df = pd.read_csv(train_data_path, **options)
income_df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
test_income_df = pd.read_csv(test_data_path, skiprows=1, **options)
test_income_df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [8]:
print(len(income_df))
print(len(test_income_df)) 
print(len(income_df) + len(test_income_df)) 

32561
16281
48842


In [9]:
test_income_df.label = test_income_df.label.str.strip('.')

In [10]:
test_income_df.label

0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
16276    <=50K
16277    <=50K
16278    <=50K
16279    <=50K
16280     >50K
Name: label, Length: 16281, dtype: object

In [11]:
test_income_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [12]:
# Numerical features that are marked as continuous
NUMERIC_FEATURES = ['fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Feature that the model will predict
LABEL_KEY = 'label'

# Feature that will be grouped into buckets
BUCKET_FEATURE = 'age'

# Features with string data types that will be converted to indices
CATEGORICAL_FEATURES = [
    'education', 'marital-status', 'occupation', 'race', 'relationship', 'workclass', 'sex', 'native-country'
]

In [13]:
def bucketize_with_labels(data, column_name, num_buckets, custom_labels):
    """
     Bcketize a numerical feature with custom labels.

    Args:
        data (pd.DataFrame): The input DataFrame.
        column_name (str): Name of the column to be bucketized.
        num_buckets (int): Number of desired buckets.
        custom_labels (list of str): Custom labels for each bucket.

    Returns:
        pd.Series: A new series with bucket labels.
    """
    # Extract the specified column
    column_values = data[column_name]

    # Use pd.qcut to create quantile-based buckets
    bucketized_series = pd.cut(column_values, bins=num_buckets, labels=custom_labels)

    return bucketized_series


In [14]:
 j= bucketize_with_labels(income_df, 'age', 4, ['young', 'adult', 'middle_age', 'old'])
type(j)

pandas.core.series.Series

In [15]:

# Create a bucket to categorize the age
income_df['age'] = bucketize_with_labels(income_df, 'age', 4, ['young', 'adult', 'middle_age', 'old'])
test_income_df['age'] = bucketize_with_labels(test_income_df, 'age', 4, ['young', 'adult', 'middle_age', 'old'])

In [16]:
# Replace values in the 'Category' column using the mapping
category_mapping = {'young':0, 'adult':1, 'middle_age':2, 'old':3}

income_df['age'] = income_df['age'].replace(category_mapping)

In [17]:
test_income_df['age'] = test_income_df['age'].replace(category_mapping)

In [18]:
def min_max_scaler(df, column_name):
    """
    Scales a specific column to the range [0, 1].

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): Name of the column to be scaled.

    Returns:
        panda.core.series.Series: The DataFrame with the scaled column.
    """
    # Extract the specified column
    column_values = df[column_name]

    # Calculate the minimum and maximum values
    min_val = column_values.min()
    max_val = column_values.max()

    # Scale the column to [0, 1]
    scaled_column = (column_values - min_val) / (max_val - min_val)

    return scaled_column
    


In [19]:
for column in NUMERIC_FEATURES:
    income_df[column] = min_max_scaler(income_df, column)
    test_income_df[column] = min_max_scaler(test_income_df, column)
    

In [20]:
income_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,1,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,1,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,1,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,1,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K
5,1,Private,0.184932,Masters,0.866667,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,0.397959,United-States,<=50K
6,1,Private,0.100448,9th,0.266667,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,0.153061,Jamaica,<=50K
7,1,Self-emp-not-inc,0.134036,HS-grad,0.533333,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.44898,United-States,>50K
8,0,Private,0.022749,Masters,0.866667,Never-married,Prof-specialty,Not-in-family,White,Female,0.140841,0.0,0.5,United-States,>50K
9,1,Private,0.099947,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.051781,0.0,0.397959,United-States,>50K


In [21]:
test_income_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,0,Private,0.14443,11th,0.4,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,0.397959,United-States,<=50K
1,1,Private,0.051677,HS-grad,0.533333,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,0.5,United-States,<=50K
2,0,Local-gov,0.219011,Assoc-acdm,0.733333,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,0.397959,United-States,>50K
3,1,Private,0.099418,Some-college,0.6,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.076881,0.0,0.397959,United-States,>50K
4,0,?,0.060942,Some-college,0.6,Never-married,?,Own-child,White,Female,0.0,0.0,0.295918,United-States,<=50K
5,0,Private,0.125398,10th,0.333333,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,0.295918,United-States,<=50K
6,0,?,0.144582,HS-grad,0.533333,Never-married,?,Unmarried,Black,Male,0.0,0.0,0.397959,United-States,<=50K
7,2,Self-emp-not-inc,0.061706,Prof-school,0.933333,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.03103,0.0,0.316327,United-States,>50K
8,0,Private,0.241163,Some-college,0.6,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,0.397959,United-States,<=50K
9,2,Private,0.061956,7th-8th,0.2,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,0.091837,United-States,<=50K


In [22]:
label_data = pd.get_dummies(income_df['label'])

In [23]:
label_data

Unnamed: 0,<=50K,>50K
0,True,False
1,True,False
2,True,False
3,True,False
4,True,False
...,...,...
32556,True,False
32557,False,True
32558,True,False
32559,True,False


In [24]:
income_df['label'] = np.where(income_df['label']=="<=50K", 0, 1)
test_income_df['label'] = np.where(test_income_df['label']=="<=50K", 0, 1)

In [25]:
income_df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
32556,0,Private,0.166404,Assoc-acdm,0.733333,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,0.377551,United-States,0
32557,1,Private,0.0965,HS-grad,0.533333,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,0.397959,United-States,1
32558,2,Private,0.094827,HS-grad,0.533333,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,0.397959,United-States,0
32559,0,Private,0.128499,HS-grad,0.533333,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,0.193878,United-States,0
32560,1,Self-emp-inc,0.187203,HS-grad,0.533333,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.150242,0.0,0.397959,United-States,1


In [26]:
test_income_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,0,Private,0.14443,11th,0.4,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,0.397959,United-States,0
1,1,Private,0.051677,HS-grad,0.533333,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,0.5,United-States,0
2,0,Local-gov,0.219011,Assoc-acdm,0.733333,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,0.397959,United-States,1
3,1,Private,0.099418,Some-college,0.6,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.076881,0.0,0.397959,United-States,1
4,0,?,0.060942,Some-college,0.6,Never-married,?,Own-child,White,Female,0.0,0.0,0.295918,United-States,0


In [27]:
def compute_and_apply_vocabulary(df, column):
    """
    Custom function to compute vocabulary and map words to integer indices.

    Args:
        df (DataFrame): pandas data frame.
        column (str): name of the desired feature.
        

    Returns:
        dict: A dictionary mapping unique words to integer indices.
        series: A Panda series of integers
    """
    # Step 1: Extract all words from the documents
    list_of_words = list(df[column])

    # Step 2: Create a vocabulary dictionary
    vocabulary = {}
    for idx, word in enumerate(set(list_of_words)):
        vocabulary[word] = idx

    # Step 3: Replace words in the documents with their corresponding indices
    
    new_list = df[column].replace(vocabulary)

    return new_list, vocabulary

In [28]:
transformed_list, vocabulary = compute_and_apply_vocabulary(income_df, 'education')

In [29]:
transformed_list

0         3
1         3
2         0
3        11
4         3
         ..
32556    10
32557     0
32558     0
32559     0
32560     0
Name: education, Length: 32561, dtype: int64

In [30]:
vocabulary

{'HS-grad': 0,
 'Some-college': 1,
 'Doctorate': 2,
 'Bachelors': 3,
 '12th': 4,
 '5th-6th': 5,
 '1st-4th': 6,
 'Prof-school': 7,
 'Assoc-voc': 8,
 '9th': 9,
 'Assoc-acdm': 10,
 '11th': 11,
 '10th': 12,
 'Preschool': 13,
 'Masters': 14,
 '7th-8th': 15}

In [31]:
for column in CATEGORICAL_FEATURES:
    income_df[column], vocabulary = compute_and_apply_vocabulary(income_df, column)
    print(vocabulary)
    test_income_df[column], vocabulary = compute_and_apply_vocabulary(test_income_df, column)
    print(vocabulary)

{'HS-grad': 0, 'Some-college': 1, 'Doctorate': 2, 'Bachelors': 3, '12th': 4, '5th-6th': 5, '1st-4th': 6, 'Prof-school': 7, 'Assoc-voc': 8, '9th': 9, 'Assoc-acdm': 10, '11th': 11, '10th': 12, 'Preschool': 13, 'Masters': 14, '7th-8th': 15}
{'HS-grad': 0, 'Some-college': 1, 'Masters': 2, 'Doctorate': 3, '12th': 4, '5th-6th': 5, '1st-4th': 6, 'Prof-school': 7, 'Assoc-voc': 8, 'Assoc-acdm': 9, '10th': 10, '11th': 11, '9th': 12, 'Preschool': 13, 'Bachelors': 14, '7th-8th': 15}
{'Married-spouse-absent': 0, 'Separated': 1, 'Married-AF-spouse': 2, 'Divorced': 3, 'Married-civ-spouse': 4, 'Widowed': 5, 'Never-married': 6}
{'Separated': 0, 'Married-spouse-absent': 1, 'Married-AF-spouse': 2, 'Divorced': 3, 'Married-civ-spouse': 4, 'Widowed': 5, 'Never-married': 6}
{'Protective-serv': 0, 'Adm-clerical': 1, 'Priv-house-serv': 2, 'Other-service': 3, 'Prof-specialty': 4, 'Craft-repair': 5, 'Tech-support': 6, '?': 7, 'Armed-Forces': 8, 'Sales': 9, 'Farming-fishing': 10, 'Machine-op-inspct': 11, 'Handler

In [32]:
income_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,1,8,0.044302,3,0.8,6,1,0,3,1,0.02174,0.0,0.397959,37,0
1,1,3,0.048238,3,0.8,4,14,3,3,1,0.0,0.0,0.122449,37,0
2,1,7,0.138113,0,0.533333,3,12,0,3,1,0.0,0.0,0.397959,37,0
3,1,7,0.151068,11,0.4,4,12,3,4,1,0.0,0.0,0.397959,37,0
4,0,7,0.221488,3,0.8,4,4,1,4,0,0.0,0.0,0.397959,18,0


In [33]:
from utils import read_yaml_file, create_directories
from pathlib import Path

config_box = read_yaml_file(Path("artifacts/ConfigFiles/config.yaml"))
config = config_box.data_transformation

yaml file: artifacts\ConfigFiles\config.yaml loaded successfully


In [34]:
if not os.path.exists(Path(config.root_dir)):
    create_directories([Path(config.root_dir)])
    
income_df.to_csv(Path(config.transformed_train_data), index= False)
test_income_df.to_csv(Path(config.transformed_test_data), index = False)