In [1]:
import src
import importlib
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from src.component.transformation import FrequencyEncoder, Winsorizer
from src.component.feature_extraction import FeatureExtractor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, MinMaxScaler
from src.utils import fetch_data, transforme_DataFrame, plot_categorical_features
# Reload the project modules so edits made on disk are picked up without restarting the kernel.
# importlib.reload() does not rebind names that were pulled in earlier with
# `from module import name`, so those names are imported again after the reload;
# otherwise they would keep pointing at the objects from the stale module.
importlib.reload(src.component.feature_extraction)
importlib.reload(src.utils)
from src.component.feature_extraction import FeatureExtractor
from src.utils import fetch_data, transforme_DataFrame, plot_categorical_features

<module 'src.utils' from 'f:\\data science\\ml projects\\ml project by engineering wala bhaiya\\ml_pipeline_project\\src\\utils.py'>

In [2]:
# Load the imputed income dataset through the project helper.
income_data = fetch_data(
    FILE_NAME="Imputed_Income_Dataset_RF.csv",
    DIRECTORY_NAME="processed",
)

# Normalise the column names: hyphens become underscores and surrounding whitespace is removed.
income_data.columns = income_data.columns.map(
    lambda column_name: column_name.replace('-', '_').strip()
)

income_data.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,39,77516,13,2174,0,40,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,50,83311,13,0,0,13,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,38,215646,9,0,0,40,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,53,234721,7,0,0,40,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,28,338409,13,0,0,40,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


## Feature Extraction

In [3]:
def _load_json_config(path):
    """Return the parsed content of the JSON file located at ``path``."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


# Configuration consumed by the cells below: value mappings, encoder parameters
# and the grouping of features per transformer.
mappings = _load_json_config('config/mappings.json')
transform_parameters = _load_json_config('config/transform_parameters.json')
transform_features = _load_json_config('config/transform_features.json')

In [4]:
# Run the project's feature extractor over the loaded dataset.
extract = FeatureExtractor()
income_data = extract.fit_transform(income_data)

# Store the two derived grouping columns with the plain `object` dtype.
for grouped_column in ('age_group', 'employment_type'):
    income_data[grouped_column] = income_data[grouped_column].astype('object')

income_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          32561 non-null  int64  
 1   fnlwgt                       32561 non-null  int64  
 2   education_num                32561 non-null  int64  
 3   capital_gain                 32561 non-null  int64  
 4   capital_loss                 32561 non-null  int64  
 5   hours_per_week               32561 non-null  int64  
 6   workclass                    32561 non-null  object 
 7   education                    32561 non-null  object 
 8   marital_status               32561 non-null  object 
 9   occupation                   32561 non-null  object 
 10  relationship                 32561 non-null  object 
 11  race                         32561 non-null  object 
 12  sex                          32561 non-null  object 
 13  native_country  

## Feature Transformation

#### 1. Split Data

In [5]:
# Hold out 15% of the rows as the test set; the fixed seed keeps the split reproducible.
features = income_data.drop(columns=['income'])
target = income_data['income']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.15,
    random_state=42,
)

#### 2. Encode X Set

In [26]:
# Set aside the columns reserved for target encoding; they are re-attached, untouched,
# once the remaining features have been transformed.
X_train_droped = X_train.drop(columns=transform_features['target_features'])
X_test_droped = X_test.drop(columns=transform_features['target_features'])

# Column transformer applied to every feature except the target-encoded ones.
X_preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), transform_features['onehot_features']),
        # NOTE(review): the category lists are taken positionally (every entry of
        # transform_parameters except the last), so they depend on the key order of the
        # JSON file and must line up with `ordinal_features` -- confirm when editing either.
        ('ordinal', OrdinalEncoder(
            categories=list(transform_parameters.values())[:-1]
            ), transform_features['ordinal_features']),
        ('frequency', FrequencyEncoder(), transform_features['frequency_features']),
        ('winsorizer', Winsorizer(
            feature_limits=transform_parameters['winsorize_limit']
            ), transform_features['winsorize_featues']),
        ('minmax', MinMaxScaler(), transform_features['scale_features']),
        ('remainders', 'passthrough', transform_features['remander_features']),
        # The target encoder is applied later, during training and validation.
    ],
    remainder='passthrough'
)

# Fit on the training split only, register the feature names on the custom
# transformers, then transform both splits with the fitted preprocessor.
X_preprocessor.fit(X=X_train_droped, y=y_train)
for name, cols in zip(
    ['frequency', 'winsorizer'],
    [transform_features['frequency_features'], transform_features['winsorize_featues']]
    ):
    X_preprocessor.named_transformers_[name].set_feature_names(cols)
X_train_transformed = X_preprocessor.transform(X_train_droped)
X_test_transformed = X_preprocessor.transform(X_test_droped)

# Convert the transformed outputs back into dataframes with named columns.
X_train_transformed = transforme_DataFrame(transformed=X_train_transformed, preprocessor=X_preprocessor)
X_test_transformed = transforme_DataFrame(transformed=X_test_transformed, preprocessor=X_preprocessor)

# Some features come out twice under the same name (one output per transformer that
# received them). Keep only the copy whose values lie in [0, 1], i.e. the scaled one.
duplicated_features = transform_features['winsorize_featues']
scale_X_train = X_train_transformed[duplicated_features]
scale_X_test = X_test_transformed[duplicated_features]

# Decide which copies are the scaled ones on the TRAINING split only and reuse that
# decision for the test split: the scaler is fitted on train, so test values may fall
# slightly outside [0, 1] and a test-derived mask could select different columns.
# Assumes both frames share the same column layout (same fitted preprocessor).
scaled_mask = (scale_X_train.ge(0) & scale_X_train.le(1)).all().to_numpy()
scale_X_train = scale_X_train.loc[:, scaled_mask]
scale_X_test = scale_X_test.loc[:, scaled_mask]

# Remove every copy of the duplicated columns from BOTH splits (previously only the
# training frame was cleaned, leaving the test frame with extra, duplicated columns).
X_train_transformed = X_train_transformed.drop(columns=duplicated_features)
X_test_transformed = X_test_transformed.drop(columns=duplicated_features)

# Re-attach the scaled columns and the untouched target-encoding features.
X_train_transformed = pd.concat(
    [
        scale_X_train,
        X_train[transform_features['target_features']].reset_index(drop=True),
        X_train_transformed
        ],
    axis=1
    )
X_test_transformed = pd.concat(
    [
        scale_X_test,
        X_test[transform_features['target_features']].reset_index(drop=True),
        X_test_transformed
        ],
    axis=1
    )

#### 3. Encode Y Set

In [34]:
# Label-encode the target: the encoder is fitted on y_train and reused as-is for y_test.
# Each encoded array is converted back into a pandas object carrying the original target name.
label_encoder = LabelEncoder()

train_codes = label_encoder.fit_transform(y_train)
y_train_transformed = transforme_DataFrame(train_codes, label_encoder, matrix=False)
y_train_transformed.name = y_train.name

test_codes = label_encoder.transform(y_test)
y_test_transformed = transforme_DataFrame(test_codes, label_encoder, matrix=False)
y_test_transformed.name = y_train.name

#### 4. Save All Sets

In [31]:
# Write the engineered dataset and the encoded train/test splits to CSV, omitting the row index.
csv_outputs = {
    'data/featured/income_data.csv': income_data,
    'data/featured/X_train_transformed.csv': X_train_transformed,
    'data/featured/X_test_transformed.csv': X_test_transformed,
    'data/featured/y_train_transformed.csv': y_train_transformed,
    'data/featured/y_test_transformed.csv': y_test_transformed,
}
for csv_path, dataset in csv_outputs.items():
    dataset.to_csv(csv_path, index=False)

## Deprecated code

In [134]:
# Disabled snippet (kept as an inert string literal, never executed):
# looked at the 30 rows with the largest capital_gain to check for outliers.
# Author's conclusion: nothing was found.
'''income_data.loc[income_data['capital_gain'].nlargest(n=30).index]'''

# Disabled snippet (inert string literal): builds the country -> region and
# education -> level lookups and writes them to config/mappings.json under the
# keys 'education_map' and 'continent_map'.
'''country_to_region = {
    'United-States': 'North America',
    'Cuba': 'Caribbean',
    'Jamaica': 'Caribbean',
    'India': 'Asia',
    'Mexico': 'North America',
    'South': 'Asia',
    'Puerto-Rico': 'Caribbean',
    'Honduras': 'Central America',
    'England': 'Europe',
    'Canada': 'North America',
    'Germany': 'Europe',
    'Iran': 'Asia',
    'Philippines': 'Asia',
    'Poland': 'Europe',
    'Columbia': 'South America',
    'Cambodia': 'Asia',
    'Thailand': 'Asia',
    'Ecuador': 'South America',
    'Laos': 'Asia',
    'Taiwan': 'Asia',
    'Haiti': 'Caribbean',
    'Portugal': 'Europe',
    'Dominican-Republic': 'Caribbean',
    'El-Salvador': 'Central America',
    'France': 'Europe',
    'Guatemala': 'Central America',
    'Italy': 'Europe',
    'China': 'Asia',
    'Japan': 'Asia',
    'Yugoslavia': 'Europe',
    'Peru': 'South America',
    'Outlying-US(Guam-USVI-etc)': 'Oceania',
    'Scotland': 'Europe',
    'Trinadad&Tobago': 'Caribbean',
    'Greece': 'Europe',
    'Nicaragua': 'Central America',
    'Vietnam': 'Asia',
    'Hong': 'Asia',
    'Ireland': 'Europe',
    'Hungary': 'Europe',
    'Holand-Netherlands': 'Europe'
}

education_map = {
    'Preschool': 'Low',
    '1st-4th': 'Low',
    '5th-6th': 'Low',
    '7th-8th': 'Medium',
    '9th': 'Medium',
    '10th': 'Medium',
    '11th': 'Medium',
    '12th': 'Medium',
    'HS-grad': 'Medium',
    'Some-college': 'High',
    'Assoc-voc': 'High',
    'Assoc-acdm': 'High',
    'Bachelors': 'High',
    'Masters': 'High',
    'Doctorate': 'High',
    'Prof-school': 'High'
}

combined_mappings = {
    'education_map': education_map,
    'continent_map': country_to_region
}

combined_mappings_json = json.dumps(obj=combined_mappings, indent=4)

with open('config/mappings.json', 'w') as json_file:
    json_file.write(combined_mappings_json)'''

# Disabled snippet (inert string literal): inline feature extraction that derives
# ten columns directly on income_data.
# NOTE(review): before re-enabling, be aware that the snippet
#   - uses `np`, which is not among the imports visible at the top of this notebook;
#   - reads mappings['country_to_region_mapping'], whereas the disabled snippet that
#     writes config/mappings.json stores the country lookup under 'continent_map'.
'''
# 1. Extract Age Groups from -> Age
income_data['age_group'] = pd.cut(x=income_data['age'], bins=[0, 18, 35, 55, income_data['age'].max() + 20], labels=['Childern', 'Young Adults', 'Middle Aged', 'Seniors']).astype('object')

# 2. Obtain Employment Type from -> Hours Per Week
income_data['employment_type'] = pd.cut(x=income_data['hours_per_week'], bins=[0, 20, 40, income_data['hours_per_week'].max() + 1], labels=['Part-Time', 'Full-Time', 'Over-Time']).astype('object')

# 3. Get Work-Life Balance from -> Hours Per Week and
income_data['work_life_balance'] = income_data['hours_per_week']/168

# 4. Fetch Over Time Flag from -> Employment Type
income_data['over_time_flag'] = np.where(income_data['employment_type'] == 'Over-Time', 1, 0)  # this is also by using .apply()

# 5. Secure Net Capital from -> Capilat Gain & Capilat Loss
income_data['net_capital'] = income_data['capital_gain'] - income_data['capital_loss']

# 6. Gather Education Level Group from -> Education
education_map = mappings['education_map']
income_data['education_level_group'] = income_data['education'].map(education_map)

# 7. Collect Is Educated Flage from -> Education Number
income_data['is_educated_flag'] = income_data['education_num'].apply(lambda x: 1 if x >10 else 0)  # Educated if Education Level > 10 (Threshold)

# 8. Coin Year of Education Remaining from -> Education Number
income_data['year_of_education_remaining'] = income_data['education_num'].max() - income_data['education_num']

# 9. Attain Is Married Flage from -> Marital Status
is_married = income_data['marital_status'].str.contains(r'\bMarried\b', regex=True)
income_data['is_married_flag'] = np.where(is_married, 1, 0)

# 10. Extract Region from -> Native Country
country_to_region = mappings['country_to_region_mapping']
income_data['region'] = income_data['native_country'].map(country_to_region)
'''

# Disabled snippet (inert string literal): prints the skewness of every non-object
# column whose skew exceeds 1 and plots each of those columns against income.
# NOTE(review): inside the snippet the loop uses `income_data_int` before the lines
# that define it, so the statements would have to be reordered before re-enabling.
'''
for col in income_data_int.drop(columns='income').columns:
    if income_data_int[col].skew() > 1:
        print(f'Skewness of {col} is : {income_data_int[col].skew()}')

income_data_int = income_data.select_dtypes(exclude=['object'])
income_data_int = pd.concat([income_data_int, income_data['income']], axis=1)

plot_categorical_features(
    data=income_data_int,
    columns=income_data_int.drop(columns='income').columns,
    fixed_hue='income',
    plot_type='histplot',
    y_axis_label='Income',
    subplot_title=[f'{feature} vs income' for feature in income_data_int.columns.to_list()],
    main_title="features vs income distribution",
    palette='Set1',
    kde=True
)
'''

# Disabled snippet (inert string literal): groups the features by the encoder that
# should handle them, puts every column not listed in any group into
# 'remander_features', and writes the result to config/transform_features.json.
'''
transform_features = {
    'onehot_features' : ['sex', 'race', 'region', 'employment_type'],
    'ordinal_features' : ['education', 'age', 'education_level_group', 'age_group'],
    'frequency_features' : ['workclass', 'occupation', 'native_country'],
    'target_features' : ['relationship', 'marital_status'],
    'winsorize_featues' : ['hours_per_week', 'capital_gain', 'capital_loss'],
    'label_features' : ['income'],
}
transform_features['remander_features'] = income_data.drop(
    columns=[
        item
        for items in list(
            transform_features.values()
            )
        for item in items]
    ).columns.tolist()
    
transform_features_json = json.dumps(transform_features, indent=4)
with open('config/transform_features.json', 'w') as json_file:
    json_file.write(transform_features_json)
'''

# Disabled snippet (inert string literal): assembles the encoder parameters -- the
# ordered category lists for the ordinal features and the per-feature winsorize
# limits -- and writes them to config/transform_parameters.json.
# 'winsorize_limit' is the last key, which the X-encoding cell relies on when it
# slices off the final entry of transform_parameters.
'''
limits = [(0.001, 0.988), (0.05, 0.993), (0.00, 0.993)]
feature_limits = {feature: limit for feature, limit in zip(transform_features['winsorize_featues'], limits)}

transform_parameters = {
    'education' : [
    'Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Masters',
    'Prof-school', 'Doctorate'
    ],
    
    'education_level' : ['Low', 'Medium', 'High'],
    
    'age_group' : ['Childern', 'Young Adults', 'Middle Aged', 'Seniors'],
    
    'winsorize_limit' : feature_limits
}

with open('config/transform_parameters.json', 'w') as json_file:
    json_file.write(json.dumps(transform_parameters, indent=4))
'''

# Disabled snippet (inert string literal): collects the transformed columns (from the
# third column onward) that still contain values above 1, excludes the ordinal
# features, stores the rest as 'scale_features' and rewrites
# config/transform_features.json.
'''
scale_features = []
for col in X_train_transformed.iloc[:, 2:].columns:
    if (X_train_transformed[col] > 1).any():
        scale_features.append(col)
scale_features
scale_features = [col for col in scale_features if col not in transform_features['ordinal_features']]
transform_features['scale_features'] = scale_features
[col for col in transform_features['remander_features'] if col in transform_features['scale_features']]
with open('config/transform_features.json', 'w') as json_file:
    json_file.write(json.dumps(transform_features, indent=4))  # indent=4 for pretty
'''

# Disabled sketch (inert string literal): an alternative to dropping the duplicated
# columns afterwards -- split the features into "winsorize and scale" and "scale only"
# groups and chain Winsorizer and MinMaxScaler in a single sklearn Pipeline.
'''
winsorize_and_scale_features  = list(
    set(transform_features['winsorize_featues']) & set(transform_features['scale_features'])
    )

only_scale_features  = list(
    set(transform_features['scale_features']) - set(transform_features['winsorize_featues'])
    )

temp = Pipeline(steps=[
    ('winsorizer', Winsorizer(feature_limits=transform_parameters['winsorize_limit'])),
    ('minmax', MinMaxScaler())
])
'''