In [1]:
from incomeprediction.component.data_transformation import datatransformation
from incomeprediction.entity.artifact_entity import *
from incomeprediction.entity.config_entity import *

In [2]:
dt = datatransformation(data_validation_artifact=DataValidationArtifact,data_ingestion_artifact= DataIngestionArtifact,data_transformation_config=DataTransformationConfig)

In [3]:
from incomeprediction.exception import incomepredictionexception
import yaml
import sys
def read_yaml_file(file_path:str)->dict:
    """
    Reads a YAML file and returns the contents as a dictionary.
    file_path: str
    """
    try:
        with open(file_path, 'rb') as yaml_file:
            return yaml.safe_load(yaml_file)
    except Exception as e:
        raise incomepredictionexception(e,sys) from e

In [4]:
schema_file_path = "D:\Ineuron\Project\Adult_Census_Income_Prediction\config\schema.yaml"

In [5]:
dataset_schema = read_yaml_file(file_path=schema_file_path)

In [6]:
from incomeprediction.constant import NUMERICAL_COLUMN_KEY , CATEGORICAL_COLUMN_KEY,TARGET_COLUMN_KEY

In [7]:
numerical_columns = dataset_schema[NUMERICAL_COLUMN_KEY]
categorical_columns = dataset_schema[CATEGORICAL_COLUMN_KEY]
target_column_name = dataset_schema[TARGET_COLUMN_KEY]

In [8]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler 

In [9]:
num_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy="median")),
                ('scaler', StandardScaler())
            ])

In [48]:
cat_pipeline = Pipeline(steps=[
                 ('impute', SimpleImputer(strategy="most_frequent"))
                 #('one_hot_encoder', OneHotEncoder())
            ])

In [49]:
preprocessing = ColumnTransformer([
                ('num_pipeline', num_pipeline, numerical_columns),
                ('cat_pipeline', cat_pipeline, categorical_columns),
            ])

In [79]:
train_file_path = "D:\\Ineuron\Project\\Adult_Census_Income_Prediction\\incomeprediction\\artifact\\data_ingestion\\22-08-10--22-42-53\\ingested_data\\train\\adultincomeprediction.csv"
test_file_path = "D:\\Ineuron\\Project\\Adult_Census_Income_Prediction\\incomeprediction\\artifact\\data_ingestion\\22-08-10--22-42-53\\ingested_data\\test\\adultincomeprediction.csv"
schema_file_path = "D:\Ineuron\Project\Adult_Census_Income_Prediction\config\schema.yaml"

In [80]:
import pandas as pd
import numpy as np
train_df = pd.read_csv(train_file_path)
new_train_df = train_df.replace(' ?', np.nan)

In [81]:
test_df = pd.read_csv(test_file_path)
new_test_df = test_df.replace(' ?', np.nan)

In [82]:
input_feature_train_df = new_train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df = new_train_df['salary']

In [83]:
input_feature_test_df = new_test_df.drop(columns=[target_column_name],axis=1)
target_feature_test_df = new_test_df[target_column_name]

In [91]:
for col in input_feature_train_df[categorical_columns]:
    df_frequency_map = input_feature_train_df[col].value_counts().to_dict()
    input_feature_train_df[col] = input_feature_train_df[col].map(df_frequency_map)


In [98]:
for col in input_feature_test_df[categorical_columns]:
    df_frequency_map = input_feature_test_df[col].value_counts().to_dict()
    input_feature_test_df[col] = input_feature_test_df[col].map(df_frequency_map)

In [95]:
input_feature_train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,34,1796.0,56460,7319,9,10524,700.0,1082,19418,7533,0,2179,12,20363.0
1,48,1796.0,243631,5095,10,10524,2888.0,9286,205,15259,7688,0,40,20363.0
2,23,919.0,56402,5095,10,10524,2894.0,9286,19418,15259,0,0,30,20363.0
3,56,1445.0,255406,7319,9,3090,2857.0,5755,19418,7533,0,0,40,20363.0
4,17,15882.0,297246,831,7,7491,108.0,3549,19418,7533,0,0,9,20363.0


In [99]:
input_feature_test_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,27,6814.0,160178,2196,10,1353,1118.0,2550,8398,3238,0,0,38,8807.0
1,45,379.0,50567,3182,9,4452,1209.0,486,8398,3238,0,0,40,8807.0
2,29,6814.0,185908,1583,13,4452,1209.0,3907,895,6531,0,0,55,8807.0
3,30,6814.0,190040,1583,13,3192,567.0,2550,8398,3238,0,0,40,8807.0
4,29,745.0,189346,2196,10,1353,1211.0,2550,8398,6531,2202,0,50,8807.0


In [96]:
input_feature_train_arr=preprocessing.fit_transform(input_feature_train_df)

In [100]:
input_feature_test_arr = preprocessing.transform(input_feature_test_df)

In [97]:
print(input_feature_train_arr.shape)

(22792, 14)


In [101]:
print(target_feature_train_df.shape)

(22792,)


In [30]:
# target_feature_train_arr = np.array(target_feature_train_df).reshape(22792,1)
# target_feature_train_arr.shape

(22792, 1)

In [102]:
print(input_feature_train_arr)

[[-3.34770883e-01 -1.26129178e+00 -4.23218942e-01 ...  1.94180000e+04
   7.53300000e+03  2.03630000e+04]
 [ 6.91223177e-01  5.10642419e-01 -3.53774348e-02 ...  2.05000000e+02
   1.52590000e+04  2.03630000e+04]
 [-1.14090907e+00 -1.26184086e+00 -3.53774348e-02 ...  1.94180000e+04
   1.52590000e+04  2.03630000e+04]
 ...
 [-1.50733552e+00  2.51750366e-01 -1.19890196e+00 ...  1.94180000e+04
   7.53300000e+03  2.03630000e+04]
 [ 8.37793757e-01 -1.28210955e+00 -4.23218942e-01 ...  1.94180000e+04
   1.52590000e+04  2.03630000e+04]
 [-3.34770883e-01 -5.87331776e-01  1.12814709e+00 ...  1.94180000e+04
   1.52590000e+04  2.03630000e+04]]


In [103]:
pd.DataFrame(input_feature_train_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.334771,-1.261292,-0.423219,-0.146349,5.136118,-2.309851,1796.0,7319.0,10524.0,700.0,1082.0,19418.0,7533.0,20363.0
1,0.691223,0.510642,-0.035377,0.929742,-0.219921,-0.031295,1796.0,5095.0,10524.0,2888.0,9286.0,205.0,15259.0,20363.0
2,-1.140909,-1.261841,-0.035377,-0.146349,-0.219921,-0.845065,919.0,5095.0,10524.0,2894.0,9286.0,19418.0,15259.0,20363.0
3,1.277505,0.622115,-0.423219,-0.146349,-0.219921,-0.031295,1445.0,7319.0,3090.0,2857.0,5755.0,19418.0,7533.0,20363.0
4,-1.580621,1.018212,-1.198902,-0.146349,-0.219921,-2.553982,15882.0,831.0,7491.0,108.0,3549.0,19418.0,7533.0,20363.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22787,0.617938,1.607197,1.128147,-0.146349,-0.219921,-0.031295,15882.0,3772.0,10524.0,2888.0,9286.0,19418.0,15259.0,20363.0
22788,-0.554627,-0.402121,-0.811060,-0.146349,-0.219921,-1.577458,15882.0,293.0,3090.0,2279.0,2411.0,19418.0,7533.0,20363.0
22789,-1.507336,0.251750,-1.198902,-0.146349,-0.219921,-1.658835,15882.0,831.0,7491.0,2652.0,3549.0,19418.0,7533.0,20363.0
22790,0.837794,-1.282110,-0.423219,-0.146349,-0.219921,3.549294,1796.0,7319.0,10524.0,700.0,9286.0,19418.0,15259.0,20363.0


In [106]:
np.array(target_feature_train_df).shape

(22792,)

In [107]:
input_feature_train_arr.shape

(22792, 14)

In [108]:
print(input_feature_train_arr)

[[-3.34770883e-01 -1.26129178e+00 -4.23218942e-01 ...  1.94180000e+04
   7.53300000e+03  2.03630000e+04]
 [ 6.91223177e-01  5.10642419e-01 -3.53774348e-02 ...  2.05000000e+02
   1.52590000e+04  2.03630000e+04]
 [-1.14090907e+00 -1.26184086e+00 -3.53774348e-02 ...  1.94180000e+04
   1.52590000e+04  2.03630000e+04]
 ...
 [-1.50733552e+00  2.51750366e-01 -1.19890196e+00 ...  1.94180000e+04
   7.53300000e+03  2.03630000e+04]
 [ 8.37793757e-01 -1.28210955e+00 -4.23218942e-01 ...  1.94180000e+04
   1.52590000e+04  2.03630000e+04]
 [-3.34770883e-01 -5.87331776e-01  1.12814709e+00 ...  1.94180000e+04
   1.52590000e+04  2.03630000e+04]]


In [121]:
train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]

In [122]:
train_arr.shape

(22792, 15)

In [111]:
target_feature_test_df.shape

(9769,)

In [123]:
print(train_arr)

[[-0.3347708831314897 -1.2612917773869472 -0.4232189422101526 ... 7533.0
  20363.0 ' <=50K']
 [0.6912231768662134 0.5106424192459057 -0.035377434797752544 ... 15259.0
  20363.0 ' >50K']
 [-1.140909073129685 -1.2618408591539128 -0.035377434797752544 ...
  15259.0 20363.0 ' <=50K']
 ...
 [-1.5073355231288645 0.2517503661216577 -1.1989019570349528 ... 7533.0
  20363.0 ' <=50K']
 [0.8377937568658852 -1.2821095498965542 -0.4232189422101526 ... 15259.0
  20363.0 ' <=50K']
 [-0.3347708831314897 -0.587331776144824 1.1281470874394477 ... 15259.0
  20363.0 ' <=50K']]


In [114]:
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

In [116]:
pd.DataFrame(test_arr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-0.847768,-0.279401,-0.035377,-0.146349,-0.219921,-0.194049,6814.0,2196.0,1353.0,1118.0,2550.0,8398.0,3238.0,8807.0,<=50K
1,0.471367,-1.31708,-0.423219,-0.146349,-0.219921,-0.031295,379.0,3182.0,4452.0,1209.0,486.0,8398.0,3238.0,8807.0,<=50K
2,-0.701197,-0.035817,1.128147,-0.146349,-0.219921,1.189361,6814.0,1583.0,4452.0,1209.0,3907.0,895.0,6531.0,8807.0,>50K
3,-0.627912,0.0033,1.128147,-0.146349,-0.219921,-0.031295,6814.0,1583.0,3192.0,567.0,2550.0,8398.0,3238.0,8807.0,<=50K
4,-0.701197,-0.00327,-0.035377,0.161865,-0.219921,0.782475,745.0,2196.0,1353.0,1211.0,2550.0,8398.0,6531.0,8807.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9764,-0.261486,-1.505415,-0.81106,-0.146349,-0.219921,3.549294,6814.0,140.0,4452.0,1211.0,3907.0,8398.0,6531.0,8807.0,<=50K
9765,-1.067624,-1.30377,1.128147,-0.146349,-0.219921,-1.658835,6814.0,1583.0,3192.0,1246.0,1519.0,8398.0,3238.0,8807.0,<=50K
9766,-1.067624,2.391181,-0.423219,-0.146349,-0.219921,-0.031295,6814.0,3182.0,3192.0,1209.0,1519.0,8398.0,6531.0,8807.0,<=50K
9767,1.277505,-0.10734,-0.035377,-0.146349,-0.219921,-0.031295,6814.0,2196.0,4452.0,1246.0,3907.0,8398.0,6531.0,8807.0,>50K


In [127]:
trasnformed_train_df = pd.DataFrame(train_arr, columns = train_df.columns)

In [128]:
trasnformed_train_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,-0.334771,-1.261292,-0.423219,-0.146349,5.136118,-2.309851,1796.0,7319.0,10524.0,700.0,1082.0,19418.0,7533.0,20363.0,<=50K
1,0.691223,0.510642,-0.035377,0.929742,-0.219921,-0.031295,1796.0,5095.0,10524.0,2888.0,9286.0,205.0,15259.0,20363.0,>50K
2,-1.140909,-1.261841,-0.035377,-0.146349,-0.219921,-0.845065,919.0,5095.0,10524.0,2894.0,9286.0,19418.0,15259.0,20363.0,<=50K
3,1.277505,0.622115,-0.423219,-0.146349,-0.219921,-0.031295,1445.0,7319.0,3090.0,2857.0,5755.0,19418.0,7533.0,20363.0,<=50K
4,-1.580621,1.018212,-1.198902,-0.146349,-0.219921,-2.553982,15882.0,831.0,7491.0,108.0,3549.0,19418.0,7533.0,20363.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22787,0.617938,1.607197,1.128147,-0.146349,-0.219921,-0.031295,15882.0,3772.0,10524.0,2888.0,9286.0,19418.0,15259.0,20363.0,<=50K
22788,-0.554627,-0.402121,-0.81106,-0.146349,-0.219921,-1.577458,15882.0,293.0,3090.0,2279.0,2411.0,19418.0,7533.0,20363.0,<=50K
22789,-1.507336,0.25175,-1.198902,-0.146349,-0.219921,-1.658835,15882.0,831.0,7491.0,2652.0,3549.0,19418.0,7533.0,20363.0,<=50K
22790,0.837794,-1.28211,-0.423219,-0.146349,-0.219921,3.549294,1796.0,7319.0,10524.0,700.0,9286.0,19418.0,15259.0,20363.0,<=50K


In [129]:
x = trasnformed_train_df.iloc[:,:-1]
y = trasnformed_train_df.iloc[:,-1]

In [130]:
y.value_counts(normalize=True)

 <=50K    0.757503
 >50K     0.242497
Name: salary, dtype: float64

In [131]:
rs = RandomOverSampler(random_state=30)
rs.fit(x,y)

RandomOverSampler(random_state=30)

In [138]:
X_new,y_new = rs.fit_resample(x,y)

In [140]:
y_new.value_counts(normalize=True)

 <=50K    0.5
 >50K     0.5
Name: salary, dtype: float64

In [151]:
y_new

0         <=50K
1          >50K
2         <=50K
3         <=50K
4         <=50K
          ...  
34525      >50K
34526      >50K
34527      >50K
34528      >50K
34529      >50K
Name: salary, Length: 34530, dtype: object

In [145]:
y_new_df = pd.DataFrame(y_new)
x_frame = [X_new,y_new_df]
fianl_train_dataframe = pd.concat(x_frame,axis= 1)

In [149]:
fianl_train_dataframe.shape

(34530, 15)

In [147]:
np.array(fianl_train_dataframe)

array([[-0.3347708831314897, -1.2612917773869472, -0.4232189422101526,
        ..., 7533.0, 20363.0, ' <=50K'],
       [0.6912231768662134, 0.5106424192459057, -0.035377434797752544,
        ..., 15259.0, 20363.0, ' >50K'],
       [-1.140909073129685, -1.2618408591539128, -0.035377434797752544,
        ..., 15259.0, 20363.0, ' <=50K'],
       ...,
       [0.984364336865557, -0.24194041086947152, -0.4232189422101526,
        ..., 15259.0, 20363.0, ' >50K'],
       [0.10494085686752587, -0.7403930521499742, 1.1281470874394477,
        ..., 15259.0, 20363.0, ' >50K'],
       [-0.3347708831314897, -0.28813901471070935, -0.4232189422101526,
        ..., 15259.0, 20363.0, ' >50K']], dtype=object)

In [148]:
print(train_arr)

[[-0.3347708831314897 -1.2612917773869472 -0.4232189422101526 ... 7533.0
  20363.0 ' <=50K']
 [0.6912231768662134 0.5106424192459057 -0.035377434797752544 ... 15259.0
  20363.0 ' >50K']
 [-1.140909073129685 -1.2618408591539128 -0.035377434797752544 ...
  15259.0 20363.0 ' <=50K']
 ...
 [-1.5073355231288645 0.2517503661216577 -1.1989019570349528 ... 7533.0
  20363.0 ' <=50K']
 [0.8377937568658852 -1.2821095498965542 -0.4232189422101526 ... 15259.0
  20363.0 ' <=50K']
 [-0.3347708831314897 -0.587331776144824 1.1281470874394477 ... 15259.0
  20363.0 ' <=50K']]


In [150]:
train_arr.shape

(22792, 15)